diff --git a/build.xml b/build.xml
index ce9ab0c4d5..2086d0c9ae 100644
--- a/build.xml
+++ b/build.xml
@@ -847,7 +847,6 @@
-
@@ -858,6 +857,7 @@
+
@@ -1118,6 +1118,11 @@
+
+
+
+
+
diff --git a/ivy.xml b/ivy.xml
index 96c1de844d..ee24bc3672 100644
--- a/ivy.xml
+++ b/ivy.xml
@@ -76,7 +76,7 @@
-
+
diff --git a/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R b/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R
index 866766c2c0..d5ee3626f4 100644
--- a/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R
+++ b/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R
@@ -12,7 +12,7 @@ if ( onCMDLine ) {
   inputFileName = args[1]
   outputPDF = args[2]
 } else {
-  inputFileName = "~/Desktop/broadLocal/GATK/unstable/wgs.jobreport.txt"
+  inputFileName = "Q-26618@gsa4.jobreport.txt"
   #inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
   #inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt"
   outputPDF = NA
@@ -129,9 +129,11 @@ plotGroup <- function(groupTable) {
   # as above, but averaging over all iterations
   groupAnnotationsNoIteration = setdiff(groupAnnotations, "iteration")
   if ( dim(sub)[1] > 1 ) {
-    sum = cast(melt(sub, id.vars=groupAnnotationsNoIteration, measure.vars=c("runtime")), ... ~ ., fun.aggregate=c(mean, sd))
-    textplot(as.data.frame(sum), show.rownames=F)
-    title(paste("Job summary for", name, "averaging over all iterations"), cex=3)
+    try({ # need a try here because we will fail to reduce when there's just a single iteration
+      sum = cast(melt(sub, id.vars=groupAnnotationsNoIteration, measure.vars=c("runtime")), ... ~ ., fun.aggregate=c(mean, sd))
+      textplot(as.data.frame(sum), show.rownames=F)
+      title(paste("Job summary for", name, "averaging over all iterations"), cex=3)
+    }, silent=T)
   }
 }
 
@@ -149,6 +151,35 @@ convertUnits <- function(gatkReportData) {
   lapply(gatkReportData, convertGroup)
 }
 
+#
+# Plots runtimes by analysis name and exechosts
+#
+# Useful to understand the performance of analysis jobs by hosts,
+# and to debug problematic nodes
+#
+plotTimeByHost <- function(gatkReportData) {
+  fields = c("analysisName", "exechosts", "runtime")
+
+  runtimes = data.frame()
+  for ( report in gatkReportData ) {
+    runtimes = rbind(runtimes, report[,fields])
+  }
+
+  plotMe <- function(name, vis) {
+    p = ggplot(data=runtimes, aes(x=exechosts, y=runtime, group=exechosts, color=exechosts))
+    p = p + facet_grid(analysisName ~ ., scale="free")
+    p = p + vis()
+    p = p + xlab("Job execution host")
+    p = p + opts(title = paste(name, "of job runtimes by analysis name and execution host"))
+    p = p + ylab(paste("Distribution of runtimes", RUNTIME_UNITS))
+    p = p + opts(axis.text.x=theme_text(angle=45, hjust=1, vjust=1))
+    print(p)
+  }
+
+  plotMe("Boxplot", geom_boxplot)
+  plotMe("Jittered points", geom_jitter)
+}
+
 # read the table
 gatkReportData <- gsa.read.gatkreport(inputFileName)
 
@@ -162,7 +193,9 @@ if ( ! is.na(outputPDF) ) {
 plotJobsGantt(gatkReportData, T, F)
 plotJobsGantt(gatkReportData, F, F)
 plotProgressByTime(gatkReportData)
+plotTimeByHost(gatkReportData)
 for ( group in gatkReportData ) {
+  print(group)
   plotGroup(group)
 }
 
diff --git a/public/java/src/net/sf/picard/sam/MergingSamRecordIterator.java b/public/java/src/net/sf/picard/sam/MergingSamRecordIterator.java
new file mode 100644
index 0000000000..4b1c7a9994
--- /dev/null
+++ b/public/java/src/net/sf/picard/sam/MergingSamRecordIterator.java
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+package net.sf.picard.sam;
+
+import net.sf.picard.PicardException;
+
+import java.util.*;
+import java.lang.reflect.Constructor;
+
+import net.sf.samtools.*;
+import net.sf.samtools.util.CloseableIterator;
+
+/**
+ * Provides an iterator interface for merging multiple underlying iterators into a single
+ * iterable stream. The underlying iterators/files must all have the same sort order unless
+ * the requested output format is unsorted, in which case any combination is valid.
+ */
+public class MergingSamRecordIterator implements CloseableIterator<SAMRecord> {
+    private final PriorityQueue<ComparableSamRecordIterator> pq;
+    private final SamFileHeaderMerger samHeaderMerger;
+    private final Collection<SAMFileReader> readers;
+    private final SAMFileHeader.SortOrder sortOrder;
+    private final SAMRecordComparator comparator;
+
+    private boolean initialized = false;
+    private boolean iterationStarted = false;
+
+    /**
+     * Constructs a new merging iterator with the same set of readers and sort order as
+     * provided by the header merger parameter.
+     * @param headerMerger The merged header and contents of readers.
+     * @param forcePresorted True to ensure that the iterator checks the headers of the readers for appropriate sort order.
+     * @deprecated replaced by (SamFileHeaderMerger, Collection, boolean)
+     */
+    public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, final boolean forcePresorted) {
+        this(headerMerger, headerMerger.getReaders(), forcePresorted);
+    }
+
+    /**
+     * Constructs a new merging iterator with the same set of readers and sort order as
+     * provided by the header merger parameter.
+     * @param headerMerger The merged header and contents of readers.
+     * @param assumeSorted false ensures that the iterator checks the headers of the readers for appropriate sort order.
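+     *
+     * <p>Illustrative use (not from the original source), assuming a merger built over the
+     * same readers:
+     * <pre>
+     *     MergingSamRecordIterator it =
+     *             new MergingSamRecordIterator(merger, readers, false);
+     *     // assumeSorted == false: each reader's header must match the merged sort order,
+     *     // otherwise this constructor throws a PicardException.
+     * </pre>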
+ */ + public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, Collection readers, final boolean assumeSorted) { + this.samHeaderMerger = headerMerger; + this.sortOrder = headerMerger.getMergedHeader().getSortOrder(); + this.comparator = getComparator(); + this.readers = readers; + + this.pq = new PriorityQueue(readers.size()); + + for (final SAMFileReader reader : readers) { + if (!assumeSorted && this.sortOrder != SAMFileHeader.SortOrder.unsorted && + reader.getFileHeader().getSortOrder() != this.sortOrder){ + throw new PicardException("Files are not compatible with sort order"); + } + } + } + + /** + * Add a given SAM file iterator to the merging iterator. Use this to restrict the merged iteration to a given genomic interval, + * rather than iterating over every read in the backing file or stream. + * @param reader Reader to add to the merging iterator. + * @param iterator Iterator traversing over reader contents. + */ + public void addIterator(final SAMFileReader reader, final CloseableIterator iterator) { + if(iterationStarted) + throw new PicardException("Cannot add another iterator; iteration has already begun"); + if(!samHeaderMerger.containsHeader(reader.getFileHeader())) + throw new PicardException("All iterators to be merged must be accounted for in the SAM header merger"); + final ComparableSamRecordIterator comparableIterator = new ComparableSamRecordIterator(reader,iterator,comparator); + addIfNotEmpty(comparableIterator); + initialized = true; + } + + private void startIterationIfRequired() { + if(initialized) + return; + for(SAMFileReader reader: readers) + addIterator(reader,reader.iterator()); + iterationStarted = true; + } + + /** + * Close down all open iterators. + */ + public void close() { + // Iterators not in the priority queue have already been closed; only close down the iterators that are still in the priority queue. + for(CloseableIterator iterator: pq) + iterator.close(); + } + + /** Returns true if any of the underlying iterators has more records, otherwise false. */ + public boolean hasNext() { + startIterationIfRequired(); + return !this.pq.isEmpty(); + } + + /** Returns the next record from the top most iterator during merging. 
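+     * <p>A typical drain loop over the merged stream (illustrative only; not from the
+     * original source):
+     * <pre>
+     *     while (it.hasNext()) {
+     *         SAMRecord rec = it.next(); // header, read group ids and reference indices already remapped
+     *         // ... process rec ...
+     *     }
+     *     it.close();
+     * </pre>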
*/ + public SAMRecord next() { + startIterationIfRequired(); + + final ComparableSamRecordIterator iterator = this.pq.poll(); + final SAMRecord record = iterator.next(); + addIfNotEmpty(iterator); + record.setHeader(this.samHeaderMerger.getMergedHeader()); + + // Fix the read group if needs be + if (this.samHeaderMerger.hasReadGroupCollisions()) { + final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.READ_GROUP_ID); + if (oldGroupId != null ) { + final String newGroupId = this.samHeaderMerger.getReadGroupId(iterator.getReader().getFileHeader(),oldGroupId); + record.setAttribute(ReservedTagConstants.READ_GROUP_ID, newGroupId); + } + } + + // Fix the program group if needs be + if (this.samHeaderMerger.hasProgramGroupCollisions()) { + final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.PROGRAM_GROUP_ID); + if (oldGroupId != null ) { + final String newGroupId = this.samHeaderMerger.getProgramGroupId(iterator.getReader().getFileHeader(),oldGroupId); + record.setAttribute(ReservedTagConstants.PROGRAM_GROUP_ID, newGroupId); + } + } + + // Fix up the sequence indexes if needs be + if (this.samHeaderMerger.hasMergedSequenceDictionary()) { + if (record.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + record.setReferenceIndex(this.samHeaderMerger.getMergedSequenceIndex(iterator.getReader().getFileHeader(),record.getReferenceIndex())); + } + + if (record.getReadPairedFlag() && record.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + record.setMateReferenceIndex(this.samHeaderMerger.getMergedSequenceIndex(iterator.getReader().getFileHeader(),record.getMateReferenceIndex())); + } + } + + return record; + } + + /** + * Adds iterator to priority queue. If the iterator has more records it is added + * otherwise it is closed and not added. + */ + private void addIfNotEmpty(final ComparableSamRecordIterator iterator) { + if (iterator.hasNext()) { + pq.offer(iterator); + } + else { + iterator.close(); + } + } + + /** Unsupported operation. */ + public void remove() { + throw new UnsupportedOperationException("MergingSAMRecorderIterator.remove()"); + } + + /** + * Get the right comparator for a given sort order (coordinate, alphabetic). In the + * case of "unsorted" it will return a comparator that gives an arbitrary but reflexive + * ordering. + */ + private SAMRecordComparator getComparator() { + // For unsorted build a fake comparator that compares based on object ID + if (this.sortOrder == SAMFileHeader.SortOrder.unsorted) { + return new SAMRecordComparator() { + public int fileOrderCompare(final SAMRecord lhs, final SAMRecord rhs) { + return System.identityHashCode(lhs) - System.identityHashCode(rhs); + } + + public int compare(final SAMRecord lhs, final SAMRecord rhs) { + return fileOrderCompare(lhs, rhs); + } + }; + } + if (samHeaderMerger.hasMergedSequenceDictionary() && sortOrder.equals(SAMFileHeader.SortOrder.coordinate)) { + return new MergedSequenceDictionaryCoordinateOrderComparator(); + } + + // Otherwise try and figure out what kind of comparator to return and build it + return this.sortOrder.getComparatorInstance(); + } + + /** Returns the merged header that the merging iterator is working from. */ + public SAMFileHeader getMergedHeader() { + return this.samHeaderMerger.getMergedHeader(); + } + + /** + * Ugh. Basically does a regular coordinate compare, but looks up the sequence indices in the merged + * sequence dictionary. 
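+     * For example (hypothetical dictionaries, not from the original source): if input A is
+     * [chr1, chr2] and input B is [chr2, chr3], the merged dictionary is [chr1, chr2, chr3],
+     * so a record on B's chr2 (index 0 in B) must compare using merged index 1.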
I hate the fact that this extends SAMRecordCoordinateComparator, but it avoids + * more copy & paste. + */ + private class MergedSequenceDictionaryCoordinateOrderComparator extends SAMRecordCoordinateComparator { + + public int fileOrderCompare(final SAMRecord samRecord1, final SAMRecord samRecord2) { + final int referenceIndex1 = getReferenceIndex(samRecord1); + final int referenceIndex2 = getReferenceIndex(samRecord2); + if (referenceIndex1 != referenceIndex2) { + if (referenceIndex1 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + return 1; + } else if (referenceIndex2 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + return -1; + } else { + return referenceIndex1 - referenceIndex2; + } + } + if (referenceIndex1 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + // Both are unmapped. + return 0; + } + return samRecord1.getAlignmentStart() - samRecord2.getAlignmentStart(); + } + + private int getReferenceIndex(final SAMRecord samRecord) { + if (samRecord.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + return samHeaderMerger.getMergedSequenceIndex(samRecord.getHeader(), samRecord.getReferenceIndex()); + } + if (samRecord.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + return samHeaderMerger.getMergedSequenceIndex(samRecord.getHeader(), samRecord.getMateReferenceIndex()); + } + return SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; + } + } +} diff --git a/public/java/src/net/sf/picard/sam/SamFileHeaderMerger.java b/public/java/src/net/sf/picard/sam/SamFileHeaderMerger.java new file mode 100644 index 0000000000..f78cd81dac --- /dev/null +++ b/public/java/src/net/sf/picard/sam/SamFileHeaderMerger.java @@ -0,0 +1,744 @@ +/* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +package net.sf.picard.sam; + +import java.util.*; + +import net.sf.picard.PicardException; +import net.sf.samtools.AbstractSAMHeaderRecord; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMProgramRecord; +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import net.sf.samtools.util.SequenceUtil; + +/** + * Merges SAMFileHeaders that have the same sequences into a single merged header + * object while providing read group translation for cases where read groups + * clash across input headers. 
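+ *
+ * <p>Sketch of the preferred usage (variable names are illustrative, not from the
+ * original source):
+ * <pre>
+ *     List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>();
+ *     for (SAMFileReader reader : readers) headers.add(reader.getFileHeader());
+ *     SamFileHeaderMerger merger =
+ *             new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, true);
+ *     SAMFileHeader merged = merger.getMergedHeader();
+ *     boolean remapped = merger.hasReadGroupCollisions(); // ids were rewritten on collision
+ * </pre>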
+ */
+public class SamFileHeaderMerger {
+    //Super Header to construct
+    private final SAMFileHeader mergedHeader;
+    private Collection<SAMFileReader> readers;
+    private final Collection<SAMFileHeader> headers;
+
+    //Translation of old group ids to new group ids
+    private final Map<SAMFileHeader, Map<String, String>> samReadGroupIdTranslation =
+            new IdentityHashMap<SAMFileHeader, Map<String, String>>();
+
+    //true if the read groups from different files use the same group ids
+    private boolean hasReadGroupCollisions = false;
+
+    //true if the program records from different files use the same program record ids
+    private boolean hasProgramGroupCollisions = false;
+
+    //Translation of old program group ids to new program group ids
+    private Map<SAMFileHeader, Map<String, String>> samProgramGroupIdTranslation =
+            new IdentityHashMap<SAMFileHeader, Map<String, String>>();
+
+    private boolean hasMergedSequenceDictionary = false;
+
+    // Translation of old sequence dictionary ids to new dictionary ids
+    // This is an IdentityHashMap because it can be quite expensive to compute the hashCode for
+    // large SAMFileHeaders. It is possible that two input files will have identical headers so that
+    // the regular HashMap would fold them together, but the value stored in each of the two
+    // Map entries will be the same, so it should not hurt anything.
+    private final Map<SAMFileHeader, Map<Integer, Integer>> samSeqDictionaryIdTranslationViaHeader =
+            new IdentityHashMap<SAMFileHeader, Map<Integer, Integer>>();
+
+    //HeaderRecordFactory that creates SAMReadGroupRecord instances.
+    private static final HeaderRecordFactory<SAMReadGroupRecord> READ_GROUP_RECORD_FACTORY = new HeaderRecordFactory<SAMReadGroupRecord>() {
+        public SAMReadGroupRecord createRecord(String id, SAMReadGroupRecord srcReadGroupRecord) {
+            return new SAMReadGroupRecord(id, srcReadGroupRecord);
+        }
+    };
+
+    //HeaderRecordFactory that creates SAMProgramRecord instances.
+    private static final HeaderRecordFactory<SAMProgramRecord> PROGRAM_RECORD_FACTORY = new HeaderRecordFactory<SAMProgramRecord>() {
+        public SAMProgramRecord createRecord(String id, SAMProgramRecord srcProgramRecord) {
+            return new SAMProgramRecord(id, srcProgramRecord);
+        }
+    };
+
+    //comparator used to sort lists of program group and read group records
+    private static final Comparator<AbstractSAMHeaderRecord> RECORD_ID_COMPARATOR = new Comparator<AbstractSAMHeaderRecord>() {
+        public int compare(AbstractSAMHeaderRecord o1, AbstractSAMHeaderRecord o2) {
+            return o1.getId().compareTo(o2.getId());
+        }
+    };
+
+    /**
+     * Create SAMFileHeader with additional information. Required that sequence dictionaries agree.
+     *
+     * @param readers sam file readers to combine
+     * @param sortOrder sort order new header should have
+     * @deprecated replaced by SamFileHeaderMerger(Collection, SAMFileHeader.SortOrder, boolean)
+     */
+    public SamFileHeaderMerger(final Collection<SAMFileReader> readers, final SAMFileHeader.SortOrder sortOrder) {
+        this(readers, sortOrder, false);
+    }
+
+    /**
+     * Create SAMFileHeader with additional information.
+     *
+     * @param readers sam file readers to combine
+     * @param sortOrder sort order new header should have
+     * @param mergeDictionaries If true, merge sequence dictionaries in new header. If false, require that
+     *                          all input sequence dictionaries be identical.
+     * @deprecated replaced by SamFileHeaderMerger(Collection, SAMFileHeader.SortOrder, boolean)
+     */
+    public SamFileHeaderMerger(final Collection<SAMFileReader> readers, final SAMFileHeader.SortOrder sortOrder, final boolean mergeDictionaries) {
+        this(sortOrder, getHeadersFromReaders(readers), mergeDictionaries);
+        this.readers = readers;
+    }
+
+    /**
+     * Create SAMFileHeader with additional information. This is the preferred constructor.
+ * + * @param sortOrder sort order new header should have + * @param headers sam file headers to combine + * @param mergeDictionaries If true, merge sequence dictionaries in new header. If false, require that + * all input sequence dictionaries be identical. + */ + public SamFileHeaderMerger(final SAMFileHeader.SortOrder sortOrder, final Collection headers, final boolean mergeDictionaries) { + this.headers = headers; + this.mergedHeader = new SAMFileHeader(); + + SAMSequenceDictionary sequenceDictionary; + try { + sequenceDictionary = getSequenceDictionary(headers); + this.hasMergedSequenceDictionary = false; + } + catch (SequenceUtil.SequenceListsDifferException pe) { + if (mergeDictionaries) { + sequenceDictionary = mergeSequenceDictionaries(headers); + this.hasMergedSequenceDictionary = true; + } + else { + throw pe; + } + } + + this.mergedHeader.setSequenceDictionary(sequenceDictionary); + + // Set program that creates input alignments + for (final SAMProgramRecord program : mergeProgramGroups(headers)) { + this.mergedHeader.addProgramRecord(program); + } + + // Set read groups for merged header + final List readGroups = mergeReadGroups(headers); + this.mergedHeader.setReadGroups(readGroups); + this.mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.none); + + this.mergedHeader.setSortOrder(sortOrder); + + for (final SAMFileHeader header : headers) { + for (final String comment : header.getComments()) { + this.mergedHeader.addComment(comment); + } + } + } + + // Utilility method to make use with old constructor + private static List getHeadersFromReaders(Collection readers) { + List headers = new ArrayList(readers.size()); + for (SAMFileReader reader : readers) { + headers.add(reader.getFileHeader()); + } + return headers; + } + + + /** + * Checks to see if there are clashes where different readers are using the same read + * group IDs. If yes, then those IDs that collided are remapped. + * + * @param headers headers to combine + * @return new list of read groups constructed from all the readers + */ + private List mergeReadGroups(final Collection headers) { + //prepare args for mergeHeaderRecords(..) call + final HashSet idsThatAreAlreadyTaken = new HashSet(); + + final List> readGroupsToProcess = new LinkedList>(); + for (final SAMFileHeader header : headers) { + for (final SAMReadGroupRecord readGroup : header.getReadGroups()) { + //verify that there are no existing id collisions in this input file + if(!idsThatAreAlreadyTaken.add(readGroup.getId())) + throw new PicardException("Input file: " + header + " contains more than one RG with the same id (" + readGroup.getId() + ")"); + + readGroupsToProcess.add(new HeaderRecordAndFileHeader(readGroup, header)); + } + idsThatAreAlreadyTaken.clear(); + } + + final List result = new LinkedList(); + + hasReadGroupCollisions = mergeHeaderRecords(readGroupsToProcess, READ_GROUP_RECORD_FACTORY, idsThatAreAlreadyTaken, samReadGroupIdTranslation, result); + + //sort the result list by record id + Collections.sort(result, RECORD_ID_COMPARATOR); + + return result; + } + + + /** + * Checks to see if there are clashes where different readers are using the same program + * group IDs. If yes, then those IDs that collided are remapped. + * + * @param headers headers to combine + * @return new list of program groups constructed from all the readers + */ + private List mergeProgramGroups(final Collection headers) { + + final List overallResult = new LinkedList(); + + //this Set will accumulate all SAMProgramRecord ids that have been encountered so far. 
+ final HashSet idsThatAreAlreadyTaken = new HashSet(); + + //need to process all program groups + List> programGroupsLeftToProcess = new LinkedList>(); + for (final SAMFileHeader header : headers) { + for (final SAMProgramRecord programGroup : header.getProgramRecords()) { + //verify that there are no existing id collisions in this input file + if(!idsThatAreAlreadyTaken.add(programGroup.getId())) + throw new PicardException("Input file: " + header + " contains more than one PG with the same id (" + programGroup.getId() + ")"); + + programGroupsLeftToProcess.add(new HeaderRecordAndFileHeader(programGroup, header)); + } + idsThatAreAlreadyTaken.clear(); + } + + //A program group header (lets say ID=2 PN=B PP=1) may have a PP (previous program) attribute which chains it to + //another program group header (lets say ID=1 PN=A) to indicate that the given file was + //processed by program A followed by program B. These PP attributes potentially + //connect headers into one or more tree structures. Merging is done by + //first merging all headers that don't have PP attributes (eg. tree roots), + //then updating and merging all headers whose PPs point to the tree-root headers, + //and so on until all program group headers are processed. + + //currentProgramGroups is the list of records to merge next. Start by merging the programGroups that don't have a PP attribute (eg. the tree roots). + List< HeaderRecordAndFileHeader > currentProgramGroups = new LinkedList>(); + for(final Iterator> programGroupsLeftToProcessIterator = programGroupsLeftToProcess.iterator(); programGroupsLeftToProcessIterator.hasNext(); ) { + final HeaderRecordAndFileHeader pair = programGroupsLeftToProcessIterator.next(); + if(pair.getHeaderRecord().getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG) == null) { + programGroupsLeftToProcessIterator.remove(); + currentProgramGroups.add(pair); + } + } + + //merge currentProgramGroups + while(!currentProgramGroups.isEmpty()) + { + final List currentResult = new LinkedList(); + + hasProgramGroupCollisions |= mergeHeaderRecords(currentProgramGroups, PROGRAM_RECORD_FACTORY, idsThatAreAlreadyTaken, samProgramGroupIdTranslation, currentResult); + + //add currentResults to overallResults + overallResult.addAll(currentResult); + + //apply the newly-computed id translations to currentProgramGroups and programGroupsLeftToProcess + currentProgramGroups = translateIds(currentProgramGroups, samProgramGroupIdTranslation, false); + programGroupsLeftToProcess = translateIds(programGroupsLeftToProcess, samProgramGroupIdTranslation, true); + + //find all records in programGroupsLeftToProcess whose ppId points to a record that was just processed (eg. a record that's in currentProgramGroups), + //and move them to the list of programGroupsToProcessNext. 
+ LinkedList> programGroupsToProcessNext = new LinkedList>(); + for(final Iterator> programGroupsLeftToProcessIterator = programGroupsLeftToProcess.iterator(); programGroupsLeftToProcessIterator.hasNext(); ) { + final HeaderRecordAndFileHeader pairLeftToProcess = programGroupsLeftToProcessIterator.next(); + final Object ppIdOfRecordLeftToProcess = pairLeftToProcess.getHeaderRecord().getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG); + //find what currentProgramGroups this ppId points to (NOTE: they have to come from the same file) + for(final HeaderRecordAndFileHeader justProcessedPair : currentProgramGroups) { + String idJustProcessed = justProcessedPair.getHeaderRecord().getId(); + if(pairLeftToProcess.getFileHeader() == justProcessedPair.getFileHeader() && ppIdOfRecordLeftToProcess.equals(idJustProcessed)) { + programGroupsLeftToProcessIterator.remove(); + programGroupsToProcessNext.add(pairLeftToProcess); + break; + } + } + } + + currentProgramGroups = programGroupsToProcessNext; + } + + //verify that all records were processed + if(!programGroupsLeftToProcess.isEmpty()) { + StringBuffer errorMsg = new StringBuffer(programGroupsLeftToProcess.size() + " program groups weren't processed. Do their PP ids point to existing PGs? \n"); + for( final HeaderRecordAndFileHeader pair : programGroupsLeftToProcess ) { + SAMProgramRecord record = pair.getHeaderRecord(); + errorMsg.append("@PG ID:"+record.getProgramGroupId()+" PN:"+record.getProgramName()+" PP:"+record.getPreviousProgramGroupId() +"\n"); + } + throw new PicardException(errorMsg.toString()); + } + + //sort the result list by record id + Collections.sort(overallResult, RECORD_ID_COMPARATOR); + + return overallResult; + } + + + /** + * Utility method that takes a list of program groups and remaps all their + * ids (including ppIds if requested) using the given idTranslationTable. + * + * NOTE: when remapping, this method creates new SAMProgramRecords and + * doesn't mutate any records in the programGroups list. + * + * @param programGroups The program groups to translate. + * @param idTranslationTable The translation table. + * @param translatePpIds Whether ppIds should be translated as well. + * + * @return The list of translated records. + */ + private List> translateIds( + List> programGroups, + Map> idTranslationTable, + boolean translatePpIds) { + + //go through programGroups and translate any IDs and PPs based on the idTranslationTable. + List> result = new LinkedList>(); + for(final HeaderRecordAndFileHeader pair : programGroups ) { + final SAMProgramRecord record = pair.getHeaderRecord(); + final String id = record.getProgramGroupId(); + final String ppId = (String) record.getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG); + + final SAMFileHeader header = pair.getFileHeader(); + final Map translations = idTranslationTable.get(header); + + //see if one or both ids need to be translated + SAMProgramRecord translatedRecord = null; + if(translations != null) + { + String translatedId = translations.get( id ); + String translatedPpId = translatePpIds ? 
translations.get( ppId ) : null; + + boolean needToTranslateId = translatedId != null && !translatedId.equals(id); + boolean needToTranslatePpId = translatedPpId != null && !translatedPpId.equals(ppId); + + if(needToTranslateId && needToTranslatePpId) { + translatedRecord = new SAMProgramRecord(translatedId, record); + translatedRecord.setAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG, translatedPpId); + } else if(needToTranslateId) { + translatedRecord = new SAMProgramRecord(translatedId, record); + } else if(needToTranslatePpId) { + translatedRecord = new SAMProgramRecord(id, record); + translatedRecord.setAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG, translatedPpId); + } + } + + if(translatedRecord != null) { + result.add(new HeaderRecordAndFileHeader(translatedRecord, header)); + } else { + result.add(pair); //keep the original record + } + } + + return result; + } + + + /** + * Utility method for merging a List of AbstractSAMHeaderRecords. If it finds + * records that have identical ids and attributes, it will collapse them + * into one record. If it finds records that have identical ids but + * non-identical attributes, this is treated as a collision. When collision happens, + * the records' ids are remapped, and an old-id to new-id mapping is added to the idTranslationTable. + * + * NOTE: Non-collided records also get recorded in the idTranslationTable as + * old-id to old-id. This way, an idTranslationTable lookup should never return null. + * + * @param headerRecords The header records to merge. + * @param headerRecordFactory Constructs a specific subclass of AbstractSAMHeaderRecord. + * @param idsThatAreAlreadyTaken If the id of a headerRecord matches an id in this set, it will be treated as a collision, and the headRecord's id will be remapped. + * @param idTranslationTable When records collide, their ids are remapped, and an old-id to new-id + * mapping is added to the idTranslationTable. Non-collided records also get recorded in the idTranslationTable as + * old-id to old-id. This way, an idTranslationTable lookup should never return null. + * + * @param result The list of merged header records. + * + * @return True if there were collisions. + */ + private boolean mergeHeaderRecords(final List> headerRecords, HeaderRecordFactory headerRecordFactory, + final HashSet idsThatAreAlreadyTaken, Map> idTranslationTable, List result) { + + //The outer Map bins the header records by their ids. The nested Map further collapses + //header records which, in addition to having the same id, also have identical attributes. + //In other words, each key in the nested map represents one or more + //header records which have both identical ids and identical attributes. The List of + //SAMFileHeaders keeps track of which readers these header record(s) came from. 
+ final Map>> idToRecord = + new HashMap>>(); + + //Populate the idToRecord and seenIds data structures + for (final HeaderRecordAndFileHeader pair : headerRecords) { + final RecordType record = pair.getHeaderRecord(); + final SAMFileHeader header = pair.getFileHeader(); + final String recordId = record.getId(); + Map> recordsWithSameId = idToRecord.get(recordId); + if(recordsWithSameId == null) { + recordsWithSameId = new LinkedHashMap>(); + idToRecord.put(recordId, recordsWithSameId); + } + + List fileHeaders = recordsWithSameId.get(record); + if(fileHeaders == null) { + fileHeaders = new LinkedList(); + recordsWithSameId.put(record, fileHeaders); + } + + fileHeaders.add(header); + } + + //Resolve any collisions between header records by remapping their ids. + boolean hasCollisions = false; + for (final Map.Entry>> entry : idToRecord.entrySet() ) + { + final String recordId = entry.getKey(); + final Map> recordsWithSameId = entry.getValue(); + + + for( Map.Entry> recordWithUniqueAttr : recordsWithSameId.entrySet()) { + final RecordType record = recordWithUniqueAttr.getKey(); + final List fileHeaders = recordWithUniqueAttr.getValue(); + + String newId; + if(!idsThatAreAlreadyTaken.contains(recordId)) { + //don't remap 1st record. If there are more records + //with this id, they will be remapped in the 'else'. + newId = recordId; + idsThatAreAlreadyTaken.add(recordId); + } else { + //there is more than one record with this id. + hasCollisions = true; + + //find a unique newId for this record + int idx=1; + while(idsThatAreAlreadyTaken.contains(newId = recordId + "." + Integer.toString(idx++))) + ; + + idsThatAreAlreadyTaken.add( newId ); + } + + for(SAMFileHeader fileHeader : fileHeaders) { + Map readerTranslationTable = idTranslationTable.get(fileHeader); + if(readerTranslationTable == null) { + readerTranslationTable = new HashMap(); + idTranslationTable.put(fileHeader, readerTranslationTable); + } + readerTranslationTable.put(recordId, newId); + } + + result.add( headerRecordFactory.createRecord(newId, record) ); + } + } + + return hasCollisions; + } + + + /** + * Get the sequences off the SAMFileHeader. Throws runtime exception if the sequence + * are different from one another. + * + * @param headers headers to pull sequences from + * @return sequences from files. Each file should have the same sequence + */ + private SAMSequenceDictionary getSequenceDictionary(final Collection headers) { + SAMSequenceDictionary sequences = null; + for (final SAMFileHeader header : headers) { + + if (sequences == null) { + sequences = header.getSequenceDictionary(); + } + else { + final SAMSequenceDictionary currentSequences = header.getSequenceDictionary(); + SequenceUtil.assertSequenceDictionariesEqual(sequences, currentSequences); + } + } + + return sequences; + } + + /** + * Get the sequences from the SAMFileHeader, and merge the resulting sequence dictionaries. + * + * @param headers headers to pull sequences from + * @return sequences from files. 
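+     *         The per-file dictionaries are merged into one superset dictionary.
+     *         For example (hypothetical inputs, not from the original source): merging
+     *         [chr1, chr2] with [chr2, chr3] yields [chr1, chr2, chr3], whereas merging
+     *         [chr1, chr2] with [chr2, chr1] fails because the shared names appear in
+     *         incompatible orders.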
+     */
+    private SAMSequenceDictionary mergeSequenceDictionaries(final Collection<SAMFileHeader> headers) {
+        SAMSequenceDictionary sequences = new SAMSequenceDictionary();
+        for (final SAMFileHeader header : headers) {
+            final SAMSequenceDictionary currentSequences = header.getSequenceDictionary();
+            sequences = mergeSequences(sequences, currentSequences);
+        }
+        // second pass, make a map of the original sequence id -> new sequence id
+        createSequenceMapping(headers, sequences);
+        return sequences;
+    }
+
+    /**
+     * They've asked to merge the sequence headers. What we support right now is finding the sequence name superset.
+     *
+     * @param mergeIntoDict the result of merging so far. All SAMSequenceRecords in here have been cloned from the originals.
+     * @param mergeFromDict A new sequence dictionary to merge into mergeIntoDict.
+     * @return A new sequence dictionary resulting from merging the two inputs.
+     */
+    private SAMSequenceDictionary mergeSequences(SAMSequenceDictionary mergeIntoDict, SAMSequenceDictionary mergeFromDict) {
+
+        // a place to hold the sequences that we haven't found a home for, in the order they appear in mergeFromDict.
+        LinkedList<SAMSequenceRecord> holder = new LinkedList<SAMSequenceRecord>();
+
+        // Return value will be created from this.
+        LinkedList<SAMSequenceRecord> resultingDict = new LinkedList<SAMSequenceRecord>();
+        for (final SAMSequenceRecord sequenceRecord : mergeIntoDict.getSequences()) {
+            resultingDict.add(sequenceRecord);
+        }
+
+        // Index into resultingDict of previous SAMSequenceRecord from mergeFromDict that already existed in mergeIntoDict.
+        int prevloc = -1;
+        // Previous SAMSequenceRecord from mergeFromDict that already existed in mergeIntoDict.
+        SAMSequenceRecord previouslyMerged = null;
+
+        for (SAMSequenceRecord sequenceRecord : mergeFromDict.getSequences()) {
+            // Does it already exist in resultingDict?
+            int loc = getIndexOfSequenceName(resultingDict, sequenceRecord.getSequenceName());
+            if (loc == -1) {
+                // If it doesn't already exist in resultingDict, save it and decide where to insert it later.
+                holder.add(sequenceRecord.clone());
+            } else if (prevloc > loc) {
+                // If sequenceRecord already exists in resultingDict, but prior to the previous one
+                // from mergeIntoDict that already existed, cannot merge.
+                throw new PicardException("Cannot merge sequence dictionaries because sequence " +
+                        sequenceRecord.getSequenceName() + " and " + previouslyMerged.getSequenceName() +
+                        " are in different orders in two input sequence dictionaries.");
+            } else {
+                // Since sequenceRecord already exists in resultingDict, don't need to add it.
+                // Add in all the sequences prior to it that have been held in holder.
+                resultingDict.addAll(loc, holder);
+                // Remember the index of sequenceRecord so we can check for merge incompatibility.
+                prevloc = loc + holder.size();
+                previouslyMerged = sequenceRecord;
+                holder.clear();
+            }
+        }
+        // Append anything left in holder.
+        if (holder.size() != 0) {
+            resultingDict.addAll(holder);
+        }
+        return new SAMSequenceDictionary(resultingDict);
+    }
+
+    /**
+     * Find sequence in list.
+     * @param list List to search for the sequence name.
+     * @param sequenceName Name to search for.
+     * @return Index of SAMSequenceRecord with the given name in list, or -1 if not found.
+     */
+    private static int getIndexOfSequenceName(final List<SAMSequenceRecord> list, final String sequenceName) {
+        for (int i = 0; i < list.size(); ++i) {
+            if (list.get(i).getSequenceName().equals(sequenceName)) {
+                return i;
+            }
+        }
+        return -1;
+    }
+
+    /**
+     * Create the sequence mapping.
This map is used to convert the unmerged header sequence ID's to the merged + * list of sequence id's. + * @param headers the collections of headers. + * @param masterDictionary the superset dictionary we've created. + */ + private void createSequenceMapping(final Collection headers, SAMSequenceDictionary masterDictionary) { + LinkedList resultingDictStr = new LinkedList(); + for (SAMSequenceRecord r : masterDictionary.getSequences()) { + resultingDictStr.add(r.getSequenceName()); + } + for (final SAMFileHeader header : headers) { + Map seqMap = new HashMap(); + SAMSequenceDictionary dict = header.getSequenceDictionary(); + for (SAMSequenceRecord rec : dict.getSequences()) { + seqMap.put(rec.getSequenceIndex(), resultingDictStr.indexOf(rec.getSequenceName())); + } + this.samSeqDictionaryIdTranslationViaHeader.put(header, seqMap); + } + } + + + + /** + * Returns the read group id that should be used for the input read and RG id. + * + * @deprecated replaced by getReadGroupId(SAMFileHeader, String) + * */ + public String getReadGroupId(final SAMFileReader reader, final String originalReadGroupId) { + return getReadGroupId(reader.getFileHeader(), originalReadGroupId); + } + + /** Returns the read group id that should be used for the input read and RG id. */ + public String getReadGroupId(final SAMFileHeader header, final String originalReadGroupId) { + return this.samReadGroupIdTranslation.get(header).get(originalReadGroupId); + } + + /** + * @param reader one of the input files + * @param originalProgramGroupId a program group ID from the above input file + * @return new ID from the merged list of program groups in the output file + * @deprecated replaced by getProgramGroupId(SAMFileHeader, String) + */ + public String getProgramGroupId(final SAMFileReader reader, final String originalProgramGroupId) { + return getProgramGroupId(reader.getFileHeader(), originalProgramGroupId); + } + + /** + * @param header one of the input headers + * @param originalProgramGroupId a program group ID from the above input file + * @return new ID from the merged list of program groups in the output file + */ + public String getProgramGroupId(final SAMFileHeader header, final String originalProgramGroupId) { + return this.samProgramGroupIdTranslation.get(header).get(originalProgramGroupId); + } + + /** Returns true if there are read group duplicates within the merged headers. */ + public boolean hasReadGroupCollisions() { + return this.hasReadGroupCollisions; + } + + /** Returns true if there are program group duplicates within the merged headers. */ + public boolean hasProgramGroupCollisions() { + return hasProgramGroupCollisions; + } + + /** @return if we've merged the sequence dictionaries, return true */ + public boolean hasMergedSequenceDictionary() { + return hasMergedSequenceDictionary; + } + + /** Returns the merged header that should be written to any output merged file. */ + public SAMFileHeader getMergedHeader() { + return this.mergedHeader; + } + + /** Returns the collection of readers that this header merger is working with. May return null. + * @deprecated replaced by getHeaders() + */ + public Collection getReaders() { + return this.readers; + } + + /** Returns the collection of readers that this header merger is working with. + */ + public Collection getHeaders() { + return this.headers; + } + + /** + * Tells whether this header merger contains a given SAM file header. 
Note that header presence + * is confirmed / blocked by == equality, rather than actually testing SAMFileHeader.equals(), for + * reasons of performance. + * @param header header to check for. + * @return True if the header exists in this HeaderMerger. False otherwise. + */ + boolean containsHeader(SAMFileHeader header) { + for(SAMFileHeader headerMergerHeader: headers) { + if(headerMergerHeader == header) + return true; + } + return false; + } + + /** + * returns the new mapping for a specified reader, given it's old sequence index + * @param reader the reader + * @param oldReferenceSequenceIndex the old sequence (also called reference) index + * @return the new index value + * @deprecated replaced by getMergedSequenceIndex(SAMFileHeader, Integer) + */ + public Integer getMergedSequenceIndex(SAMFileReader reader, Integer oldReferenceSequenceIndex) { + return this.getMergedSequenceIndex(reader.getFileHeader(), oldReferenceSequenceIndex); + } + + /** + * Another mechanism for getting the new sequence index, for situations in which the reader is not available. + * Note that if the SAMRecord has already had its header replaced with the merged header, this won't work. + * @param header The original header for the input record in question. + * @param oldReferenceSequenceIndex The original sequence index. + * @return the new index value that is compatible with the merged sequence index. + */ + public Integer getMergedSequenceIndex(final SAMFileHeader header, Integer oldReferenceSequenceIndex) { + final Map mapping = this.samSeqDictionaryIdTranslationViaHeader.get(header); + if (mapping == null) { + throw new PicardException("No sequence dictionary mapping available for header: " + header); + } + + final Integer newIndex = mapping.get(oldReferenceSequenceIndex); + if (newIndex == null) { + throw new PicardException("No mapping for reference index " + oldReferenceSequenceIndex + " from header: " + header); + } + + return newIndex; + } + + + /** + * Implementations of this interface are used by mergeHeaderRecords(..) to instantiate + * specific subclasses of AbstractSAMHeaderRecord. + */ + private static interface HeaderRecordFactory { + + /** + * Constructs a new instance of RecordType. + * @param id The id of the new record. + * @param srcRecord Except for the id, the new record will be a copy of this source record. + */ + public RecordType createRecord(final String id, RecordType srcRecord); + } + + /** + * Struct that groups together a subclass of AbstractSAMHeaderRecord with the + * SAMFileHeader that it came from. 
+ */ + private static class HeaderRecordAndFileHeader { + private RecordType headerRecord; + private SAMFileHeader samFileHeader; + + public HeaderRecordAndFileHeader(RecordType headerRecord, SAMFileHeader samFileHeader) { + this.headerRecord = headerRecord; + this.samFileHeader = samFileHeader; + } + + public RecordType getHeaderRecord() { + return headerRecord; + } + public SAMFileHeader getFileHeader() { + return samFileHeader; + } + } +} diff --git a/public/java/src/net/sf/samtools/BAMFileReader.java b/public/java/src/net/sf/samtools/BAMFileReader.java new file mode 100644 index 0000000000..5005b6265f --- /dev/null +++ b/public/java/src/net/sf/samtools/BAMFileReader.java @@ -0,0 +1,762 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package net.sf.samtools; + + +import net.sf.samtools.util.*; +import net.sf.samtools.SAMFileReader.ValidationStringency; + +import java.io.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.NoSuchElementException; + +/** + * Internal class for reading and querying BAM files. + */ +class BAMFileReader extends SAMFileReader.ReaderImplementation { + // True if reading from a File rather than an InputStream + private boolean mIsSeekable = false; + + // For converting bytes into other primitive types + private BinaryCodec mStream = null; + + // Underlying compressed data stream. + private final BAMInputStream mInputStream; + private SAMFileHeader mFileHeader = null; + + // Populated if the file is seekable and an index exists + private File mIndexFile; + private BAMIndex mIndex = null; + private long mFirstRecordPointer = 0; + private CloseableIterator mCurrentIterator = null; + + // If true, all SAMRecords are fully decoded as they are read. + private final boolean eagerDecode; + + // For error-checking. + private ValidationStringency mValidationStringency; + + // For creating BAMRecords + private SAMRecordFactory samRecordFactory; + + /** + * Use the caching index reader implementation rather than the disk-hit-per-file model. + */ + private boolean mEnableIndexCaching = false; + + /** + * Use the traditional memory-mapped implementation for BAM file indexes rather than regular I/O. + */ + private boolean mEnableIndexMemoryMapping = true; + + /** + * Add information about the origin (reader and position) to SAM records. 
+ */ + private SAMFileReader mFileReader = null; + + /** + * Prepare to read BAM from a stream (not seekable) + * @param stream source of bytes. + * @param eagerDecode if true, decode all BAM fields as reading rather than lazily. + * @param validationStringency Controls how to handle invalidate reads or header lines. + */ + BAMFileReader(final InputStream stream, + final File indexFile, + final boolean eagerDecode, + final ValidationStringency validationStringency, + final SAMRecordFactory factory) + throws IOException { + mIndexFile = indexFile; + mIsSeekable = false; + mInputStream = stream instanceof BAMInputStream ? (BAMInputStream)stream : new BlockCompressedInputStream(stream); + mStream = new BinaryCodec(new DataInputStream((InputStream)mInputStream)); + this.eagerDecode = eagerDecode; + this.mValidationStringency = validationStringency; + this.samRecordFactory = factory; + readHeader(null); + } + + /** + * Prepare to read BAM from a file (seekable) + * @param file source of bytes. + * @param eagerDecode if true, decode all BAM fields as reading rather than lazily. + * @param validationStringency Controls how to handle invalidate reads or header lines. + */ + BAMFileReader(final File file, + final File indexFile, + final boolean eagerDecode, + final ValidationStringency validationStringency, + final SAMRecordFactory factory) + throws IOException { + this(new BlockCompressedInputStream(file), indexFile!=null ? indexFile : findIndexFile(file), eagerDecode, file.getAbsolutePath(), validationStringency, factory); + if (mIndexFile != null && mIndexFile.lastModified() < file.lastModified()) { + System.err.println("WARNING: BAM index file " + mIndexFile.getAbsolutePath() + + " is older than BAM " + file.getAbsolutePath()); + } + } + + BAMFileReader(final SeekableStream strm, + final File indexFile, + final boolean eagerDecode, + final ValidationStringency validationStringency, + final SAMRecordFactory factory) + throws IOException { + this(strm instanceof BAMInputStream ? (BAMInputStream)strm : new BlockCompressedInputStream(strm), + indexFile, + eagerDecode, + strm.getSource(), + validationStringency, + factory); + } + + private BAMFileReader(final BAMInputStream inputStream, + final File indexFile, + final boolean eagerDecode, + final String source, + final ValidationStringency validationStringency, + final SAMRecordFactory factory) + throws IOException { + mIndexFile = indexFile; + mIsSeekable = true; + mInputStream = inputStream; + mStream = new BinaryCodec(new DataInputStream((InputStream)inputStream)); + this.eagerDecode = eagerDecode; + this.mValidationStringency = validationStringency; + this.samRecordFactory = factory; + readHeader(source); + mFirstRecordPointer = inputStream.getFilePointer(); + } + + /** + * If true, writes the source of every read into the source SAMRecords. + * @param enabled true to write source information into each SAMRecord. + */ + void enableFileSource(final SAMFileReader reader, final boolean enabled) { + this.mFileReader = enabled ? reader : null; + } + + /** + * If true, uses the caching version of the index reader. + * @param enabled true to write source information into each SAMRecord. + */ + public void enableIndexCaching(final boolean enabled) { + if(mIndex != null) + throw new SAMException("Unable to turn on index caching; index file has already been loaded."); + this.mEnableIndexCaching = enabled; + } + + /** + * If false, disable the use of memory mapping for accessing index files (default behavior is to use memory mapping). 
+ * This is slower but more scalable when accessing large numbers of BAM files sequentially. + * @param enabled True to use memory mapping, false to use regular I/O. + */ + public void enableIndexMemoryMapping(final boolean enabled) { + if (mIndex != null) { + throw new SAMException("Unable to change index memory mapping; index file has already been loaded."); + } + this.mEnableIndexMemoryMapping = enabled; + } + + @Override void enableCrcChecking(final boolean enabled) { + this.mInputStream.setCheckCrcs(enabled); + } + + @Override void setSAMRecordFactory(final SAMRecordFactory factory) { this.samRecordFactory = factory; } + + /** + * @return true if ths is a BAM file, and has an index + */ + public boolean hasIndex() { + return (mIndexFile != null); + } + + /** + * Retrieves the index for the given file type. Ensure that the index is of the specified type. + * @return An index of the given type. + */ + public BAMIndex getIndex() { + if(mIndexFile == null) + throw new SAMException("No index is available for this BAM file."); + if(mIndex == null) + mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping) + : new DiskBasedBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping); + return mIndex; + } + + void close() { + if (mStream != null) { + mStream.close(); + } + if (mIndex != null) { + mIndex.close(); + } + mStream = null; + mFileHeader = null; + mIndex = null; + } + + SAMFileHeader getFileHeader() { + return mFileHeader; + } + + /** + * Set error-checking level for subsequent SAMRecord reads. + */ + void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) { + this.mValidationStringency = validationStringency; + } + + SAMFileReader.ValidationStringency getValidationStringency() { + return this.mValidationStringency; + } + + /** + * Prepare to iterate through the SAMRecords in file order. + * Only a single iterator on a BAM file can be extant at a time. If getIterator() or a query method has been called once, + * that iterator must be closed before getIterator() can be called again. + * A somewhat peculiar aspect of this method is that if the file is not seekable, a second call to + * getIterator() begins its iteration where the last one left off. That is the best that can be + * done in that situation. + */ + CloseableIterator getIterator() { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (mIsSeekable) { + try { + mInputStream.seek(mFirstRecordPointer); + } catch (IOException exc) { + throw new RuntimeException(exc.getMessage(), exc); + } + } + mCurrentIterator = new BAMFileIterator(); + return mCurrentIterator; + } + + @Override + CloseableIterator getIterator(final SAMFileSpan chunks) { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (!(chunks instanceof BAMFileSpan)) { + throw new IllegalStateException("BAMFileReader cannot handle this type of file span."); + } + + // Create an iterator over the given chunk boundaries. + mCurrentIterator = new BAMFileIndexIterator(((BAMFileSpan)chunks).toCoordinateArray()); + return mCurrentIterator; + } + + /** + * Gets an unbounded pointer to the first record in the BAM file. 
Because the reader doesn't necessarily know + * when the file ends, the rightmost bound of the file pointer will not end exactly where the file ends. However, + * the rightmost bound is guaranteed to be after the last read in the file. + * @return An unbounded pointer to the first record in the BAM file. + */ + @Override + SAMFileSpan getFilePointerSpanningReads() { + return new BAMFileSpan(new Chunk(mFirstRecordPointer,Long.MAX_VALUE)); + } + + /** + * Prepare to iterate through the SAMRecords that match the given interval. + * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed + * before calling any of the methods that return an iterator. + * + * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting + * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate + * matches the specified interval. + * + * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect + * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval. + * + * @param sequence Reference sequence sought. + * @param start Desired SAMRecords must overlap or be contained in the interval specified by start and end. + * A value of zero implies the start of the reference sequence. + * @param end A value of zero implies the end of the reference sequence. + * @param contained If true, the alignments for the SAMRecords must be completely contained in the interval + * specified by start and end. If false, the SAMRecords need only overlap the interval. + * @return Iterator for the matching SAMRecords + */ + CloseableIterator query(final String sequence, final int start, final int end, final boolean contained) { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (!mIsSeekable) { + throw new UnsupportedOperationException("Cannot query stream-based BAM file"); + } + mCurrentIterator = createIndexIterator(sequence, start, end, contained? QueryType.CONTAINED: QueryType.OVERLAPPING); + return mCurrentIterator; + } + + /** + * Prepare to iterate through the SAMRecords with the given alignment start. + * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed + * before calling any of the methods that return an iterator. + * + * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting + * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate + * matches the specified interval. + * + * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect + * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval. + * + * @param sequence Reference sequence sought. + * @param start Alignment start sought. + * @return Iterator for the matching SAMRecords. 
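+     * <p>Illustrative call through the public SAMFileReader API (file and sequence names
+     * are hypothetical, not from the original source):
+     * <pre>
+     *     SAMFileReader reader = new SAMFileReader(new File("sample.bam"));
+     *     CloseableIterator<SAMRecord> it = reader.queryAlignmentStart("chr1", 10000);
+     *     try {
+     *         while (it.hasNext()) {
+     *             SAMRecord rec = it.next();
+     *         }
+     *     } finally {
+     *         it.close();
+     *     }
+     * </pre>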
+ */ + CloseableIterator queryAlignmentStart(final String sequence, final int start) { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (!mIsSeekable) { + throw new UnsupportedOperationException("Cannot query stream-based BAM file"); + } + mCurrentIterator = createIndexIterator(sequence, start, -1, QueryType.STARTING_AT); + return mCurrentIterator; + } + + public CloseableIterator queryUnmapped() { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (!mIsSeekable) { + throw new UnsupportedOperationException("Cannot query stream-based BAM file"); + } + try { + final long startOfLastLinearBin = getIndex().getStartOfLastLinearBin(); + if (startOfLastLinearBin != -1) { + mInputStream.seek(startOfLastLinearBin); + } else { + // No mapped reads in file, just start at the first read in file. + mInputStream.seek(mFirstRecordPointer); + } + mCurrentIterator = new BAMFileIndexUnmappedIterator(); + return mCurrentIterator; + } catch (IOException e) { + throw new RuntimeException("IOException seeking to unmapped reads", e); + } + } + + /** + * Reads the header from the file or stream + * @param source Note that this is used only for reporting errors. + */ + private void readHeader(final String source) + throws IOException { + + final byte[] buffer = new byte[4]; + mStream.readBytes(buffer); + if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) { + throw new IOException("Invalid BAM file header"); + } + + final int headerTextLength = mStream.readInt(); + final String textHeader = mStream.readString(headerTextLength); + final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec(); + headerCodec.setValidationStringency(mValidationStringency); + mFileHeader = headerCodec.decode(new StringLineReader(textHeader), + source); + + final int sequenceCount = mStream.readInt(); + if (mFileHeader.getSequenceDictionary().size() > 0) { + // It is allowed to have binary sequences but no text sequences, so only validate if both are present + if (sequenceCount != mFileHeader.getSequenceDictionary().size()) { + throw new SAMFormatException("Number of sequences in text header (" + + mFileHeader.getSequenceDictionary().size() + + ") != number of sequences in binary header (" + sequenceCount + ") for file " + source); + } + for (int i = 0; i < sequenceCount; i++) { + final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(source); + final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i); + if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) { + throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " + + source); + } + if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) { + throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " + + source); + } + } + } else { + // If only binary sequences are present, copy them into mFileHeader + final List sequences = new ArrayList(sequenceCount); + for (int i = 0; i < sequenceCount; i++) { + sequences.add(readSequenceRecord(source)); + } + mFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences)); + } + } + + /** + * Reads a single binary sequence record from the file or stream + * @param source Note that this 
is used only for reporting errors.
+     */
+    private SAMSequenceRecord readSequenceRecord(final String source) {
+        final int nameLength = mStream.readInt();
+        if (nameLength <= 1) {
+            throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + source);
+        }
+        final String sequenceName = mStream.readString(nameLength - 1);
+        // Skip the null terminator
+        mStream.readByte();
+        final int sequenceLength = mStream.readInt();
+        return new SAMSequenceRecord(SAMSequenceRecord.truncateSequenceName(sequenceName), sequenceLength);
+    }
+
+    /**
+     * Iterator for non-indexed sequential iteration through all SAMRecords in file.
+     * Starting point of iteration is wherever current file position is when the iterator is constructed.
+     */
+    private class BAMFileIterator implements CloseableIterator<SAMRecord> {
+        private SAMRecord mNextRecord = null;
+        private final BAMRecordCodec bamRecordCodec;
+        private long samRecordIndex = 0; // Number of records read so far; used when reporting validation errors
+
+        BAMFileIterator() {
+            this(true);
+        }
+
+        /**
+         * @param advance Trick to enable subclass to do more setup before advancing
+         */
+        BAMFileIterator(final boolean advance) {
+            this.bamRecordCodec = new BAMRecordCodec(getFileHeader(), samRecordFactory);
+            this.bamRecordCodec.setInputStream(BAMFileReader.this.mStream.getInputStream());
+
+            if (advance) {
+                advance();
+            }
+        }
+
+        public void close() {
+            if (mCurrentIterator != null && this != mCurrentIterator) {
+                throw new IllegalStateException("Attempt to close non-current iterator");
+            }
+            mCurrentIterator = null;
+        }
+
+        public boolean hasNext() {
+            return (mNextRecord != null);
+        }
+
+        public SAMRecord next() {
+            final SAMRecord result = mNextRecord;
+            advance();
+            return result;
+        }
+
+        public void remove() {
+            throw new UnsupportedOperationException("Not supported: remove");
+        }
+
+        void advance() {
+            try {
+                mNextRecord = getNextRecord();
+
+                if (mNextRecord != null) {
+                    ++this.samRecordIndex;
+                    // Because some decoding is done lazily, the record needs to remember the validation stringency.
+                    mNextRecord.setValidationStringency(mValidationStringency);
+
+                    if (mValidationStringency != ValidationStringency.SILENT) {
+                        final List<SAMValidationError> validationErrors = mNextRecord.isValid();
+                        SAMUtils.processValidationErrors(validationErrors,
+                                this.samRecordIndex, BAMFileReader.this.getValidationStringency());
+                    }
+                }
+                if (eagerDecode && mNextRecord != null) {
+                    mNextRecord.eagerDecode();
+                }
+            } catch (IOException exc) {
+                throw new RuntimeException(exc.getMessage(), exc);
+            }
+        }
+
+        /**
+         * Read the next record from the input stream.
+         */
+        SAMRecord getNextRecord() throws IOException {
+            final long startCoordinate = mInputStream.getFilePointer();
+            final SAMRecord next = bamRecordCodec.decode();
+            final long stopCoordinate = mInputStream.getFilePointer();
+
+            if(mFileReader != null && next != null)
+                next.setFileSource(new SAMFileSource(mFileReader,new BAMFileSpan(new Chunk(startCoordinate,stopCoordinate))));
+
+            return next;
+        }
+
+        /**
+         * @return The record that will be returned by the next call to next()
+         */
+        protected SAMRecord peek() {
+            return mNextRecord;
+        }
+    }
+
+    /**
+     * Prepare to iterate through SAMRecords matching the target interval.
+     * @param sequence Desired reference sequence.
+     * @param start 1-based start of target interval, inclusive.
+     * @param end 1-based end of target interval, inclusive.
+     * @param queryType contained, overlapping, or starting-at query.
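+     *
+     * Implementation note: the BAM index supplies the chunk boundaries that may contain matching
+     * records, a BAMFileIndexIterator walks those chunks, and a BAMQueryFilteringIterator then
+     * discards records that fall outside the requested interval.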
+ */ + private CloseableIterator createIndexIterator(final String sequence, + final int start, + final int end, + final QueryType queryType) { + long[] filePointers = null; + + // Hit the index to determine the chunk boundaries for the required data. + final SAMFileHeader fileHeader = getFileHeader(); + final int referenceIndex = fileHeader.getSequenceIndex(sequence); + if (referenceIndex != -1) { + final BAMIndex fileIndex = getIndex(); + final BAMFileSpan fileSpan = fileIndex.getSpanOverlapping(referenceIndex, start, end); + filePointers = fileSpan != null ? fileSpan.toCoordinateArray() : null; + } + + // Create an iterator over the above chunk boundaries. + final BAMFileIndexIterator iterator = new BAMFileIndexIterator(filePointers); + + // Add some preprocessing filters for edge-case reads that don't fit into this + // query type. + return new BAMQueryFilteringIterator(iterator,sequence,start,end,queryType); + } + + enum QueryType {CONTAINED, OVERLAPPING, STARTING_AT} + + /** + * Look for BAM index file according to standard naming convention. + * + * @param dataFile BAM file name. + * @return Index file name, or null if not found. + */ + private static File findIndexFile(final File dataFile) { + // If input is foo.bam, look for foo.bai + final String bamExtension = ".bam"; + File indexFile; + final String fileName = dataFile.getName(); + if (fileName.endsWith(bamExtension)) { + final String bai = fileName.substring(0, fileName.length() - bamExtension.length()) + BAMIndex.BAMIndexSuffix; + indexFile = new File(dataFile.getParent(), bai); + if (indexFile.exists()) { + return indexFile; + } + } + + // If foo.bai doesn't exist look for foo.bam.bai + indexFile = new File(dataFile.getParent(), dataFile.getName() + ".bai"); + if (indexFile.exists()) { + return indexFile; + } else { + return null; + } + } + + private class BAMFileIndexIterator extends BAMFileIterator { + + private long[] mFilePointers = null; + private int mFilePointerIndex = 0; + private long mFilePointerLimit = -1; + + /** + * Prepare to iterate through SAMRecords stored in the specified compressed blocks at the given offset. + * @param filePointers the block / offset combination, stored in chunk format. + */ + BAMFileIndexIterator(final long[] filePointers) { + super(false); // delay advance() until after construction + mFilePointers = filePointers; + advance(); + } + + SAMRecord getNextRecord() + throws IOException { + // Advance to next file block if necessary + while (mInputStream.getFilePointer() >= mFilePointerLimit) { + if (mFilePointers == null || + mFilePointerIndex >= mFilePointers.length) { + return null; + } + final long startOffset = mFilePointers[mFilePointerIndex++]; + final long endOffset = mFilePointers[mFilePointerIndex++]; + mInputStream.seek(startOffset); + mFilePointerLimit = endOffset; + } + // Pull next record from stream + return super.getNextRecord(); + } + } + + /** + * A decorating iterator that filters out records that are outside the bounds of the + * given query parameters. + */ + private class BAMQueryFilteringIterator implements CloseableIterator { + /** + * The wrapped iterator. + */ + private final CloseableIterator wrappedIterator; + + /** + * The next record to be returned. Will be null if no such record exists. 
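+         * (This implements a one-record lookahead: hasNext() tests this field, and next() returns
+         * it and then advances to the next record passing the filter.)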
+ */ + private SAMRecord mNextRecord; + + private final int mReferenceIndex; + private final int mRegionStart; + private final int mRegionEnd; + private final QueryType mQueryType; + + public BAMQueryFilteringIterator(final CloseableIterator iterator,final String sequence, final int start, final int end, final QueryType queryType) { + this.wrappedIterator = iterator; + final SAMFileHeader fileHeader = getFileHeader(); + mReferenceIndex = fileHeader.getSequenceIndex(sequence); + mRegionStart = start; + if (queryType == QueryType.STARTING_AT) { + mRegionEnd = mRegionStart; + } else { + mRegionEnd = (end <= 0) ? Integer.MAX_VALUE : end; + } + mQueryType = queryType; + mNextRecord = advance(); + } + + /** + * Returns true if a next element exists; false otherwise. + */ + public boolean hasNext() { + return mNextRecord != null; + } + + /** + * Gets the next record from the given iterator. + * @return The next SAM record in the iterator. + */ + public SAMRecord next() { + if(!hasNext()) + throw new NoSuchElementException("BAMQueryFilteringIterator: no next element available"); + final SAMRecord currentRead = mNextRecord; + mNextRecord = advance(); + return currentRead; + } + + /** + * Closes down the existing iterator. + */ + public void close() { + if (this != mCurrentIterator) { + throw new IllegalStateException("Attempt to close non-current iterator"); + } + mCurrentIterator = null; + } + + /** + * @throws UnsupportedOperationException always. + */ + public void remove() { + throw new UnsupportedOperationException("Not supported: remove"); + } + + SAMRecord advance() { + while (true) { + // Pull next record from stream + if(!wrappedIterator.hasNext()) + return null; + + final SAMRecord record = wrappedIterator.next(); + // If beyond the end of this reference sequence, end iteration + final int referenceIndex = record.getReferenceIndex(); + if (referenceIndex != mReferenceIndex) { + if (referenceIndex < 0 || + referenceIndex > mReferenceIndex) { + return null; + } + // If before this reference sequence, continue + continue; + } + if (mRegionStart == 0 && mRegionEnd == Integer.MAX_VALUE) { + // Quick exit to avoid expensive alignment end calculation + return record; + } + final int alignmentStart = record.getAlignmentStart(); + // If read is unmapped but has a coordinate, return it if the coordinate is within + // the query region, regardless of whether the mapped mate will be returned. + final int alignmentEnd; + if (mQueryType == QueryType.STARTING_AT) { + alignmentEnd = -1; + } else { + alignmentEnd = (record.getAlignmentEnd() != SAMRecord.NO_ALIGNMENT_START? 
+ record.getAlignmentEnd(): alignmentStart); + } + + if (alignmentStart > mRegionEnd) { + // If scanned beyond target region, end iteration + return null; + } + // Filter for overlap with region + if (mQueryType == QueryType.CONTAINED) { + if (alignmentStart >= mRegionStart && alignmentEnd <= mRegionEnd) { + return record; + } + } else if (mQueryType == QueryType.OVERLAPPING) { + if (alignmentEnd >= mRegionStart && alignmentStart <= mRegionEnd) { + return record; + } + } else { + if (alignmentStart == mRegionStart) { + return record; + } + } + } + } + } + + private class BAMFileIndexUnmappedIterator extends BAMFileIterator { + private BAMFileIndexUnmappedIterator() { + while (this.hasNext() && peek().getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + advance(); + } + } + } + +} diff --git a/public/java/src/net/sf/samtools/GATKBAMFileSpan.java b/public/java/src/net/sf/samtools/GATKBAMFileSpan.java index 623f46291e..4692c66711 100644 --- a/public/java/src/net/sf/samtools/GATKBAMFileSpan.java +++ b/public/java/src/net/sf/samtools/GATKBAMFileSpan.java @@ -25,6 +25,7 @@ package net.sf.samtools; import net.sf.picard.util.PeekableIterator; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.ArrayList; import java.util.Arrays; @@ -47,6 +48,18 @@ public GATKBAMFileSpan() { super(); } + /** + * Create a new GATKBAMFileSpan from an existing BAMFileSpan. + * @param sourceFileSpan + */ + public GATKBAMFileSpan(SAMFileSpan sourceFileSpan) { + if(!(sourceFileSpan instanceof BAMFileSpan)) + throw new SAMException("Unable to create GATKBAMFileSpan from a SAMFileSpan. Please submit a BAMFileSpan instead"); + BAMFileSpan sourceBAMFileSpan = (BAMFileSpan)sourceFileSpan; + for(Chunk chunk: sourceBAMFileSpan.getChunks()) + add(chunk instanceof GATKChunk ? chunk : new GATKChunk(chunk)); + } + /** * Convenience constructor to construct a BAM file span from * a single chunk. diff --git a/public/java/src/net/sf/samtools/GATKChunk.java b/public/java/src/net/sf/samtools/GATKChunk.java index f590809e20..5d349e72e6 100644 --- a/public/java/src/net/sf/samtools/GATKChunk.java +++ b/public/java/src/net/sf/samtools/GATKChunk.java @@ -69,6 +69,22 @@ public void setChunkEnd(final long value) { super.setChunkEnd(value); } + public long getBlockStart() { + return getChunkStart() >>> 16; + } + + public int getBlockOffsetStart() { + return (int)(getChunkStart() & 0xFFFF); + } + + public long getBlockEnd() { + return getChunkEnd() >>> 16; + } + + public int getBlockOffsetEnd() { + return ((int)getChunkEnd() & 0xFFFF); + } + /** * Computes an approximation of the uncompressed size of the * chunk, in bytes. 
Can be used to determine relative weights
diff --git a/public/java/src/net/sf/samtools/util/BAMInputStream.java b/public/java/src/net/sf/samtools/util/BAMInputStream.java
new file mode 100644
index 0000000000..d825c23d51
--- /dev/null
+++ b/public/java/src/net/sf/samtools/util/BAMInputStream.java
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package net.sf.samtools.util;
+
+import java.io.IOException;
+
+/**
+ * An input stream formulated for use reading BAM files. Supports seeking to positions expressed
+ * as BGZF virtual file pointers.
+ */
+public interface BAMInputStream {
+    /**
+     * Seek to the given position in the file. Note that pos is a special virtual file pointer,
+     * not an actual byte offset.
+     *
+     * @param pos virtual file pointer
+     */
+    public void seek(final long pos) throws IOException;
+
+    /**
+     * @return virtual file pointer that can be passed to seek() to return to the current position. This is
+     * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
+     * the two.
+     */
+    public long getFilePointer();
+
+    /**
+     * Determines whether or not the inflater will re-calculate the CRC on the decompressed data
+     * and check it against the value stored in the GZIP header. CRC checking is an expensive
+     * operation and should be used accordingly.
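+     * (Disabling the check trades detection of corrupt compressed blocks for decompression speed.)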
+ */ + public void setCheckCrcs(final boolean check); + + public int read() throws java.io.IOException; + + public int read(byte[] bytes) throws java.io.IOException; + + public int read(byte[] bytes, int i, int i1) throws java.io.IOException; + + public long skip(long l) throws java.io.IOException; + + public int available() throws java.io.IOException; + + public void close() throws java.io.IOException; + + public void mark(int i); + + public void reset() throws java.io.IOException; + + public boolean markSupported(); +} diff --git a/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java b/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java new file mode 100755 index 0000000000..fae2fc89b4 --- /dev/null +++ b/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java @@ -0,0 +1,483 @@ +/* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +package net.sf.samtools.util; + + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.RandomAccessFile; +import java.net.URL; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Arrays; + +import net.sf.samtools.FileTruncatedException; + +/* + * Utility class for reading BGZF block compressed files. The caller can treat this file like any other InputStream. + * It probably is not necessary to wrap this stream in a buffering stream, because there is internal buffering. + * The advantage of BGZF over conventional GZip format is that BGZF allows for seeking without having to read the + * entire file up to the location being sought. Note that seeking is only possible if the ctor(File) is used. + * + * c.f. http://samtools.sourceforge.net/SAM1.pdf for details of BGZF format + */ +public class BlockCompressedInputStream extends InputStream implements BAMInputStream { + private InputStream mStream = null; + private SeekableStream mFile = null; + private byte[] mFileBuffer = null; + private byte[] mCurrentBlock = null; + private int mCurrentOffset = 0; + private long mBlockAddress = 0; + private int mLastBlockLength = 0; + private final BlockGunzipper blockGunzipper = new BlockGunzipper(); + + + /** + * Note that seek() is not supported if this ctor is used. 
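+     * Callers that need random access should use one of the File, URL, or SeekableStream
+     * constructors below instead.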
+     */
+    public BlockCompressedInputStream(final InputStream stream) {
+        mStream = IOUtil.toBufferedStream(stream);
+        mFile = null;
+    }
+
+    /**
+     * Use this ctor if you wish to call seek()
+     */
+    public BlockCompressedInputStream(final File file)
+        throws IOException {
+        mFile = new SeekableFileStream(file);
+        mStream = null;
+
+    }
+
+    public BlockCompressedInputStream(final URL url) {
+        mFile = new SeekableBufferedStream(new SeekableHTTPStream(url));
+        mStream = null;
+    }
+
+    /**
+     * For providing some arbitrary data source. No additional buffering is
+     * provided, so if the underlying source is not buffered, wrap it in a
+     * SeekableBufferedStream before passing to this ctor.
+     */
+    public BlockCompressedInputStream(final SeekableStream strm) {
+        mFile = strm;
+        mStream = null;
+    }
+
+    /**
+     * Determines whether or not the inflater will re-calculate the CRC on the decompressed data
+     * and check it against the value stored in the GZIP header. CRC checking is an expensive
+     * operation and should be used accordingly.
+     */
+    public void setCheckCrcs(final boolean check) {
+        this.blockGunzipper.setCheckCrcs(check);
+    }
+
+    /**
+     * @return the number of bytes that can be read (or skipped over) from this input stream without blocking by the
+     * next caller of a method for this input stream. The next caller might be the same thread or another thread.
+     * Note that although the next caller can read this many bytes without blocking, the available() method call itself
+     * may block in order to fill an internal buffer if it has been exhausted.
+     */
+    public int available()
+        throws IOException {
+        if (mCurrentBlock == null || mCurrentOffset == mCurrentBlock.length) {
+            readBlock();
+        }
+        if (mCurrentBlock == null) {
+            return 0;
+        }
+        return mCurrentBlock.length - mCurrentOffset;
+    }
+
+    /**
+     * Closes the underlying InputStream or RandomAccessFile
+     */
+    public void close()
+        throws IOException {
+        if (mFile != null) {
+            mFile.close();
+            mFile = null;
+        } else if (mStream != null) {
+            mStream.close();
+            mStream = null;
+        }
+        // Encourage garbage collection
+        mFileBuffer = null;
+        mCurrentBlock = null;
+    }
+
+    /**
+     * Reads the next byte of data from the input stream. The value byte is returned as an int in the range 0 to 255.
+     * If no byte is available because the end of the stream has been reached, the value -1 is returned.
+     * This method blocks until input data is available, the end of the stream is detected, or an exception is thrown.
+
+     * @return the next byte of data, or -1 if the end of the stream is reached.
+     */
+    public int read()
+        throws IOException {
+        // Mask to an unsigned value so the contract above (0 to 255) holds for bytes >= 0x80.
+        return (available() > 0) ? (mCurrentBlock[mCurrentOffset++] & 0xFF) : -1;
+    }
+
+    /**
+     * Reads some number of bytes from the input stream and stores them into the buffer array b. The number of bytes
+     * actually read is returned as an integer. This method blocks until input data is available, end of file is detected,
+     * or an exception is thrown.
+     *
+     * read(buf) has the same effect as read(buf, 0, buf.length).
+     *
+     * @param buffer the buffer into which the data is read.
+     * @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of
+     * the stream has been reached.
+     */
+    public int read(final byte[] buffer)
+        throws IOException {
+        return read(buffer, 0, buffer.length);
+    }
+
+    private volatile ByteArrayOutputStream buf = null;
+    private static final byte eol = '\n';
+    private static final byte eolCr = '\r';
+
+    /**
+     * Reads a whole line.
A line is considered to be terminated by either a line feed ('\n'),
+     * carriage return ('\r') or carriage return followed by a line feed ("\r\n").
+     *
+     * @return A String containing the contents of the line, excluding the line terminating
+     * character, or null if the end of the stream has been reached
+     *
+     * @exception IOException If an I/O error occurs
+     */
+    public String readLine() throws IOException {
+        int available = available();
+        if (available == 0) {
+            return null;
+        }
+        if(null == buf){ // lazy initialisation
+            buf = new ByteArrayOutputStream(8192);
+        }
+        buf.reset();
+        boolean done = false;
+        boolean foundCr = false; // \r found flag
+        while (!done) {
+            int linetmpPos = mCurrentOffset;
+            int bCnt = 0;
+            while((available-- > 0)){
+                final byte c = mCurrentBlock[linetmpPos++];
+                if(c == eol){ // found \n
+                    done = true;
+                    break;
+                } else if(foundCr){ // previous char was \r
+                    --linetmpPos; // current char is not \n so put it back
+                    done = true;
+                    break;
+                } else if(c == eolCr){ // found \r
+                    foundCr = true;
+                    continue; // no ++bCnt
+                }
+                ++bCnt;
+            }
+            if(mCurrentOffset < linetmpPos){
+                buf.write(mCurrentBlock, mCurrentOffset, bCnt);
+                mCurrentOffset = linetmpPos;
+            }
+            available = available();
+            if(available == 0){
+                // EOF
+                done = true;
+            }
+        }
+        return buf.toString();
+    }
+
+    /**
+     * Reads up to len bytes of data from the input stream into an array of bytes. An attempt is made to read
+     * as many as len bytes, but a smaller number may be read. The number of bytes actually read is returned as an integer.
+     *
+     * This method blocks until input data is available, end of file is detected, or an exception is thrown.
+     *
+     * @param buffer buffer into which data is read.
+     * @param offset the start offset in array b at which the data is written.
+     * @param length the maximum number of bytes to read.
+     * @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of
+     * the stream has been reached.
+     */
+    public int read(final byte[] buffer, int offset, int length)
+        throws IOException {
+        final int originalLength = length;
+        while (length > 0) {
+            final int available = available();
+            if (available == 0) {
+                // Signal EOF to caller
+                if (originalLength == length) {
+                    return -1;
+                }
+                break;
+            }
+            final int copyLength = Math.min(length, available);
+            System.arraycopy(mCurrentBlock, mCurrentOffset, buffer, offset, copyLength);
+            mCurrentOffset += copyLength;
+            offset += copyLength;
+            length -= copyLength;
+        }
+        return originalLength - length;
+    }
+
+    /**
+     * Seek to the given position in the file. Note that pos is a special virtual file pointer,
+     * not an actual byte offset.
+     *
+     * @param pos virtual file pointer
+     */
+    public void seek(final long pos)
+        throws IOException {
+        if (mFile == null) {
+            throw new IOException("Cannot seek on stream based file");
+        }
+        // Decode virtual file pointer
+        // Upper 48 bits is the byte offset into the compressed stream of a block.
+        // Lower 16 bits is the byte offset into the uncompressed stream inside the block.
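+        // Worked example with a made-up pointer value: pos = 0x0000000001230045 decodes to
+        // compressed block address 0x123 (pos >>> 16) and uncompressed offset 0x45 within
+        // that block (pos & 0xFFFF).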
+ final long compressedOffset = BlockCompressedFilePointerUtil.getBlockAddress(pos); + final int uncompressedOffset = BlockCompressedFilePointerUtil.getBlockOffset(pos); + final int available; + if (mBlockAddress == compressedOffset && mCurrentBlock != null) { + available = mCurrentBlock.length; + } else { + mFile.seek(compressedOffset); + mBlockAddress = compressedOffset; + mLastBlockLength = 0; + readBlock(); + available = available(); + } + if (uncompressedOffset > available || + (uncompressedOffset == available && !eof())) { + throw new IOException("Invalid file pointer: " + pos); + } + mCurrentOffset = uncompressedOffset; + } + + private boolean eof() throws IOException { + if (mFile.eof()) { + return true; + } + // If the last remaining block is the size of the EMPTY_GZIP_BLOCK, this is the same as being at EOF. + return (mFile.length() - (mBlockAddress + mLastBlockLength) == BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length); + } + + /** + * @return virtual file pointer that can be passed to seek() to return to the current position. This is + * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between + * the two. + */ + public long getFilePointer() { + if (mCurrentOffset == mCurrentBlock.length) { + // If current offset is at the end of the current block, file pointer should point + // to the beginning of the next block. + return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress + mLastBlockLength, 0); + } + return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress, mCurrentOffset); + } + + public static long getFileBlock(final long bgzfOffset) { + return BlockCompressedFilePointerUtil.getBlockAddress(bgzfOffset); + } + + /** + * @param stream Must be at start of file. Throws RuntimeException if !stream.markSupported(). + * @return true if the given file looks like a valid BGZF file. + */ + public static boolean isValidFile(final InputStream stream) + throws IOException { + if (!stream.markSupported()) { + throw new RuntimeException("Cannot test non-buffered stream"); + } + stream.mark(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + final byte[] buffer = new byte[BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH]; + final int count = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + stream.reset(); + return count == BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH && isValidBlockHeader(buffer); + } + + private static boolean isValidBlockHeader(final byte[] buffer) { + return (buffer[0] == BlockCompressedStreamConstants.GZIP_ID1 && + (buffer[1] & 0xFF) == BlockCompressedStreamConstants.GZIP_ID2 && + (buffer[3] & BlockCompressedStreamConstants.GZIP_FLG) != 0 && + buffer[10] == BlockCompressedStreamConstants.GZIP_XLEN && + buffer[12] == BlockCompressedStreamConstants.BGZF_ID1 && + buffer[13] == BlockCompressedStreamConstants.BGZF_ID2); + } + + private void readBlock() + throws IOException { + + if (mFileBuffer == null) { + mFileBuffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE]; + } + int count = readBytes(mFileBuffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + if (count == 0) { + // Handle case where there is no empty gzip block at end. 
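+            // (count == 0 means the stream ended cleanly on a block boundary; synthesizing an
+            // empty current block makes available() report 0 and read() return -1.)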
+ mCurrentOffset = 0; + mBlockAddress += mLastBlockLength; + mCurrentBlock = new byte[0]; + return; + } + if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) { + throw new IOException("Premature end of file"); + } + final int blockLength = unpackInt16(mFileBuffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1; + if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > mFileBuffer.length) { + throw new IOException("Unexpected compressed block length: " + blockLength); + } + final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH; + count = readBytes(mFileBuffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, remaining); + if (count != remaining) { + throw new FileTruncatedException("Premature end of file"); + } + inflateBlock(mFileBuffer, blockLength); + mCurrentOffset = 0; + mBlockAddress += mLastBlockLength; + mLastBlockLength = blockLength; + } + + private void inflateBlock(final byte[] compressedBlock, final int compressedLength) + throws IOException { + final int uncompressedLength = unpackInt32(compressedBlock, compressedLength-4); + byte[] buffer = mCurrentBlock; + mCurrentBlock = null; + if (buffer == null || buffer.length != uncompressedLength) { + try { + buffer = new byte[uncompressedLength]; + } catch (NegativeArraySizeException e) { + throw new RuntimeException("BGZF file has invalid uncompressedLength: " + uncompressedLength, e); + } + } + blockGunzipper.unzipBlock(buffer, compressedBlock, compressedLength); + mCurrentBlock = buffer; + } + + private int readBytes(final byte[] buffer, final int offset, final int length) + throws IOException { + if (mFile != null) { + return readBytes(mFile, buffer, offset, length); + } else if (mStream != null) { + return readBytes(mStream, buffer, offset, length); + } else { + return 0; + } + } + + private static int readBytes(final SeekableStream file, final byte[] buffer, final int offset, final int length) + throws IOException { + int bytesRead = 0; + while (bytesRead < length) { + final int count = file.read(buffer, offset + bytesRead, length - bytesRead); + if (count <= 0) { + break; + } + bytesRead += count; + } + return bytesRead; + } + + private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length) + throws IOException { + int bytesRead = 0; + while (bytesRead < length) { + final int count = stream.read(buffer, offset + bytesRead, length - bytesRead); + if (count <= 0) { + break; + } + bytesRead += count; + } + return bytesRead; + } + + private int unpackInt16(final byte[] buffer, final int offset) { + return ((buffer[offset] & 0xFF) | + ((buffer[offset+1] & 0xFF) << 8)); + } + + private int unpackInt32(final byte[] buffer, final int offset) { + return ((buffer[offset] & 0xFF) | + ((buffer[offset+1] & 0xFF) << 8) | + ((buffer[offset+2] & 0xFF) << 16) | + ((buffer[offset+3] & 0xFF) << 24)); + } + + public enum FileTermination {HAS_TERMINATOR_BLOCK, HAS_HEALTHY_LAST_BLOCK, DEFECTIVE} + + public static FileTermination checkTermination(final File file) + throws IOException { + final long fileSize = file.length(); + if (fileSize < BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length) { + return FileTermination.DEFECTIVE; + } + final RandomAccessFile raFile = new RandomAccessFile(file, "r"); + try { + raFile.seek(fileSize - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length); + byte[] buf = new byte[BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length]; + raFile.readFully(buf); + if 
(Arrays.equals(buf, BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK)) {
+                return FileTermination.HAS_TERMINATOR_BLOCK;
+            }
+            final int bufsize = (int)Math.min(fileSize, BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE);
+            buf = new byte[bufsize];
+            raFile.seek(fileSize - bufsize);
+            raFile.read(buf);
+            for (int i = buf.length - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length;
+                    i >= 0; --i) {
+                if (!preambleEqual(BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE,
+                        buf, i, BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length)) {
+                    continue;
+                }
+                final ByteBuffer byteBuffer = ByteBuffer.wrap(buf, i + BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length, 4);
+                byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
+                final int totalBlockSizeMinusOne = byteBuffer.getShort() & 0xFFFF;
+                if (buf.length - i == totalBlockSizeMinusOne + 1) {
+                    return FileTermination.HAS_HEALTHY_LAST_BLOCK;
+                } else {
+                    return FileTermination.DEFECTIVE;
+                }
+            }
+            return FileTermination.DEFECTIVE;
+        } finally {
+            raFile.close();
+        }
+    }
+
+    private static boolean preambleEqual(final byte[] preamble, final byte[] buf, final int startOffset, final int length) {
+        for (int i = 0; i < length; ++i) {
+            if (preamble[i] != buf[i + startOffset]) {
+                return false;
+            }
+        }
+        return true;
+    }
+}
+
+
diff --git a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java
index bed1e710e6..9e1be5bca1 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java
@@ -331,12 +331,12 @@ private static void errorPrintf(String format, Object... s) {
     * used to indicate an error occurred
     *
     * @param msg the message
-     * @param e the error
+     * @param t the error
     */
-    public static void exitSystemWithError(String msg, final Exception e) {
+    public static void exitSystemWithError(String msg, final Throwable t) {
        errorPrintf("------------------------------------------------------------------------------------------%n");
        errorPrintf("stack trace %n");
-        e.printStackTrace();
+        t.printStackTrace();

        errorPrintf("------------------------------------------------------------------------------------------%n");
        errorPrintf("A GATK RUNTIME ERROR has occurred (version %s):%n", CommandLineGATK.getVersionNumber());
@@ -392,10 +392,10 @@ public static void exitSystemWithSamError(final Exception e) {
    /**
     * used to indicate an error occurred
     *
-     * @param e the exception occured
+     * @param t the exception that occurred
     */
-    public static void exitSystemWithError(Exception e) {
-        exitSystemWithError(e.getMessage(), e);
+    public static void exitSystemWithError(Throwable t) {
+        exitSystemWithError(t.getMessage(), t);
    }

    /**
diff --git a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java
index 86ca6c2dfb..9e2c9a8183 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java
@@ -45,7 +45,7 @@
 *
 * The IntervalBinding is a formal GATK argument that bridges between a walker and
 * the engine to construct intervals for traversal at runtime. The IntervalBinding can
- * either be a RodBinding, a string of one or more intervals, or a file with interval strings.
+ * either be a RodBinding, a string of one interval, or a file with interval strings.
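+ * (For example, a hypothetical interval string would be "chr1:100-200"; an interval file
+ * supplies one such string per line.)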
 * The GATK Engine takes care of initializing the binding when appropriate and determining intervals from it.
 *
 * Note that this class is immutable.
@@ -92,7 +92,10 @@ public List getIntervals(GenomeAnalysisEngine toolkit) {
                codec.readHeader(lineReader);
                String line = lineReader.readLine();
                while ( line != null ) {
-                    intervals.add(toolkit.getGenomeLocParser().createGenomeLoc(codec.decodeLoc(line)));
+                    final Feature feature = codec.decodeLoc(line);
+                    if ( feature == null )
+                        throw new UserException.MalformedFile(featureIntervals.getSource(), "Couldn't parse line '" + line + "'");
+                    intervals.add(toolkit.getGenomeLocParser().createGenomeLoc(feature));
                    line = lineReader.readLine();
                }
            } catch (IOException e) {
@@ -105,4 +108,8 @@ public List getIntervals(GenomeAnalysisEngine toolkit) {

        return intervals;
    }
+
+    public String toString() {
+        return getSource();
+    }
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
index b8488dc9a1..b4d337d8df 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
@@ -30,7 +30,6 @@
 import org.broadinstitute.sting.commandline.ArgumentCollection;
 import org.broadinstitute.sting.commandline.CommandLineProgram;
 import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
-import org.broadinstitute.sting.gatk.filters.ReadFilter;
 import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager;
 import org.broadinstitute.sting.gatk.walkers.Attribution;
 import org.broadinstitute.sting.gatk.walkers.Walker;
@@ -97,13 +96,20 @@ public static void main(String[] argv) {
            // lazy loaded, so they aren't caught elsewhere and made into User Exceptions
            exitSystemWithUserError(e);
        } catch (net.sf.samtools.SAMException e) {
-            // Let's try this out and see how it is received by our users
+            checkForTooManyOpenFilesProblem(e.getMessage());
            exitSystemWithSamError(e);
-        } catch (Exception e) {
-            exitSystemWithError(e);
+        } catch (Throwable t) {
+            checkForTooManyOpenFilesProblem(t.getMessage());
+            exitSystemWithError(t);
        }
    }

+    private static void checkForTooManyOpenFilesProblem(String message) {
+        // Special case the "Too many open files" error because it's a common User Error for which we know what to do
+        if ( message != null && message.indexOf("Too many open files") != -1 )
+            exitSystemWithUserError(new UserException.TooManyOpenFiles());
+    }
+
    /**
     * Creates a short blurb about the GATK, copyright info, and where to get documentation.
* diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index f8e87aa586..f954d76501 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -35,6 +35,7 @@ import org.broadinstitute.sting.gatk.datasources.reads.*; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.gatk.executive.MicroScheduler; import org.broadinstitute.sting.gatk.filters.FilterManager; @@ -126,6 +127,11 @@ public void setIntervals( GenomeLocSortedSet intervals ) { */ private Collection filters; + /** + * Controls the allocation of threads between CPU vs IO. + */ + private ThreadAllocation threadAllocation; + /** * A currently hacky unique name for this GATK instance */ @@ -199,6 +205,9 @@ public Object execute() { if (this.getArguments().nonDeterministicRandomSeed) resetRandomGenerator(System.currentTimeMillis()); + // Determine how the threads should be divided between CPU vs. IO. + determineThreadAllocation(); + // Prepare the data for traversal. initializeDataSources(); @@ -218,7 +227,7 @@ public Object execute() { // create the output streams " initializeOutputStreams(microScheduler.getOutputTracker()); - ShardStrategy shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals); + Iterable shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals); // execute the microscheduler, storing the results return microScheduler.execute(this.walker, shardStrategy); @@ -266,6 +275,32 @@ public Collection createFilters() { return Collections.unmodifiableList(filters); } + /** + * Parse out the thread allocation from the given command-line argument. + */ + private void determineThreadAllocation() { + Tags tags = parsingEngine.getTags(argCollection.numberOfThreads); + + // TODO: Kill this complicated logic once Queue supports arbitrary tagged parameters. + Integer numCPUThreads = null; + if(tags.containsKey("cpu") && argCollection.numberOfCPUThreads != null) + throw new UserException("Number of CPU threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other."); + else if(tags.containsKey("cpu")) + numCPUThreads = Integer.parseInt(tags.getValue("cpu")); + else if(argCollection.numberOfCPUThreads != null) + numCPUThreads = argCollection.numberOfCPUThreads; + + Integer numIOThreads = null; + if(tags.containsKey("io") && argCollection.numberOfIOThreads != null) + throw new UserException("Number of IO threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other."); + else if(tags.containsKey("io")) + numIOThreads = Integer.parseInt(tags.getValue("io")); + else if(argCollection.numberOfIOThreads != null) + numIOThreads = argCollection.numberOfIOThreads; + + this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads,numCPUThreads,numIOThreads); + } + /** * Allow subclasses and others within this package direct access to the walker manager. * @return The walker manager used by this package. 
@@ -286,7 +321,7 @@ private MicroScheduler createMicroscheduler() { throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given"); } - return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),this.getArguments().numberOfThreads); + return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),threadAllocation); } protected DownsamplingMethod getDownsamplingMethod() { @@ -397,103 +432,52 @@ protected void validateSuppliedIntervals() { * @param intervals intervals * @return the sharding strategy */ - protected ShardStrategy getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) { + protected Iterable getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) { ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null); ReferenceDataSource referenceDataSource = this.getReferenceDataSource(); - // Use monolithic sharding if no index is present. Monolithic sharding is always required for the original - // sharding system; it's required with the new sharding system only for locus walkers. - if(readsDataSource != null && !readsDataSource.hasIndex() ) { - if(!exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM)) + + // If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition. + if(!readsDataSource.isEmpty()) { + if(!readsDataSource.hasIndex() && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM)) throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported."); - if(intervals != null && !argCollection.allowIntervalsWithUnindexedBAM) + if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM) throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available."); - Shard.ShardType shardType; if(walker instanceof LocusWalker) { if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. 
Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); - shardType = Shard.ShardType.LOCUS; + if(intervals == null) + return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer()); } - else if(walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker) - shardType = Shard.ShardType.READ; - else - throw new UserException.CommandLineException("The GATK cannot currently process unindexed BAM files"); - - List region; - if(intervals != null) - region = intervals.toList(); - else { - region = new ArrayList(); - for(SAMSequenceRecord sequenceRecord: drivingDataSource.getSequenceDictionary().getSequences()) - region.add(getGenomeLocParser().createGenomeLoc(sequenceRecord.getSequenceName(),1,sequenceRecord.getSequenceLength())); + else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) { + // Apply special validation to read pair walkers. + if(walker instanceof ReadPairWalker) { + if(readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname) + throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker."); + if(intervals != null && !intervals.isEmpty()) + throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); + } + + if(intervals == null) + return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(intervals,new ReadShardBalancer()); } - - return new MonolithicShardStrategy(getGenomeLocParser(), readsDataSource,shardType,region); + else + throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName()); + } + else { + // TODO -- Determine what the ideal shard size should be here. Matt suggested that a multiple of 16K might work well + // TODO -- (because of how VCF indexes work), but my empirical experience has been simply that the larger the shard + // TODO -- size the more efficient the traversal (at least for RODWalkers). Keeping the previous values for now. [EB] + final int SHARD_SIZE = walker instanceof RodWalker ? 1000000 : 100000; + if(intervals == null) + return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE); + else + return referenceDataSource.createShardsOverIntervals(readsDataSource,intervals,SHARD_SIZE); } - - ShardStrategy shardStrategy; - ShardStrategyFactory.SHATTER_STRATEGY shardType; - - long SHARD_SIZE = 100000L; - - if (walker instanceof LocusWalker) { - if (walker instanceof RodWalker) SHARD_SIZE *= 1000; - - if (intervals != null && !intervals.isEmpty()) { - if (readsDataSource == null) - throw new IllegalArgumentException("readsDataSource is null"); - if(!readsDataSource.isEmpty() && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. 
Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); - - shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource.getReference(), - ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, - getGenomeLocParser(), - intervals); - } else - shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource.getReference(), - ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE,getGenomeLocParser()); - } else if (walker instanceof ReadWalker || - walker instanceof DuplicateWalker) { - shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL; - - if (intervals != null && !intervals.isEmpty()) { - shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource.getReference(), - shardType, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, - getGenomeLocParser(), - intervals); - } else { - shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource.getReference(), - shardType, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, - getGenomeLocParser()); - } - } else if (walker instanceof ReadPairWalker) { - if(readsDataSource != null && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname) - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers can only walk over query name-sorted data. Please resort your input BAM file."); - if(intervals != null && !intervals.isEmpty()) - throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); - - shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource.getReference(), - ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, - getGenomeLocParser()); - } else - throw new ReviewedStingException("Unable to support walker of type" + walker.getClass().getName()); - - return shardStrategy; } protected boolean flashbackData() { @@ -751,6 +735,8 @@ private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection return new SAMDataSource( samReaderIDs, + threadAllocation, + argCollection.numberOfBAMFileHandles, genomeLocParser, argCollection.useOriginalBaseQualities, argCollection.strictnessLevel, @@ -763,8 +749,7 @@ private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? 
argCollection.BAQMode : BAQ.CalculationMode.OFF, getWalkerBAQQualityMode(), refReader, - argCollection.defaultBaseQualities, - !argCollection.disableLowMemorySharding); + argCollection.defaultBaseQualities); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java index 93fa2d146c..daa8ff60db 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java +++ b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java @@ -30,7 +30,6 @@ public class ReadProperties { private Collection readers = null; private SAMFileHeader header = null; private SAMFileReader.ValidationStringency validationStringency = SAMFileReader.ValidationStringency.STRICT; - private Integer readBufferSize = null; private DownsamplingMethod downsamplingMethod = null; private ValidationExclusion exclusionList = null; private Collection supplementalFilters = null; @@ -91,14 +90,6 @@ public SAMFileReader.ValidationStringency getValidationStringency() { return validationStringency; } - /** - * Gets a list of the total number of reads that the sharding system should buffer per BAM file. - * @return - */ - public Integer getReadBufferSize() { - return readBufferSize; - } - /** * Gets the method and parameters used when downsampling reads. * @return Downsample fraction. @@ -150,7 +141,6 @@ public byte defaultBaseQualities() { * @param header sam file header. * @param useOriginalBaseQualities True if original base qualities should be used. * @param strictness Stringency of reads file parsing. - * @param readBufferSize Number of reads to hold in memory per BAM. * @param downsamplingMethod Method for downsampling reads at a given locus. * @param exclusionList what safety checks we're willing to let slide * @param supplementalFilters additional filters to dynamically apply. @@ -169,7 +159,6 @@ public ReadProperties( Collection samFiles, SAMFileHeader header, boolean useOriginalBaseQualities, SAMFileReader.ValidationStringency strictness, - Integer readBufferSize, DownsamplingMethod downsamplingMethod, ValidationExclusion exclusionList, Collection supplementalFilters, @@ -181,7 +170,6 @@ public ReadProperties( Collection samFiles, byte defaultBaseQualities) { this.readers = samFiles; this.header = header; - this.readBufferSize = readBufferSize; this.validationStringency = strictness; this.downsamplingMethod = downsamplingMethod == null ? DownsamplingMethod.NONE : downsamplingMethod; this.exclusionList = exclusionList == null ? new ValidationExclusion() : exclusionList; diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 8078a1ea42..08d2c1ad15 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -194,10 +194,25 @@ public static DownsamplingMethod getDefaultDownsamplingMethod() { @Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. 
We do not support usage of this argument.", required = false)
    public ValidationExclusion.TYPE unsafe;

-    @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis", required = false)
-    public int numberOfThreads = 1;
+    /** How many threads should be allocated to this analysis. */
+    @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false)
+    public Integer numberOfThreads = 1;

-    @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line", required = false)
+    /**
+     * The following two arguments (num_cpu_threads, num_io_threads) are TEMPORARY since Queue cannot currently support arbitrary tagged data types.
+     * TODO: Kill this when I can do a tagged integer in Queue.
+     */
+    @Argument(fullName="num_cpu_threads", shortName = "nct", doc="How many of the given threads should be allocated to the CPU", required = false)
+    @Hidden
+    public Integer numberOfCPUThreads = null;
+    @Argument(fullName="num_io_threads", shortName = "nit", doc="How many of the given threads should be allocated to IO", required = false)
+    @Hidden
+    public Integer numberOfIOThreads = null;
+
+    @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false)
+    public Integer numberOfBAMFileHandles = null;
+
+    @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line.", required = false)
    public List readGroupBlackList = null;

// --------------------------------------------------------------------------------------------------------------
@@ -292,9 +307,6 @@ public static DownsamplingMethod getDefaultDownsamplingMethod() {
    @Hidden
    public boolean allowIntervalsWithUnindexedBAM = false;

-    @Argument(fullName="disable_experimental_low_memory_sharding",doc="Disable experimental low-memory sharding functionality",required=false)
-    public boolean disableLowMemorySharding = false;
-
// --------------------------------------------------------------------------------------------------------------
//
// methods
@@ -365,7 +377,19 @@ public boolean equals(GATKArgumentCollection other) {
            (other.downsampleCoverage != null && !other.downsampleCoverage.equals(this.downsampleCoverage))) {
            return false;
        }
-        if (other.numberOfThreads != this.numberOfThreads) {
+        if (!other.numberOfThreads.equals(this.numberOfThreads)) {
+            return false;
+        }
+        if ((this.numberOfCPUThreads == null && other.numberOfCPUThreads != null) ||
+            (this.numberOfCPUThreads != null && !this.numberOfCPUThreads.equals(other.numberOfCPUThreads)) ) {
+            return false;
+        }
+        if ((this.numberOfIOThreads == null && other.numberOfIOThreads != null) ||
+            (this.numberOfIOThreads != null && !this.numberOfIOThreads.equals(other.numberOfIOThreads)) ) {
+            return false;
+        }
+        if ((other.numberOfBAMFileHandles == null && this.numberOfBAMFileHandles != null) ||
+            (other.numberOfBAMFileHandles != null && !other.numberOfBAMFileHandles.equals(this.numberOfBAMFileHandles))) {
            return false;
        }
        if (other.intervalMerging != this.intervalMerging) {
@@ -389,9 +413,6 @@ public boolean equals(GATKArgumentCollection other) {
        if (allowIntervalsWithUnindexedBAM != other.allowIntervalsWithUnindexedBAM)
            return false;

-        if (disableLowMemorySharding != other.disableLowMemorySharding)
-            return false;
-
        return true;
    }
diff --git
a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java index 4e75f3ddbd..d589f90292 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java @@ -131,7 +131,7 @@ public static Map splitContextByReadGroup( } } - public static Map splitContextBySampleName(ReadBackedPileup pileup, String assumedSingleSample) { + public static Map splitContextBySampleName(ReadBackedPileup pileup) { return splitContextBySampleName(new AlignmentContext(pileup.getLocation(), pileup)); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java index c38b093343..54f8b44edc 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java @@ -80,7 +80,7 @@ public RodLocusView( LocusShardDataProvider provider ) { // grab the ROD iterator from the data source, and compute the first location in this shard, forwarding // the iterator to immediately before it, so that it can be added to the merging iterator primed for // next() to return the first real ROD in this shard - LocationAwareSeekableRODIterator it = dataSource.seek(provider.getShard()); + LocationAwareSeekableRODIterator it = dataSource.seek(provider.getLocus()); it.seekForward(genomeLocParser.createGenomeLoc(loc.getContig(), loc.getStart()-1)); states.add(new ReferenceOrderedDataState(dataSource,it)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java deleted file mode 100644 index de938e8453..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import org.broadinstitute.sting.utils.exceptions.StingException; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.channels.FileChannel; -import java.util.Iterator; - -/** - * Created by IntelliJ IDEA. - * User: mhanna - * Date: Feb 7, 2011 - * Time: 2:46:34 PM - * To change this template use File | Settings | File Templates. - */ -public class BAMBlockStartIterator implements Iterator { - /** - * How large is a BGZF header? - */ - private static int BGZF_HEADER_SIZE = 18; - - /** - * Where within the header does the BLOCKSIZE actually live? - */ - private static int BLOCK_SIZE_HEADER_POSITION = BGZF_HEADER_SIZE - 2; - - private FileChannel bamInputChannel; - private ByteBuffer headerByteBuffer; - - private long nextLocation = 0; - - public BAMBlockStartIterator(File bamFile) { - try { - FileInputStream bamInputStream = new FileInputStream(bamFile); - bamInputChannel = bamInputStream.getChannel(); - - headerByteBuffer = ByteBuffer.allocate(BGZF_HEADER_SIZE); - headerByteBuffer.order(ByteOrder.LITTLE_ENDIAN); - - } - catch(IOException ex) { - throw new StingException("Could not open file",ex); - } - } - - public boolean hasNext() { - return nextLocation != -1; - } - - public Long next() { - long currentLocation = nextLocation; - advance(); - return currentLocation; - } - - public void remove() { - throw new UnsupportedOperationException("Cannot remove from a BAMBlockStartIterator"); - } - - private void advance() { - int readStatus; - - headerByteBuffer.clear(); - try { - readStatus = bamInputChannel.read(headerByteBuffer); - } - catch(IOException ex) { - throw new StingException("Could not read header data",ex); - } - - if(readStatus == -1) { - nextLocation = -1; - try { - bamInputChannel.close(); - } - catch(IOException ex) { - throw new StingException("Could not close input file",ex); - } - return; - } - - headerByteBuffer.position(BLOCK_SIZE_HEADER_POSITION); - int blockSize = headerByteBuffer.getShort(); - - try { - bamInputChannel.position(bamInputChannel.position()+blockSize-BGZF_HEADER_SIZE+1); - nextLocation = bamInputChannel.position(); - } - catch(IOException ex) { - throw new StingException("Could not reposition input stream",ex); - } - } - - public static void main(String argv[]) throws IOException { - BAMBlockStartIterator blockStartIterator = new BAMBlockStartIterator(new File("/Users/mhanna/testdata/reads/MV1994.bam")); - int i = 0; - while(blockStartIterator.hasNext()) - System.out.printf("%d -> %d%n",i++,blockStartIterator.next()); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java deleted file mode 100644 index 4d91fb45f4..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * 
conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.samtools.GATKBin; -import net.sf.samtools.GATKChunk; -import net.sf.samtools.LinearIndex; - -import java.util.*; - -/** - * Represents the contents of a bam index file for one reference. - * A BAM index (.bai) file contains information for all references in the bam file. - * This class describes the data present in the index file for one of these references; - * including the bins, chunks, and linear index. - */ -class BAMIndexContent { - /** - * The reference sequence for the data currently loaded. - */ - private final int mReferenceSequence; - - /** - * A list of all bins in the above reference sequence. - */ - private final BinList mBinList; - - /** - * The linear index for the reference sequence above. - */ - private final LinearIndex mLinearIndex; - - - /** - * @param referenceSequence Content corresponds to this reference. - * @param bins Array of bins represented by this content, possibly sparse - * @param numberOfBins Number of non-null bins - * @param linearIndex Additional index used to optimize queries - */ - BAMIndexContent(final int referenceSequence, final GATKBin[] bins, final int numberOfBins, final LinearIndex linearIndex) { - this.mReferenceSequence = referenceSequence; - this.mBinList = new BinList(bins, numberOfBins); - this.mLinearIndex = linearIndex; - } - - /** - * Reference for this Content - */ - public int getReferenceSequence() { - return mReferenceSequence; - } - - /** - * Does this content have anything in this bin? - */ - public boolean containsBin(final GATKBin bin) { - return mBinList.getBin(bin.getBinNumber()) != null; - } - - /** - * @return iterable list of bins represented by this content - */ - public BinList getBins() { - return mBinList; - } - - /** - * @return the number of non-null bins represented by this content - */ - int getNumberOfNonNullBins() { - return mBinList.getNumberOfNonNullBins(); - } - - /** - * @return all chunks associated with all bins in this content - */ - public List getAllChunks() { - List allChunks = new ArrayList(); - for (GATKBin b : mBinList) - if (b.getChunkList() != null) { - allChunks.addAll(Arrays.asList(b.getChunkList())); - } - return Collections.unmodifiableList(allChunks); - } - - /** - * @return the linear index represented by this content - */ - public LinearIndex getLinearIndex() { - return mLinearIndex; - } - - /** - * This class is used to encapsulate the list of Bins store in the BAMIndexContent - * While it is currently represented as an array, we may decide to change it to an ArrayList or other structure - */ - class BinList implements Iterable { - - private final GATKBin[] mBinArray; - public final int numberOfNonNullBins; - public final int maxBinNumber; // invariant: maxBinNumber = mBinArray.length -1 since array is 0 based - - /** - * @param binArray a sparse array representation of the bins. 
The index into the array is the bin number. - * @param numberOfNonNullBins - */ - BinList(GATKBin[] binArray, int numberOfNonNullBins) { - this.mBinArray = binArray; - this.numberOfNonNullBins = numberOfNonNullBins; - this.maxBinNumber = mBinArray.length - 1; - } - - GATKBin getBin(int binNumber) { - if (binNumber > maxBinNumber) return null; - return mBinArray[binNumber]; - } - - int getNumberOfNonNullBins() { - return numberOfNonNullBins; - } - - /** - * Gets an iterator over all non-null bins. - * - * @return An iterator over all bins. - */ - public Iterator iterator() { - return new BinIterator(); - } - - private class BinIterator implements Iterator { - /** - * Stores the bin # of the Bin currently in use. - */ - private int nextBin; - - public BinIterator() { - nextBin = 0; - } - - /** - * Are there more bins in this set, waiting to be returned? - * - * @return True if more bins are remaining. - */ - public boolean hasNext() { - while (nextBin <= maxBinNumber) { - if (getBin(nextBin) != null) return true; - nextBin++; - } - return false; - } - - /** - * Gets the next bin in the provided BinList. - * - * @return the next available bin in the BinList. - */ - public GATKBin next() { - if (!hasNext()) - throw new NoSuchElementException("This BinIterator is currently empty"); - GATKBin result = getBin(nextBin); - nextBin++; - return result; - } - - public void remove() { - throw new UnsupportedOperationException("Unable to remove from a bin iterator"); - } - } - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMOverlap.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMOverlap.java deleted file mode 100644 index 15a372ca67..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMOverlap.java +++ /dev/null @@ -1,29 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.samtools.Bin; - -import java.util.HashMap; -import java.util.Map; - -/** - * Models a bin at which all BAM files in the merged input stream overlap. - */ -class BAMOverlap { - public final int start; - public final int stop; - - private final Map bins = new HashMap(); - - public BAMOverlap(final int start, final int stop) { - this.start = start; - this.stop = stop; - } - - public void addBin(final SAMReaderID id, final Bin bin) { - bins.put(id,bin); - } - - public Bin getBin(final SAMReaderID id) { - return bins.get(id); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java index 521bcd5a3d..762722fcd0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java @@ -84,21 +84,21 @@ public class BAMSchedule implements CloseableIterator { /** * Create a new BAM schedule based on the given index. - * @param indexFiles Index files. + * @param dataSource The SAM data source to use. 
* @param intervals List of intervals to schedule over. */ - public BAMSchedule(final Map indexFiles, final List intervals) { + public BAMSchedule(final SAMDataSource dataSource, final List intervals) { if(intervals.isEmpty()) throw new ReviewedStingException("Tried to write schedule for empty interval list."); - referenceSequence = intervals.get(0).getContigIndex(); + referenceSequence = dataSource.getHeader().getSequence(intervals.get(0).getContig()).getSequenceIndex(); createScheduleFile(); - readerIDs.addAll(indexFiles.keySet()); + readerIDs.addAll(dataSource.getReaderIDs()); for(final SAMReaderID reader: readerIDs) { - final GATKBAMIndex index = indexFiles.get(reader); + final GATKBAMIndex index = dataSource.getIndex(reader); final GATKBAMIndexData indexData = index.readReferenceSequence(referenceSequence); int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1); @@ -237,7 +237,10 @@ private void advance() { if(selectedIterators.isEmpty()) return; + // Create the target schedule entry. BAMScheduleEntry mergedScheduleEntry = new BAMScheduleEntry(currentStart,currentStop); + + // For each schedule entry with data, load the data into the merged schedule. for (int reader = selectedIterators.nextSetBit(0); reader >= 0; reader = selectedIterators.nextSetBit(reader+1)) { PeekableIterator scheduleIterator = scheduleIterators.get(reader); BAMScheduleEntry individualScheduleEntry = scheduleIterator.peek(); @@ -248,6 +251,11 @@ private void advance() { scheduleIterator.next(); } + // For each schedule entry without data, add a blank entry. + for (int reader = selectedIterators.nextClearBit(0); reader < readerIDs.size(); reader = selectedIterators.nextClearBit(reader+1)) { + mergedScheduleEntry.addFileSpan(readerIDs.get(reader),new GATKBAMFileSpan()); + } + nextScheduleEntry = mergedScheduleEntry; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java index 47eb55b288..dca4cc7710 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java @@ -27,7 +27,12 @@ import net.sf.picard.util.PeekableIterator; import net.sf.samtools.GATKBAMFileSpan; import net.sf.samtools.GATKChunk; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileSpan; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import java.util.*; @@ -42,21 +47,86 @@ public class BAMScheduler implements Iterator { private FilePointer nextFilePointer = null; - private final GenomeLocSortedSet loci; + private GenomeLocSortedSet loci; + private PeekableIterator locusIterator; + private GenomeLoc currentLocus; + + public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary referenceSequenceDictionary, final GenomeLocParser parser) { + BAMScheduler scheduler = new BAMScheduler(dataSource); + GenomeLocSortedSet intervals = new GenomeLocSortedSet(parser); + for(SAMSequenceRecord sequence: referenceSequenceDictionary.getSequences()) { + // Match only on sequence name; trust startup validation to make sure all the sequences match.
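+ // (A reference contig with no counterpart in the BAMs' sequence dictionary is skipped entirely, so the scheduler never generates intervals for data the BAMs cannot contain.)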
+ if(dataSource.getHeader().getSequenceDictionary().getSequence(sequence.getSequenceName()) != null) + intervals.add(parser.createOverEntireContig(sequence.getSequenceName())); + } + scheduler.populateFilteredIntervalList(intervals); + return scheduler; + } - private final PeekableIterator locusIterator; + public static BAMScheduler createOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) { + BAMScheduler scheduler = new BAMScheduler(dataSource); + scheduler.populateUnfilteredIntervalList(parser); + return scheduler; + } + + public static BAMScheduler createOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { + BAMScheduler scheduler = new BAMScheduler(dataSource); + scheduler.populateFilteredIntervalList(loci); + return scheduler; + } - private GenomeLoc currentLocus; - public BAMScheduler(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { + private BAMScheduler(final SAMDataSource dataSource) { this.dataSource = dataSource; - for(SAMReaderID reader: dataSource.getReaderIDs()) - indexFiles.put(reader,(GATKBAMIndex)dataSource.getIndex(reader)); + for(SAMReaderID reader: dataSource.getReaderIDs()) { + GATKBAMIndex index = dataSource.getIndex(reader); + if(index != null) + indexFiles.put(reader,dataSource.getIndex(reader)); + } + } + + /** + * The consumer has asked for a bounded set of locations. Prepare an iterator over those locations. + * @param loci The list of locations to search and iterate over. + */ + private void populateFilteredIntervalList(final GenomeLocSortedSet loci) { this.loci = loci; - locusIterator = new PeekableIterator(loci.iterator()); - if(locusIterator.hasNext()) - currentLocus = locusIterator.next(); - advance(); + if(!indexFiles.isEmpty()) { + // If index data is available, start up the iterator. + locusIterator = new PeekableIterator(loci.iterator()); + if(locusIterator.hasNext()) + currentLocus = locusIterator.next(); + advance(); + } + else { + // Otherwise, seed the iterator with a single file pointer over the entire region. + nextFilePointer = generatePointerOverEntireFileset(); + for(GenomeLoc locus: loci) + nextFilePointer.addLocation(locus); + locusIterator = new PeekableIterator(Collections.emptyList().iterator()); + } + } + + /** + * The consumer has provided null, meaning to iterate over all available data. Create a file pointer stretching + * from just before the start of the region to the end of the region. + */ + private void populateUnfilteredIntervalList(final GenomeLocParser parser) { + this.loci = new GenomeLocSortedSet(parser); + locusIterator = new PeekableIterator(Collections.emptyList().iterator()); + nextFilePointer = generatePointerOverEntireFileset(); + } + + /** + * Generate a span that runs from the end of the BAM header to the end of the file. + * @return A file pointer over the specified region.
+ */ + private FilePointer generatePointerOverEntireFileset() { + FilePointer filePointer = new FilePointer(); + Map currentPosition = dataSource.getCurrentPosition(); + for(SAMReaderID reader: dataSource.getReaderIDs()) + filePointer.addFileSpans(reader,createSpanToEndOfFile(currentPosition.get(reader).getGATKChunks().get(0).getChunkStart())); + return filePointer; } public boolean hasNext() { @@ -67,7 +137,9 @@ public FilePointer next() { if(!hasNext()) throw new NoSuchElementException("No next element available in interval sharder"); FilePointer currentFilePointer = nextFilePointer; + nextFilePointer = null; advance(); + return currentFilePointer; } @@ -79,13 +151,12 @@ private void advance() { if(loci.isEmpty()) return; - nextFilePointer = null; while(nextFilePointer == null && currentLocus != null) { // special case handling of the unmapped shard. if(currentLocus == GenomeLoc.UNMAPPED) { nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED); for(SAMReaderID id: dataSource.getReaderIDs()) - nextFilePointer.addFileSpans(id,new GATKBAMFileSpan(new GATKChunk(indexFiles.get(id).getStartOfLastLinearBin(),Long.MAX_VALUE))); + nextFilePointer.addFileSpans(id,createSpanToEndOfFile(indexFiles.get(id).getStartOfLastLinearBin())); currentLocus = null; continue; } @@ -96,7 +167,7 @@ private void advance() { int coveredRegionStop = Integer.MAX_VALUE; GenomeLoc coveredRegion = null; - BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(indexFiles,currentLocus); + BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(currentLocus); // No overlapping data at all. if(scheduleEntry != null) { @@ -108,7 +179,6 @@ private void advance() { } else { // Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty. - //System.out.printf("Shard: index file = %s; reference sequence = %d; ",index.getIndexFile(),currentLocus.getContigIndex()); for(SAMReaderID reader: indexFiles.keySet()) nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan()); } @@ -116,21 +186,13 @@ private void advance() { // Early exit if no bins were found. if(coveredRegion == null) { // for debugging only: maximum split is 16384. - if(currentLocus.size() > 16384) { - GenomeLoc[] splitContigs = currentLocus.split(currentLocus.getStart()+16384); - nextFilePointer.addLocation(splitContigs[0]); - currentLocus = splitContigs[1]; - } - else { - nextFilePointer.addLocation(currentLocus); - currentLocus = locusIterator.hasNext() ? locusIterator.next() : null; - } + nextFilePointer.addLocation(currentLocus); + currentLocus = locusIterator.hasNext() ? locusIterator.next() : null; continue; } // Early exit if only part of the first interval was found. if(currentLocus.startsBefore(coveredRegion)) { - // for debugging only: maximum split is 16384. int splitPoint = Math.min(coveredRegion.getStart()-currentLocus.getStart(),16384)+currentLocus.getStart(); GenomeLoc[] splitContigs = currentLocus.split(splitPoint); nextFilePointer.addLocation(splitContigs[0]); @@ -175,25 +237,30 @@ else if(locusIterator.hasNext()) /** * Get the next overlapping tree of bins associated with the given BAM file. - * @param indices BAM indices. * @param currentLocus The actual locus for which to check overlap. * @return The next schedule entry overlapping with the given list of loci. 
*/ - private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map indices, final GenomeLoc currentLocus) { + private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final GenomeLoc currentLocus) { + // Make sure that we consult the BAM header to ensure that we're using the correct contig index for this contig name. + // This will ensure that if the two sets of contigs don't quite match (b36 male vs female ref, hg19 Epstein-Barr), then + // we'll be using the correct contig index for the BAMs. + // TODO: Warning: assumes all BAMs use the same sequence dictionary! Get around this with contig aliasing. + final int currentContigIndex = dataSource.getHeader().getSequence(currentLocus.getContig()).getSequenceIndex(); + // Stale reference sequence or first invocation. (Re)create the binTreeIterator. - if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentLocus.getContigIndex()) { + if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentContigIndex) { if(bamScheduleIterator != null) bamScheduleIterator.close(); - lastReferenceSequenceLoaded = currentLocus.getContigIndex(); + lastReferenceSequenceLoaded = currentContigIndex; // Naive algorithm: find all elements in current contig for proper schedule creation. List lociInContig = new LinkedList(); for(GenomeLoc locus: loci) { - if(locus.getContigIndex() == lastReferenceSequenceLoaded) + if(dataSource.getHeader().getSequence(locus.getContig()).getSequenceIndex() == lastReferenceSequenceLoaded) lociInContig.add(locus); } - bamScheduleIterator = new PeekableIterator(new BAMSchedule(indices,lociInContig)); + bamScheduleIterator = new PeekableIterator(new BAMSchedule(dataSource,lociInContig)); } if(!bamScheduleIterator.hasNext()) @@ -209,4 +276,13 @@ private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map<SAMReaderID,GATKBAMIndex> indices, final GenomeLoc currentLocus) { + + /** + * Create a span running from the given start point to the end of the file. + * @param startOfSpan Where the span should begin. + * @return A file span running from startOfSpan to the end of the file. + */ + private GATKBAMFileSpan createSpanToEndOfFile(final long startOfSpan) { + return new GATKBAMFileSpan(new GATKChunk(startOfSpan,Long.MAX_VALUE)); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java new file mode 100644 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java +package org.broadinstitute.sting.gatk.datasources.reads; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.LinkedList; +import java.util.Queue; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * Coordinates the loading of BGZF blocks, farming load requests out to a pool of block loader threads. + */ +public class BGZFBlockLoadingDispatcher { + /** + * The thread pool in which block loaders run. + */ + private final ExecutorService threadPool; + + /** + * A cache of open file handles, shared with the block loaders. + */ + private final FileHandleCache fileHandleCache; + + /** + * The queue of outstanding block load requests. + */ + private final Queue<SAMReaderPosition> inputQueue; + + public BGZFBlockLoadingDispatcher(final int numThreads, final int numFileHandles) { + threadPool = Executors.newFixedThreadPool(numThreads); + fileHandleCache = new FileHandleCache(numFileHandles); + inputQueue = new LinkedList(); + + threadPool.execute(new BlockLoader(this,fileHandleCache,true)); + } + + /** + * Initiates a request for a new block load. + * @param readerPosition Position at which to load. + */ + void queueBlockLoad(final SAMReaderPosition readerPosition) { + synchronized(inputQueue) { + inputQueue.add(readerPosition); + inputQueue.notify(); + } + } + + /** + * Claims the next work request from the queue, blocking until one becomes available. + * @return The next work request.
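+ * <p> + * A sketch of the consumer loop this method is designed to support (compare BlockLoader.run()): + * <pre> + * for(;;) { + * SAMReaderPosition request = claimNextWorkRequest(); + * // ...load the requested block and push it to request.getInputStream()... + * } + * </pre>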
+ */ + SAMReaderPosition claimNextWorkRequest() { + synchronized(inputQueue) { + while(inputQueue.isEmpty()) { + try { + inputQueue.wait(); + } + catch(InterruptedException ex) { + throw new ReviewedStingException("Interrupt occurred waiting for next block reader work item"); + } + } + return inputQueue.poll(); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java new file mode 100644 index 0000000000..e377f865df --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java @@ -0,0 +1,436 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.GATKBAMFileSpan; +import net.sf.samtools.GATKChunk; +import net.sf.samtools.util.BAMInputStream; +import net.sf.samtools.util.BlockCompressedFilePointerUtil; +import net.sf.samtools.util.BlockCompressedInputStream; +import net.sf.samtools.util.RuntimeEOFException; +import net.sf.samtools.util.SeekableStream; +import org.broad.tribble.util.BlockCompressedStreamConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Arrays; +import java.util.LinkedList; + +/** + * Presents decompressed blocks to the SAMFileReader. + */ +public class BlockInputStream extends SeekableStream implements BAMInputStream { + /** + * Mechanism for triggering block loads. + */ + private final BGZFBlockLoadingDispatcher dispatcher; + + /** + * The reader whose data is supplied by this input stream. + */ + private final SAMReaderID reader; + + /** + * Length of the input stream. + */ + private final long length; + + /** + * The latest error reported by an asynchronous block load. + */ + private Throwable error; + + /** + * Current position. + */ + private SAMReaderPosition position; + + /** + * A stream of compressed data blocks. + */ + private final ByteBuffer buffer; + + /** + * Offsets of the given blocks in the buffer. + */ + private LinkedList blockOffsets = new LinkedList(); + + /** + * Source positions of the given blocks in the buffer. + */ + private LinkedList blockPositions = new LinkedList(); + + /** + * Provides a lock to wait for more data to arrive. 
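+ * Both sides of the producer/consumer handshake use this lock: the reading thread waits on it in waitForBufferFill() for data to arrive, while the block loader thread waits on it in copyIntoBuffer() for buffer space to free up.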
+ */ + private final Object lock = new Object(); + + /** + * An input stream to use when comparing data back to what it should look like. + */ + private final BlockCompressedInputStream validatingInputStream; + + /** + * Has the buffer been filled since last request? + */ + private boolean bufferFilled = false; + + /** + * Create a new block presenting input stream with a dedicated buffer. + * @param dispatcher the block loading messenger. + * @param reader the reader for which to load data. + * @param validate validates the contents read into the buffer against the contents of a Picard BlockCompressedInputStream. + */ + BlockInputStream(final BGZFBlockLoadingDispatcher dispatcher, final SAMReaderID reader, final boolean validate) { + this.reader = reader; + this.length = reader.samFile.length(); + + buffer = ByteBuffer.wrap(new byte[64*1024]); + buffer.order(ByteOrder.LITTLE_ENDIAN); + + // The state of the buffer assumes that the range of data written into the buffer appears in the range + // [position,limit), while extra capacity exists in the range [limit,capacity) + buffer.limit(0); + + this.dispatcher = dispatcher; + // TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream. + this.position = new SAMReaderPosition(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE))); + + try { + if(validate) { + System.out.printf("BlockInputStream %s: BGZF block validation mode activated%n",this); + validatingInputStream = new BlockCompressedInputStream(reader.samFile); + // A bug in ValidatingInputStream means that calling getFilePointer() immediately after initialization will result in an NPE. + // Poke the stream to start reading data. + validatingInputStream.available(); + } + else + validatingInputStream = null; + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to validate against Picard input stream",ex); + } + } + + public long length() { + return length; + } + + public long getFilePointer() { + long filePointer; + synchronized(lock) { + if(buffer.remaining() > 0) { + // If there's data in the buffer, figure out from whence it came. + final long blockAddress = blockPositions.size() > 0 ? blockPositions.get(0) : 0; + final int blockOffset = buffer.position(); + filePointer = blockAddress << 16 | blockOffset; + } + else { + // Otherwise, find the next position to load. + filePointer = position.getBlockAddress() << 16; + } + } + + if(validatingInputStream != null && filePointer != validatingInputStream.getFilePointer()) + throw new ReviewedStingException(String.format("Position of input stream is invalid; expected (block address, block offset) = (%d,%d), got (%d,%d)", + BlockCompressedFilePointerUtil.getBlockAddress(filePointer),BlockCompressedFilePointerUtil.getBlockOffset(filePointer), + BlockCompressedFilePointerUtil.getBlockAddress(validatingInputStream.getFilePointer()),BlockCompressedFilePointerUtil.getBlockOffset(validatingInputStream.getFilePointer()))); + + return filePointer; + } + + public void seek(long target) { + // TODO: Validate the seek point. 
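+ // A BGZF virtual file pointer packs the compressed block's starting byte offset in the file into the upper 48 bits and the offset within the decompressed block into the lower 16 bits (pointer == blockAddress << 16 | blockOffset), which is why the two halves of the seek target are unpacked separately below.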
+ //System.out.printf("Thread %s, BlockInputStream %s: seeking to block %d, offset %d%n",Thread.currentThread().getId(),this,BlockCompressedFilePointerUtil.getBlockAddress(target),BlockCompressedFilePointerUtil.getBlockOffset(target)); + synchronized(lock) { + clearBuffers(); + position.advancePosition(BlockCompressedFilePointerUtil.getBlockAddress(target)); + waitForBufferFill(); + buffer.position(BlockCompressedFilePointerUtil.getBlockOffset(target)); + + if(validatingInputStream != null) { + try { + validatingInputStream.seek(target); + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to validate against Picard input stream",ex); + } + } + } + } + + private void clearBuffers() { + this.position.reset(); + + // Buffer semantics say that outside of a lock, buffer should always be prepared for reading. + // Indicate no data to be read. + buffer.clear(); + buffer.limit(0); + + blockOffsets.clear(); + blockPositions.clear(); + } + + public boolean eof() { + synchronized(lock) { + // TODO: Handle multiple empty BGZF blocks at end of the file. + return position != null && position.getBlockAddress() >= length; + } + } + + public void setCheckCrcs(final boolean check) { + // TODO: Implement + } + + /** + * Submits a new access plan for the given dataset. + * @param position The next seek point for BAM data in this reader. + */ + public void submitAccessPlan(final SAMReaderPosition position) { + //System.out.printf("Thread %s: submitting access plan for block at position: %d%n",Thread.currentThread().getId(),position.getBlockAddress()); + synchronized(lock) { + // Assume that the access plan is going to tell us to start where we are and move forward. + // If this isn't the case, we'll soon receive a seek request and the buffer will be forced to reset. + if(this.position != null && position.getBlockAddress() < this.position.getBlockAddress()) + position.advancePosition(this.position.getBlockAddress()); + } + this.position = position; + } + + private void compactBuffer() { + // Compact buffer to maximize storage space. + int bytesToRemove = 0; + + // Look ahead to see if we can compact away the first block in the series. + while(blockOffsets.size() > 1 && buffer.position() < blockOffsets.get(1)) { + bytesToRemove += blockOffsets.remove(); + blockPositions.remove(); + } + + // If we end up with an empty block at the end of the series, compact this as well. + if(buffer.remaining() == 0 && !blockOffsets.isEmpty() && buffer.position() >= blockOffsets.peek()) { + bytesToRemove += buffer.position(); + blockOffsets.remove(); + blockPositions.remove(); + } + + int finalBufferStart = buffer.position() - bytesToRemove; + int finalBufferSize = buffer.remaining(); + + buffer.position(bytesToRemove); + buffer.compact(); + + buffer.position(finalBufferStart); + buffer.limit(finalBufferStart+finalBufferSize); + } + + /** + * Push contents of incomingBuffer into the end of this buffer. + * MUST be called from a thread that is NOT the reader thread. + * @param incomingBuffer The data being pushed into this input stream. + * @param position target position for the data. + */ + public void copyIntoBuffer(final ByteBuffer incomingBuffer, final SAMReaderPosition position, final long filePosition) { + synchronized(lock) { + try { + compactBuffer(); + // Open up the buffer for more reading. + buffer.limit(buffer.capacity()); + + // Advance the position to take the most recent read into account. 
+ long lastReadPosition = position.getBlockAddress(); + + byte[] validBytes = null; + if(validatingInputStream != null) { + validBytes = new byte[incomingBuffer.remaining()]; + + byte[] currentBytes = new byte[incomingBuffer.remaining()]; + int pos = incomingBuffer.position(); + int lim = incomingBuffer.limit(); + incomingBuffer.get(currentBytes); + + incomingBuffer.limit(lim); + incomingBuffer.position(pos); + + long currentFilePointer = validatingInputStream.getFilePointer(); + validatingInputStream.seek(lastReadPosition << 16); + validatingInputStream.read(validBytes); + validatingInputStream.seek(currentFilePointer); + + if(!Arrays.equals(validBytes,currentBytes)) + throw new ReviewedStingException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this)); + } + + this.position = position; + position.advancePosition(filePosition); + + if(buffer.remaining() < incomingBuffer.remaining()) { + //System.out.printf("Thread %s: waiting for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n",Thread.currentThread().getId(),buffer.remaining(),incomingBuffer.remaining()); + lock.wait(); + //System.out.printf("Thread %s: waited for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n", Thread.currentThread().getId(), buffer.remaining(), incomingBuffer.remaining()); + } + + // Queue list of block offsets / block positions. + blockOffsets.add(buffer.position()); + blockPositions.add(lastReadPosition); + + buffer.put(incomingBuffer); + + // Set up the buffer for reading. + buffer.flip(); + bufferFilled = true; + + lock.notify(); + } + catch(Exception ex) { + reportException(ex); + lock.notify(); + } + } + } + + void reportException(Throwable t) { + synchronized(lock) { + this.error = t; + lock.notify(); + } + } + + private void checkForErrors() { + synchronized(lock) { + if(error != null) { + ReviewedStingException toThrow = new ReviewedStingException(String.format("Thread %s, BlockInputStream %s: Unable to retrieve BAM data from disk",Thread.currentThread().getId(),this),error); + toThrow.setStackTrace(error.getStackTrace()); + throw toThrow; + } + } + } + + /** + * Reads the next byte of data from the input stream. + * @return Next byte of data, from 0->255, as an int, or -1 if no byte could be read. + */ + @Override + public int read() { + byte[] singleByte = new byte[1]; + final int bytesRead = read(singleByte); + // Mask out sign extension; a read of zero bytes means no more data was available. + return bytesRead > 0 ? (singleByte[0] & 0xFF) : -1; + } + + /** + * Fills the given byte array to the extent possible. + * @param bytes byte array to be filled. + * @return The number of bytes actually read. + */ + @Override + public int read(byte[] bytes) { + return read(bytes,0,bytes.length); + } + + @Override + public int read(byte[] bytes, final int offset, final int length) { + int remaining = length; + synchronized(lock) { + while(remaining > 0) { + // Check for error conditions during last read. + checkForErrors(); + + // If completely out of space, queue up another buffer fill. + waitForBufferFill(); + + // Couldn't manage to load any data at all; abort and return what's available.
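+ // (An empty buffer after waiting generally means the end of the file has been reached; asynchronous errors are surfaced by checkForErrors() on the next read.)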
+ if(buffer.remaining() == 0) + break; + + int numBytesToCopy = Math.min(buffer.remaining(),remaining); + buffer.get(bytes,length-remaining+offset,numBytesToCopy); + remaining -= numBytesToCopy; + + //if(remaining > 0) + // System.out.printf("Thread %s: read the first %d bytes of a %d byte request%n",Thread.currentThread().getId(),length-remaining,length); + // TODO: Assert that we don't copy across a block boundary + } + + // Notify any waiting threads that some of the contents of the buffer were removed. + if(length-remaining > 0) + lock.notify(); + } + + if(validatingInputStream != null) { + byte[] validBytes = new byte[length]; + try { + validatingInputStream.read(validBytes,offset,length); + for(int i = offset; i < offset+length; i++) { + if(bytes[i] != validBytes[i]) { + System.out.printf("Thread %s: preparing to throw an exception because contents don't match%n",Thread.currentThread().getId()); + throw new ReviewedStingException(String.format("Thread %s: blockInputStream %s attempting to return wrong set of bytes; mismatch at offset %d",Thread.currentThread().getId(),this,i)); + } + } + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to validate against Picard input stream",ex); + } + } + + return length - remaining; + } + + public void close() { + if(validatingInputStream != null) { + try { + validatingInputStream.close(); + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to validate against Picard input stream",ex); + } + } + } + + public String getSource() { + return reader.getSamFilePath(); + } + + private void waitForBufferFill() { + synchronized(lock) { + bufferFilled = false; + if(buffer.remaining() == 0 && !eof()) { + //System.out.printf("Thread %s is waiting for a buffer fill from position %d to buffer %s%n",Thread.currentThread().getId(),position.getBlockAddress(),this); + dispatcher.queueBlockLoad(position); + try { + lock.wait(); + } + catch(InterruptedException ex) { + // TODO: handle me. + throw new ReviewedStingException("Interrupt occurred waiting for buffer to fill",ex); + } + + if(bufferFilled && buffer.remaining() == 0) + throw new RuntimeEOFException("No more data left in InputStream"); + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java new file mode 100644 index 0000000000..ab42998026 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import org.broad.tribble.util.BlockCompressedStreamConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.FileChannel; +import java.util.zip.DataFormatException; +import java.util.zip.Inflater; + +/** + * An engine for loading blocks. + */ +class BlockLoader implements Runnable { + /** + * Coordinates the input queue. + */ + private BGZFBlockLoadingDispatcher dispatcher; + + /** + * A cache from which to retrieve open file handles. + */ + private final FileHandleCache fileHandleCache; + + /** + * Whether asynchronous decompression should happen. + */ + private final boolean decompress; + + /** + * A direct input buffer for incoming data from disk. + */ + private final ByteBuffer inputBuffer; + + public BlockLoader(final BGZFBlockLoadingDispatcher dispatcher, final FileHandleCache fileHandleCache, final boolean decompress) { + this.dispatcher = dispatcher; + this.fileHandleCache = fileHandleCache; + this.decompress = decompress; + + this.inputBuffer = ByteBuffer.allocateDirect(64*1024 + BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length); + inputBuffer.order(ByteOrder.LITTLE_ENDIAN); + } + + public void run() { + for(;;) { + SAMReaderPosition readerPosition = null; + try { + readerPosition = dispatcher.claimNextWorkRequest(); + FileInputStream inputStream = fileHandleCache.claimFileInputStream(readerPosition.getReader()); + + long blockAddress = readerPosition.getBlockAddress(); + //System.out.printf("Thread %s: BlockLoader: copying bytes from %s at position %d into %s%n",Thread.currentThread().getId(),inputStream,blockAddress,readerPosition.getInputStream()); + + ByteBuffer compressedBlock = readBGZFBlock(inputStream,readerPosition.getBlockAddress()); + long nextBlockAddress = position(inputStream); + fileHandleCache.releaseFileInputStream(readerPosition.getReader(),inputStream); + + ByteBuffer block = decompress ?
decompressBGZFBlock(compressedBlock) : compressedBlock; + int bytesCopied = block.remaining(); + + BlockInputStream bamInputStream = readerPosition.getInputStream(); + bamInputStream.copyIntoBuffer(block,readerPosition,nextBlockAddress); + + //System.out.printf("Thread %s: BlockLoader: copied %d bytes from %s at position %d into %s%n",Thread.currentThread().getId(),bytesCopied,inputStream,blockAddress,readerPosition.getInputStream()); + } + catch(Throwable error) { + if(readerPosition != null && readerPosition.getInputStream() != null) + readerPosition.getInputStream().reportException(error); + } + } + + } + + private ByteBuffer readBGZFBlock(final FileInputStream inputStream, final long blockAddress) throws IOException { + FileChannel channel = inputStream.getChannel(); + + // Read the block header + channel.position(blockAddress); + + int uncompressedDataSize = 0; + int bufferSize = 0; + + do { + inputBuffer.clear(); + inputBuffer.limit(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + channel.read(inputBuffer); + + // Read out the size of the full BGZF block as a two-byte short, then mask it into an int to recover the unsigned value. + inputBuffer.flip(); + if(inputBuffer.remaining() != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) + throw new ReviewedStingException("BUG: unable to read the complete block header in one pass."); + + // Verify that the file was read at a valid point. + if(unpackUByte8(inputBuffer,0) != BlockCompressedStreamConstants.GZIP_ID1 || + unpackUByte8(inputBuffer,1) != BlockCompressedStreamConstants.GZIP_ID2 || + unpackUByte8(inputBuffer,3) != BlockCompressedStreamConstants.GZIP_FLG || + unpackUInt16(inputBuffer,10) != BlockCompressedStreamConstants.GZIP_XLEN || + unpackUByte8(inputBuffer,12) != BlockCompressedStreamConstants.BGZF_ID1 || + unpackUByte8(inputBuffer,13) != BlockCompressedStreamConstants.BGZF_ID2) { + throw new ReviewedStingException("BUG: Started reading compressed block at incorrect position"); + } + + inputBuffer.position(BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET); + bufferSize = unpackUInt16(inputBuffer,BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET)+1; + + // Adjust buffer limits and finish reading the block. Also read the next header, just in case there's a 0-byte block. + inputBuffer.limit(bufferSize); + inputBuffer.position(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + channel.read(inputBuffer); + + // Check the uncompressed length. If 0 and not at EOF, we'll want to check the next block. + uncompressedDataSize = inputBuffer.getInt(inputBuffer.limit()-4); + //System.out.printf("Uncompressed block size of the current block (at position %d) is %d%n",channel.position()-inputBuffer.limit(),uncompressedDataSize); + } + while(uncompressedDataSize == 0 && channel.position() < channel.size()); + + // Prepare the buffer for reading. + inputBuffer.flip(); + + return inputBuffer; + } + + private ByteBuffer decompressBGZFBlock(final ByteBuffer bgzfBlock) throws DataFormatException { + final int compressedBufferSize = bgzfBlock.remaining(); + + // Determine the uncompressed buffer size (stored in the last four bytes of the block). + bgzfBlock.position(bgzfBlock.limit()-4); + int uncompressedBufferSize = bgzfBlock.getInt(); + byte[] uncompressedContent = new byte[uncompressedBufferSize]; + + // Bound the CDATA section of the buffer.
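+ // A BGZF block consists of an 18-byte gzip header, the deflate-compressed CDATA payload, and an 8-byte footer holding the CRC32 and the uncompressed size, so the CDATA section spans [BLOCK_HEADER_LENGTH, blockSize - BLOCK_FOOTER_LENGTH).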
+ bgzfBlock.limit(compressedBufferSize-BlockCompressedStreamConstants.BLOCK_FOOTER_LENGTH); + bgzfBlock.position(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + byte[] compressedContent = new byte[bgzfBlock.remaining()]; + ByteBuffer.wrap(compressedContent).put(bgzfBlock); + + // Decompress the buffer. + final Inflater inflater = new Inflater(true); + inflater.setInput(compressedContent); + int bytesUncompressed = inflater.inflate(uncompressedContent); + if(bytesUncompressed != uncompressedBufferSize) + throw new ReviewedStingException("Error decompressing block"); + + return ByteBuffer.wrap(uncompressedContent); + } + + private long position(final FileInputStream inputStream) throws IOException { + return inputStream.getChannel().position(); + } + + private int unpackUByte8(final ByteBuffer buffer,final int position) { + return buffer.get(position) & 0xFF; + } + + private int unpackUInt16(final ByteBuffer buffer,final int position) { + // Read out the size of the full BGZF block as a two-byte short, then mask it into an int to recover the unsigned value. + return buffer.getShort(position) & 0xFFFF; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java new file mode 100644 index 0000000000..29de6eb370 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.StingException; + +import java.io.FileInputStream; +import java.io.IOException; +import java.util.Collection; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Queue; + +/** + * Caches frequently used file handles. Right now, caches only a single file handle. + * TODO: Generalize to support arbitrary file handle caches. + */ +public class FileHandleCache { + /** + * The underlying data structure storing file handles. + */ + private final FileHandleStorage fileHandleStorage; + + /** + * How many file handles should be kept open at once.
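+ * Claim requests beyond this limit block inside claimFileInputStream() until another handle is released.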
+ */ + private final int cacheSize; + + /** + * A uniquifier: assign a unique ID to every instance of a file handle. + */ + private final Map keyCounter = new HashMap(); + + /** + * A shared lock, private so that outside users cannot notify it. + */ + private final Object lock = new Object(); + + /** + * Indicates how many file handles are outstanding at this point. + */ + private int numOutstandingFileHandles = 0; + + /** + * Create a new file handle cache of the given cache size. + * @param cacheSize how many readers to hold open at once. + */ + public FileHandleCache(final int cacheSize) { + this.cacheSize = cacheSize; + fileHandleStorage = new FileHandleStorage(); + } + + /** + * Retrieves or opens a file handle for the given reader ID. + * @param key The reader for which to claim a stream. + * @return A file input stream from the cache, if available, or otherwise newly opened. + */ + public FileInputStream claimFileInputStream(final SAMReaderID key) { + synchronized(lock) { + FileInputStream inputStream = findExistingEntry(key); + if(inputStream == null) { + try { + // If the cache is maxed out, wait for another file handle to emerge. + if(numOutstandingFileHandles >= cacheSize) + lock.wait(); + } + catch(InterruptedException ex) { + throw new ReviewedStingException("Interrupted while waiting for a file handle"); + } + inputStream = openInputStream(key); + } + numOutstandingFileHandles++; + + //System.out.printf("Handing input stream %s to thread %s%n",inputStream,Thread.currentThread().getId()); + return inputStream; + } + } + + /** + * Releases the current reader and returns it to the cache. + * @param key The reader. + * @param inputStream The stream being used. + */ + public void releaseFileInputStream(final SAMReaderID key, final FileInputStream inputStream) { + synchronized(lock) { + numOutstandingFileHandles--; + UniqueKey newID = allocateKey(key); + fileHandleStorage.put(newID,inputStream); + // Let any listeners know that another file handle has become available. + lock.notify(); + } + } + + /** + * Finds an existing entry in the storage mechanism. + * @param key Reader. + * @return A cached stream, if available. Otherwise, null. + */ + private FileInputStream findExistingEntry(final SAMReaderID key) { + int existingHandles = getMostRecentUniquifier(key); + + // See if any of the keys currently exist in the repository. + for(int i = 0; i <= existingHandles; i++) { + UniqueKey uniqueKey = new UniqueKey(key,i); + if(fileHandleStorage.containsKey(uniqueKey)) + return fileHandleStorage.remove(uniqueKey); + } + + return null; + } + + /** + * Gets the most recent uniquifier used for the given reader. + * @param reader Reader for which to determine uniqueness. + * @return The most recent uniquifier assigned to the reader, or -1 if none exists. + */ + private int getMostRecentUniquifier(final SAMReaderID reader) { + if(keyCounter.containsKey(reader)) + return keyCounter.get(reader); + else return -1; + } + + private UniqueKey allocateKey(final SAMReaderID reader) { + int uniquifier = getMostRecentUniquifier(reader)+1; + keyCounter.put(reader,uniquifier); + return new UniqueKey(reader,uniquifier); + } + + private FileInputStream openInputStream(final SAMReaderID reader) { + try { + return new FileInputStream(reader.getSamFilePath()); + } + catch(IOException ex) { + throw new StingException("Unable to open input file"); + } + } + + private void closeInputStream(final FileInputStream inputStream) { + try { + inputStream.close(); + } + catch(IOException ex) { + throw new StingException("Unable to close input file"); + } + } + + /** + * Actually contains the file handles, purging them as they get too old. + */ + private class FileHandleStorage extends LinkedHashMap { + /** + * Remove the oldest entry. + * @param entry Entry to consider removing. + * @return True if the cache size has been exceeded. False otherwise. + */ + @Override + protected boolean removeEldestEntry(Map.Entry entry) { + synchronized (lock) { + if(size() > cacheSize) { + keyCounter.put(entry.getKey().key,keyCounter.get(entry.getKey().key)-1); + closeInputStream(entry.getValue()); + + return true; + } + } + return false; + } + } + + /** + * Uniquifies a key by adding a numerical uniquifier. + */ + private class UniqueKey { + /** + * The file handle's key. + */ + private final SAMReaderID key; + + /** + * A uniquifier, so that multiple of the same reader can exist in the cache. + */ + private final int uniqueID; + + public UniqueKey(final SAMReaderID reader, final int uniqueID) { + this.key = reader; + this.uniqueID = uniqueID; + } + + @Override + public boolean equals(Object other) { + if(!(other instanceof UniqueKey)) + return false; + UniqueKey otherUniqueKey = (UniqueKey)other; + return key.equals(otherUniqueKey.key) && this.uniqueID == otherUniqueKey.uniqueID; + } + + @Override + public int hashCode() { + return key.hashCode(); + } + } + + + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java index e4141f61c9..df7827250e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java @@ -29,6 +29,7 @@ import net.sf.samtools.SAMFileSpan; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; @@ -40,28 +41,25 @@ */ public class FilePointer { protected final SortedMap fileSpans = new TreeMap(); - protected final BAMOverlap overlap; - protected final List locations; + protected final List locations = new ArrayList(); /** * Does this file pointer point into an unmapped region?
*/ protected final boolean isRegionUnmapped; - public FilePointer() { - this((BAMOverlap)null); - } - - public FilePointer(final GenomeLoc location) { - this.overlap = null; - this.locations = Collections.singletonList(location); - this.isRegionUnmapped = GenomeLoc.isUnmapped(location); - } - - public FilePointer(final BAMOverlap overlap) { - this.overlap = overlap; - this.locations = new ArrayList(); - this.isRegionUnmapped = false; + public FilePointer(final GenomeLoc... locations) { + this.locations.addAll(Arrays.asList(locations)); + boolean foundMapped = false, foundUnmapped = false; + for(GenomeLoc location: locations) { + if(GenomeLoc.isUnmapped(location)) + foundUnmapped = true; + else + foundMapped = true; + } + if(foundMapped && foundUnmapped) + throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped."); + this.isRegionUnmapped = foundUnmapped; } /** @@ -217,4 +215,20 @@ private void mergeElementsInto(final FilePointer combined, Iterator entry: fileSpans.entrySet()) { + builder.append(entry.getKey()); + builder.append("= {"); + builder.append(entry.getValue()); + builder.append("}"); + } + return builder.toString(); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java index 4ddf28dced..f78693c27f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java @@ -25,419 +25,58 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.util.PeekableIterator; -import net.sf.samtools.AbstractBAMFileIndex; -import net.sf.samtools.Bin; -import net.sf.samtools.BrowseableBAMIndex; -import net.sf.samtools.SAMSequenceRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.GenomeLoc; +import net.sf.samtools.SAMSequenceDictionary; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.util.*; +import java.util.Iterator; /** - * Shard intervals based on position within the BAM file. - * - * @author mhanna - * @version 0.1 + * Handles the process of aggregating BAM intervals into individual shards. + * TODO: The task performed by IntervalSharder is now better performed by LocusShardBalancer. Merge BAMScheduler and IntervalSharder. */ -public class IntervalSharder { - private static Logger logger = Logger.getLogger(IntervalSharder.class); - - public static Iterator shardIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { - return new IntervalSharder.FilePointerIterator(dataSource,loci); - } - +public class IntervalSharder implements Iterator { /** - * A lazy-loading iterator over file pointers. + * The iterator actually laying out the data for BAM scheduling. 
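// The varargs FilePointer constructor above enforces a homogeneity invariant: a
// pointer addresses mapped regions or the unmapped region, never both, since the
// sharding code downstream refuses to merge pointers of differing kinds. A compact
// sketch of the check in isolation, with each boolean standing in for
// GenomeLoc.isUnmapped(location):

static boolean checkHomogeneous(final boolean... unmappedFlags) {
    boolean foundMapped = false, foundUnmapped = false;
    for (final boolean isUnmapped : unmappedFlags) {
        if (isUnmapped) foundUnmapped = true;
        else            foundMapped = true;
    }
    if (foundMapped && foundUnmapped)
        throw new IllegalStateException("BUG: File pointers cannot be mixed mapped/unmapped.");
    return foundUnmapped;
}

// checkHomogeneous(false, false) -> false: a fully mapped pointer
// checkHomogeneous(true)         -> true:  a pointer into the unmapped region
// checkHomogeneous(false, true)  -> throws: mixed pointers are rejected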
*/ - private static class FilePointerIterator implements Iterator { - final SAMDataSource dataSource; - final GenomeLocSortedSet loci; - final PeekableIterator locusIterator; - final Queue cachedFilePointers = new LinkedList(); - - public FilePointerIterator(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { - this.dataSource = dataSource; - this.loci = loci; - locusIterator = new PeekableIterator(loci.iterator()); - advance(); - } - - public boolean hasNext() { - return !cachedFilePointers.isEmpty(); - } - - public FilePointer next() { - if(!hasNext()) - throw new NoSuchElementException("FilePointerIterator iteration is complete"); - FilePointer filePointer = cachedFilePointers.remove(); - if(cachedFilePointers.isEmpty()) - advance(); - return filePointer; - } - - public void remove() { - throw new UnsupportedOperationException("Cannot remove from a FilePointerIterator"); - } - - private void advance() { - GenomeLocSortedSet nextBatch = new GenomeLocSortedSet(loci.getGenomeLocParser()); - String contig = null; - - // If the next section of the BAM to be processed is unmapped, handle this region separately. - while(locusIterator.hasNext() && nextBatch.isEmpty()) { - contig = null; - while(locusIterator.hasNext() && (contig == null || (!GenomeLoc.isUnmapped(locusIterator.peek()) && locusIterator.peek().getContig().equals(contig)))) { - GenomeLoc nextLocus = locusIterator.next(); - contig = nextLocus.getContig(); - nextBatch.add(nextLocus); - } - } - - if(nextBatch.size() > 0) { - cachedFilePointers.addAll(shardIntervalsOnContig(dataSource,contig,nextBatch)); - } - } - } + private final PeekableIterator wrappedIterator; /** - * Merge / split intervals based on an awareness of the structure of the BAM file. - * @param dataSource - * @param contig Contig against which to align the intervals. If null, create a file pointer across unmapped reads. - * @param loci - * @return + * The parser, for interval manipulation. */ - private static List shardIntervalsOnContig(final SAMDataSource dataSource, final String contig, final GenomeLocSortedSet loci) { - // If the contig is null, eliminate the chopping process and build out a file pointer consisting of the unmapped region of all BAMs. - if(contig == null) { - FilePointer filePointer = new FilePointer(GenomeLoc.UNMAPPED); - for(SAMReaderID id: dataSource.getReaderIDs()) - filePointer.addFileSpans(id,null); - return Collections.singletonList(filePointer); - } - - // Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin. - List filePointers = new ArrayList(); - FilePointer lastFilePointer = null; - BAMOverlap lastBAMOverlap = null; - - Map readerToIndexMap = new HashMap(); - IntervalSharder.BinMergingIterator binMerger = new IntervalSharder.BinMergingIterator(); - for(SAMReaderID id: dataSource.getReaderIDs()) { - final SAMSequenceRecord referenceSequence = dataSource.getHeader(id).getSequence(contig); - // If this contig can't be found in the reference, skip over it. - if(referenceSequence == null && contig != null) - continue; - final BrowseableBAMIndex index = (BrowseableBAMIndex)dataSource.getIndex(id); - binMerger.addReader(id, - index, - referenceSequence.getSequenceIndex(), - index.getBinsOverlapping(referenceSequence.getSequenceIndex(),1,referenceSequence.getSequenceLength()).iterator()); - // Cache the reader for later data lookup. 
- readerToIndexMap.put(id,index); - } - - PeekableIterator binIterator = new PeekableIterator(binMerger); - - for(GenomeLoc location: loci) { - if(!location.getContig().equals(contig)) - throw new ReviewedStingException("Location outside bounds of contig"); - - if(!binIterator.hasNext()) - break; - - int locationStart = location.getStart(); - final int locationStop = location.getStop(); - - // Advance to first bin. - while(binIterator.peek().stop < locationStart) - binIterator.next(); + private final GenomeLocParser parser; - // Add all relevant bins to a list. If the given bin extends beyond the end of the current interval, make - // sure the extending bin is not pruned from the list. - List bamOverlaps = new ArrayList(); - while(binIterator.hasNext() && binIterator.peek().stop <= locationStop) - bamOverlaps.add(binIterator.next()); - if(binIterator.hasNext() && binIterator.peek().start <= locationStop) - bamOverlaps.add(binIterator.peek()); - - // Bins found; try to match bins with locations. - Iterator bamOverlapIterator = bamOverlaps.iterator(); - - while(locationStop >= locationStart) { - int binStart = lastFilePointer!=null ? lastFilePointer.overlap.start : 0; - int binStop = lastFilePointer!=null ? lastFilePointer.overlap.stop : 0; - - while(binStop < locationStart && bamOverlapIterator.hasNext()) { - if(lastFilePointer != null && lastFilePointer.locations.size() > 0) - filePointers.add(lastFilePointer); - - lastBAMOverlap = bamOverlapIterator.next(); - lastFilePointer = new FilePointer(lastBAMOverlap); - binStart = lastFilePointer.overlap.start; - binStop = lastFilePointer.overlap.stop; - } - - if(locationStart < binStart) { - // The region starts before the first bin in the sequence. Add the region occurring before the sequence. - if(lastFilePointer != null && lastFilePointer.locations.size() > 0) { - filePointers.add(lastFilePointer); - lastFilePointer = null; - lastBAMOverlap = null; - } - - final int regionStop = Math.min(locationStop,binStart-1); - - GenomeLoc subset = loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,regionStop); - lastFilePointer = new FilePointer(subset); - - locationStart = regionStop + 1; - } - else if(locationStart > binStop) { - // The region starts after the last bin in the sequence. Add the region occurring after the sequence. - if(lastFilePointer != null && lastFilePointer.locations.size() > 0) { - filePointers.add(lastFilePointer); - lastFilePointer = null; - lastBAMOverlap = null; - } - - GenomeLoc subset = loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,locationStop); - filePointers.add(new FilePointer(subset)); - - locationStart = locationStop + 1; - } - else { - if(lastFilePointer == null) - throw new ReviewedStingException("Illegal state: initializer failed to create cached file pointer."); - - // The start of the region overlaps the bin. Add the overlapping subset. - final int regionStop = Math.min(locationStop,binStop); - lastFilePointer.addLocation(loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,regionStop)); - locationStart = regionStop + 1; - } - } - } - - if(lastFilePointer != null && lastFilePointer.locations.size() > 0) - filePointers.add(lastFilePointer); - - // Lookup the locations for every file pointer in the index. 
- for(SAMReaderID id: readerToIndexMap.keySet()) { - BrowseableBAMIndex index = readerToIndexMap.get(id); - for(FilePointer filePointer: filePointers) - filePointer.addFileSpans(id,index.getSpanOverlapping(filePointer.overlap.getBin(id))); - } - - return filePointers; + public static IntervalSharder shardOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) { + return new IntervalSharder(BAMScheduler.createOverAllReads(dataSource,parser),parser); } - private static class BinMergingIterator implements Iterator { - private PriorityQueue binQueue = new PriorityQueue(); - private Queue pendingOverlaps = new LinkedList(); - - public void addReader(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, Iterator bins) { - binQueue.add(new BinQueueState(id,index,referenceSequence,new IntervalSharder.LowestLevelBinFilteringIterator(index,bins))); - } - - public boolean hasNext() { - return pendingOverlaps.size() > 0 || !binQueue.isEmpty(); - } - - public BAMOverlap next() { - if(!hasNext()) - throw new NoSuchElementException("No elements left in merging iterator"); - if(pendingOverlaps.isEmpty()) - advance(); - return pendingOverlaps.remove(); - } - - public void advance() { - List bins = new ArrayList(); - int boundsStart, boundsStop; - - // Prime the pump - if(binQueue.isEmpty()) - return; - bins.add(getNextBin()); - boundsStart = bins.get(0).getStart(); - boundsStop = bins.get(0).getStop(); - - // Accumulate all the bins that overlap the current bin, in sorted order. - while(!binQueue.isEmpty() && peekNextBin().getStart() <= boundsStop) { - ReaderBin bin = getNextBin(); - bins.add(bin); - boundsStart = Math.min(boundsStart,bin.getStart()); - boundsStop = Math.max(boundsStop,bin.getStop()); - } - - List> range = new ArrayList>(); - int start = bins.get(0).getStart(); - int stop = bins.get(0).getStop(); - while(start <= boundsStop) { - // Find the next stopping point. - for(ReaderBin bin: bins) { - stop = Math.min(stop,bin.getStop()); - if(start < bin.getStart()) - stop = Math.min(stop,bin.getStart()-1); - } - - range.add(new Pair(start,stop)); - // If the last entry added included the last element, stop. - if(stop >= boundsStop) - break; - - // Find the next start. - start = stop + 1; - for(ReaderBin bin: bins) { - if(start >= bin.getStart() && start <= bin.getStop()) - break; - else if(start < bin.getStart()) { - start = bin.getStart(); - break; - } - } - } - - // Add the next series of BAM overlaps to the window. 
- for(Pair window: range) { - BAMOverlap bamOverlap = new BAMOverlap(window.first,window.second); - for(ReaderBin bin: bins) - bamOverlap.addBin(bin.id,bin.bin); - pendingOverlaps.add(bamOverlap); - } - } - - public void remove() { throw new UnsupportedOperationException("Cannot remove from a merging iterator."); } - - private ReaderBin peekNextBin() { - if(binQueue.isEmpty()) - throw new NoSuchElementException("No more bins are available"); - BinQueueState current = binQueue.peek(); - return new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.peekNextBin()); - } - - private ReaderBin getNextBin() { - if(binQueue.isEmpty()) - throw new NoSuchElementException("No more bins are available"); - BinQueueState current = binQueue.remove(); - ReaderBin readerBin = new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.nextBin()); - if(current.hasNextBin()) - binQueue.add(current); - return readerBin; - } - + public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary sequenceDictionary, final GenomeLocParser parser) { + return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource,sequenceDictionary,parser),parser); } - /** - * Filters out bins not at the lowest level in the tree. - */ - private static class LowestLevelBinFilteringIterator implements Iterator { - private BrowseableBAMIndex index; - private Iterator wrappedIterator; - - private Bin nextBin; - - public LowestLevelBinFilteringIterator(final BrowseableBAMIndex index, Iterator iterator) { - this.index = index; - this.wrappedIterator = iterator; - advance(); - } - - public boolean hasNext() { - return nextBin != null; - } - - public Bin next() { - Bin bin = nextBin; - advance(); - return bin; - } - - public void remove() { throw new UnsupportedOperationException("Remove operation is not supported"); } - - private void advance() { - nextBin = null; - while(wrappedIterator.hasNext() && nextBin == null) { - Bin bin = wrappedIterator.next(); - if(index.getLevelForBin(bin) == AbstractBAMFileIndex.getNumIndexLevels()-1) - nextBin = bin; - } - } + public static IntervalSharder shardOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { + return new IntervalSharder(BAMScheduler.createOverIntervals(dataSource,loci),loci.getGenomeLocParser()); } -} - -class BinQueueState implements Comparable { - private final SAMReaderID id; - private final BrowseableBAMIndex index; - private final int referenceSequence; - private final PeekableIterator bins; - private int firstLocusInCurrentBin; - private int lastLocusInCurrentBin; - - public BinQueueState(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Iterator bins) { - this.id = id; - this.index = index; - this.referenceSequence = referenceSequence; - this.bins = new PeekableIterator(bins); - refreshLocusInBinCache(); - } - - public SAMReaderID getReaderID() { - return id; - } - - public BrowseableBAMIndex getIndex() { - return index; - } - - public int getReferenceSequence() { - return referenceSequence; - } - - public boolean hasNextBin() { - return bins.hasNext(); + private IntervalSharder(final BAMScheduler scheduler, final GenomeLocParser parser) { + wrappedIterator = new PeekableIterator(scheduler); + this.parser = parser; } - public Bin peekNextBin() { - return bins.peek(); + public boolean hasNext() { + return wrappedIterator.hasNext(); } - public Bin nextBin() { - Bin nextBin = bins.next(); - 
refreshLocusInBinCache(); - return nextBin; - } - - public int compareTo(org.broadinstitute.sting.gatk.datasources.reads.BinQueueState other) { - if(!this.bins.hasNext() && !other.bins.hasNext()) return 0; - if(!this.bins.hasNext()) return -1; - if(!this.bins.hasNext()) return 1; - - // Both BinQueueStates have next bins. Before proceeding, make sure the bin cache is valid. - if(this.firstLocusInCurrentBin <= 0 || this.lastLocusInCurrentBin <= 0 || - other.firstLocusInCurrentBin <= 0 || other.lastLocusInCurrentBin <= 0) { - throw new ReviewedStingException("Sharding mechanism error - bin->locus cache is invalid."); - } - - // Straight integer subtraction works here because lhsStart, rhsStart always positive. - if(this.firstLocusInCurrentBin != other.firstLocusInCurrentBin) - return this.firstLocusInCurrentBin - other.firstLocusInCurrentBin; - - // Straight integer subtraction works here because lhsStop, rhsStop always positive. - return this.lastLocusInCurrentBin - other.lastLocusInCurrentBin; + /** + * Accumulate shards where there's no additional cost to processing the next shard in the sequence. + * @return The next file pointer to process. + */ + public FilePointer next() { + FilePointer current = wrappedIterator.next(); + while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0) + current = current.combine(parser,wrappedIterator.next()); + return current; } - private void refreshLocusInBinCache() { - firstLocusInCurrentBin = -1; - lastLocusInCurrentBin = -1; - if(bins.hasNext()) { - Bin bin = bins.peek(); - firstLocusInCurrentBin = index.getFirstLocusInBin(bin); - lastLocusInCurrentBin = index.getLastLocusInBin(bin); - } - } -} \ No newline at end of file + public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java new file mode 100644 index 0000000000..585b63457c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
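// Both the rewritten IntervalSharder.next() above and LocusShardBalancer below use
// the same coalescing pattern: while the head of a peekable iterator is compatible
// with the current element (same mapped/unmapped state, zero distance via minus()),
// fold it in with combine(). A generic sketch of that pattern, under that reading of
// minus()/combine(), with the predicate and merge function supplied by the caller:

import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.function.BiPredicate;
import java.util.function.BinaryOperator;

final class CoalescingIterator<T> implements Iterator<T> {
    private final Iterator<T> source;
    private final BiPredicate<T, T> canMerge;
    private final BinaryOperator<T> merge;
    private T pending;  // one-element look-ahead, as PeekableIterator provides

    CoalescingIterator(final Iterator<T> source,
                       final BiPredicate<T, T> canMerge,
                       final BinaryOperator<T> merge) {
        this.source = source;
        this.canMerge = canMerge;
        this.merge = merge;
        this.pending = source.hasNext() ? source.next() : null;
    }

    public boolean hasNext() {
        return pending != null;
    }

    public T next() {
        if (pending == null)
            throw new NoSuchElementException();
        T current = pending;
        pending = source.hasNext() ? source.next() : null;
        // Fold in successors while they remain compatible with the current run.
        while (pending != null && canMerge.test(current, pending)) {
            current = merge.apply(current, pending);
            pending = source.hasNext() ? source.next() : null;
        }
        return current;
    }

    public void remove() {
        throw new UnsupportedOperationException();
    }
}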
+ */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import java.util.Iterator; + +/** + * Batch granular file pointers into potentially larger shards. + */ +public class LocusShardBalancer extends ShardBalancer { + /** + * Convert iterators of file pointers into balanced iterators of shards. + * @return An iterator over balanced shards. + */ + public Iterator iterator() { + return new Iterator() { + public boolean hasNext() { + return filePointers.hasNext(); + } + + public Shard next() { + FilePointer current = filePointers.next(); + while(filePointers.hasNext() && current.minus(filePointers.peek()) == 0) + current = current.combine(parser,filePointers.next()); + return new LocusShard(parser,readsDataSource,current.getLocations(),current.fileSpans); + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); + } + }; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java deleted file mode 100755 index a5ca078534..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMFileSpan; -import net.sf.samtools.SAMSequenceRecord; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Map; - -/** - * A sharding strategy for loci based on reading of the index. - */ -public class LocusShardStrategy implements ShardStrategy { - /** - * The data source to use when performing this sharding. - */ - private final SAMDataSource reads; - - /** - * the parser for creating shards - */ - private GenomeLocParser genomeLocParser; - - /** - * An iterator through the available file pointers. - */ - private final Iterator filePointerIterator; - - /** - * construct the shard strategy from a seq dictionary, a shard size, and and genomeLocs - * @param reads Data source from which to load index data. 
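// ShardBalancer itself does not appear in this patch. From the way LocusShardBalancer
// above and ReadShardBalancer below consume it, a plausible minimal shape is sketched
// here; the field names match their usage in the two subclasses, but the real base
// class may well differ:

import net.sf.picard.util.PeekableIterator;
import org.broadinstitute.sting.utils.GenomeLocParser;

abstract class ShardBalancerSketch implements Iterable<Shard> {
    /** Granular file pointers to be batched into shards. */
    protected PeekableIterator<FilePointer> filePointers;
    /** Parser for interval manipulation while combining pointers. */
    protected GenomeLocParser parser;
    /** Data source used to construct and fill the resulting shards. */
    protected SAMDataSource readsDataSource;

    void initialize(final PeekableIterator<FilePointer> filePointers,
                    final GenomeLocParser parser,
                    final SAMDataSource readsDataSource) {
        this.filePointers = filePointers;
        this.parser = parser;
        this.readsDataSource = readsDataSource;
    }
}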
- * @param locations List of locations for which to load data. - */ - public LocusShardStrategy(SAMDataSource reads, IndexedFastaSequenceFile reference, GenomeLocParser genomeLocParser, GenomeLocSortedSet locations) { - this.reads = reads; - this.genomeLocParser = genomeLocParser; - - if(!reads.isEmpty()) { - GenomeLocSortedSet intervals; - if(locations == null) { - // If no locations were passed in, shard the entire BAM file. - SAMFileHeader header = reads.getHeader(); - intervals = new GenomeLocSortedSet(genomeLocParser); - - for(SAMSequenceRecord readsSequenceRecord: header.getSequenceDictionary().getSequences()) { - // Check this sequence against the reference sequence dictionary. - // TODO: Do a better job of merging reads + reference. - SAMSequenceRecord refSequenceRecord = reference.getSequenceDictionary().getSequence(readsSequenceRecord.getSequenceName()); - if(refSequenceRecord != null) { - final int length = Math.min(readsSequenceRecord.getSequenceLength(),refSequenceRecord.getSequenceLength()); - intervals.add(genomeLocParser.createGenomeLoc(readsSequenceRecord.getSequenceName(),1,length)); - } - } - } - else - intervals = locations; - - if(reads.isLowMemoryShardingEnabled()) { - /* - Iterator filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals); - List filePointers = new ArrayList(); - while(filePointerIterator.hasNext()) - filePointers.add(filePointerIterator.next()); - this.filePointerIterator = filePointers.iterator(); - */ - this.filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals); - } - else - this.filePointerIterator = IntervalSharder.shardIntervals(this.reads,intervals); - } - else { - final int maxShardSize = 100000; - List filePointers = new ArrayList(); - if(locations == null) { - for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) { - for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) { - final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength()); - filePointers.add(new FilePointer(genomeLocParser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop))); - } - } - } - else { - for(GenomeLoc interval: locations) { - while(interval.size() > maxShardSize) { - filePointers.add(new FilePointer(locations.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1))); - interval = locations.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop()); - } - filePointers.add(new FilePointer(interval)); - } - } - filePointerIterator = filePointers.iterator(); - } - - } - - /** - * returns true if there are additional shards - * - * @return false if we're done processing shards - */ - public boolean hasNext() { - return filePointerIterator.hasNext(); - } - - public long shardNumber = 0; - - /** - * gets the next Shard - * - * @return the next shard - */ - public LocusShard next() { - FilePointer nextFilePointer = filePointerIterator.next(); - Map fileSpansBounding = nextFilePointer.fileSpans != null ? 
nextFilePointer.fileSpans : null; - - /* - System.out.printf("Shard %d: interval = {",++shardNumber); - for(GenomeLoc locus: nextFilePointer.locations) - System.out.printf("%s;",locus); - System.out.printf("}; "); - - if(fileSpansBounding == null) - System.out.printf("no shard data%n"); - else { - SortedMap sortedSpans = new TreeMap(fileSpansBounding); - for(Map.Entry entry: sortedSpans.entrySet()) { - System.out.printf("Shard %d:%s = {%s}%n",shardNumber,entry.getKey().samFile,entry.getValue()); - } - } - */ - - return new LocusShard(genomeLocParser, reads,nextFilePointer.locations,fileSpansBounding); - } - - /** we don't support the remove command */ - public void remove() { - throw new UnsupportedOperationException("ShardStrategies don't support remove()"); - } - - /** - * makes the IntervalShard iterable, i.e. usable in a for loop. - * - * @return - */ - public Iterator iterator() { - return this; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java deleted file mode 100644 index bf5f33dc34..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.picard.util.PeekableIterator; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; - -import java.util.Iterator; - -/** - * Handles the process of aggregating BAM intervals into individual shards. - */ -public class LowMemoryIntervalSharder implements Iterator { - /** - * The iterator actually laying out the data for BAM scheduling. - */ - private final PeekableIterator wrappedIterator; - - /** - * The parser, for interval manipulation. - */ - private final GenomeLocParser parser; - - public LowMemoryIntervalSharder(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { - wrappedIterator = new PeekableIterator(new BAMScheduler(dataSource,loci)); - parser = loci.getGenomeLocParser(); - } - - public boolean hasNext() { - return wrappedIterator.hasNext(); - } - - /** - * Accumulate shards where there's no additional cost to processing the next shard in the sequence. 
- * @return The next file pointer to process. - */ - public FilePointer next() { - FilePointer current = wrappedIterator.next(); - while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0) - current = current.combine(parser,wrappedIterator.next()); - return current; - } - - public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShard.java deleted file mode 100644 index 278eeb8989..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShard.java +++ /dev/null @@ -1,34 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.List; - -/** - * A single, monolithic shard bridging all available data. - * @author mhanna - * @version 0.1 - */ -public class MonolithicShard extends Shard { - /** - * Creates a new monolithic shard of the given type. - * @param shardType Type of the shard. Must be either read or locus; cannot be intervalic. - * @param locs Intervals that this monolithic shard should process. - */ - public MonolithicShard(GenomeLocParser parser, SAMDataSource readsDataSource, ShardType shardType, List locs) { - super(parser, shardType, locs, readsDataSource, null, false); - if(shardType != ShardType.LOCUS && shardType != ShardType.READ) - throw new ReviewedStingException("Invalid shard type for monolithic shard: " + shardType); - } - - /** - * String representation of this shard. - * @return "entire genome". - */ - @Override - public String toString() { - return "entire genome"; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShardStrategy.java deleted file mode 100644 index 28b737f283..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShardStrategy.java +++ /dev/null @@ -1,77 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; - -import java.util.Iterator; -import java.util.List; -import java.util.NoSuchElementException; - -/** - * Create a giant shard representing all the data in the input BAM(s). - * - * @author mhanna - * @version 0.1 - */ -public class MonolithicShardStrategy implements ShardStrategy { - /** - * The single shard associated with this sharding strategy. - */ - private MonolithicShard shard; - - /** - * Create a new shard strategy for shards of the given type. - * @param shardType The shard type. - */ - public MonolithicShardStrategy(final GenomeLocParser parser, final SAMDataSource readsDataSource, final Shard.ShardType shardType, final List region) { - shard = new MonolithicShard(parser,readsDataSource,shardType,region); - } - - /** - * Convenience for using in a foreach loop. Will NOT create a new, reset instance of the iterator; - * will only return another copy of the active iterator. - * @return A copy of this. 
- */ - public Iterator iterator() { - return this; - } - - /** - * Returns true if the monolithic shard has not yet been consumed, or false otherwise. - * @return True if shard has been consumed, false otherwise. - */ - public boolean hasNext() { - return shard != null; - } - - /** - * Returns the monolithic shard if it has not already been retrieved. - * @return The monolithic shard. - * @throws NoSuchElementException if no such data exists. - */ - public Shard next() { - if(shard == null) - throw new NoSuchElementException("Monolithic shard has already been retrived."); - - Shard working = shard; - shard = null; - return working; - } - - /** - * Mandated by the interface, but is unsupported in this context. Will throw an exception always. - */ - public void remove() { - throw new UnsupportedOperationException("Cannot remove from a shard strategy"); - } - - /** - * Mandated by the interface, but is unsupported in this context. Will throw an exception always. - * @param size adjust the next size to this - */ - public void adjustNextShardSize( long size ) { - throw new UnsupportedOperationException("Cannot adjust the next size of a monolithic shard; there will be no next shard."); - } - -} - diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java index 4d9c9092d6..8d73b1b158 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java @@ -35,15 +35,29 @@ * @version 0.1 */ public class ReadShard extends Shard { + /** + * What is the maximum number of reads which should go into a read shard. + */ + public static int MAX_READS = 10000; + /** * The reads making up this shard. */ - private final Collection reads = new ArrayList(ReadShardStrategy.MAX_READS); + private final Collection reads = new ArrayList(MAX_READS); public ReadShard(GenomeLocParser parser, SAMDataSource readsDataSource, Map fileSpans, List loci, boolean isUnmapped) { super(parser, ShardType.READ, loci, readsDataSource, fileSpans, isUnmapped); } + /** + * Sets the maximum number of reads buffered in a read shard. Implemented as a weirdly static interface + * until we know what effect tuning this parameter has. + * @param bufferSize New maximum number + */ + static void setReadBufferSize(final int bufferSize) { + MAX_READS = bufferSize; + } + /** * Returns true if this shard is meant to buffer reads, rather * than just holding pointers to their locations. @@ -66,7 +80,7 @@ public boolean isBufferEmpty() { * @return True if this shard's buffer is full (and the shard can buffer reads). 
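// Later in this patch, SAMDataSource picks the default for ReadShard.MAX_READS as
// roughly 1000 reads per input BAM, capped at 250,000 reads in memory at once. The
// heuristic, extracted for illustration:

static int defaultReadBufferSize(final int numBamFiles) {
    return Math.min(1000 * numBamFiles, 250000);
}

// defaultReadBufferSize(1)   ->   1000 reads buffered per shard
// defaultReadBufferSize(30)  ->  30000
// defaultReadBufferSize(500) -> 250000 (cap reached)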
*/ public boolean isBufferFull() { - return reads.size() > ReadShardStrategy.MAX_READS; + return reads.size() > ReadShard.MAX_READS; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java new file mode 100644 index 0000000000..311c7874f9 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.GATKBAMFileSpan; +import net.sf.samtools.SAMFileSpan; + +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.NoSuchElementException; + +/** + * Divide up large file pointers containing reads into more manageable subcomponents. + */ +public class ReadShardBalancer extends ShardBalancer { + /** + * Convert iterators of file pointers into balanced iterators of shards. + * @return An iterator over balanced shards. + */ + public Iterator iterator() { + return new Iterator() { + /** + * The cached shard to be returned next. Prefetched in the peekable iterator style. + */ + private Shard nextShard = null; + + /** + * The file pointer currently being processed. + */ + private FilePointer currentFilePointer; + + /** + * Ending position of the last shard in the file. 
+ */ + private Map position = readsDataSource.getCurrentPosition(); + + { + if(filePointers.hasNext()) + currentFilePointer = filePointers.next(); + advance(); + } + + public boolean hasNext() { + return nextShard != null; + } + + public Shard next() { + if(!hasNext()) + throw new NoSuchElementException("No next read shard available"); + Shard currentShard = nextShard; + advance(); + return currentShard; + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); + } + + private void advance() { + Map shardPosition; + nextShard = null; + + Map selectedReaders = new HashMap(); + while(selectedReaders.size() == 0 && currentFilePointer != null) { + shardPosition = currentFilePointer.fileSpans; + + for(SAMReaderID id: shardPosition.keySet()) { + SAMFileSpan fileSpan = new GATKBAMFileSpan(shardPosition.get(id).removeContentsBefore(position.get(id))); + selectedReaders.put(id,fileSpan); + } + + if(!isEmpty(selectedReaders)) { + Shard shard = new ReadShard(parser,readsDataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped); + readsDataSource.fillShard(shard); + + if(!shard.isBufferEmpty()) { + nextShard = shard; + break; + } + } + + selectedReaders.clear(); + currentFilePointer = filePointers.hasNext() ? filePointers.next() : null; + } + + position = readsDataSource.getCurrentPosition(); + } + + /** + * Detects whether the list of file spans contain any read data. + * @param selectedSpans Mapping of readers to file spans. + * @return True if file spans are completely empty; false otherwise. + */ + private boolean isEmpty(Map selectedSpans) { + for(SAMFileSpan fileSpan: selectedSpans.values()) { + if(!fileSpan.isEmpty()) + return false; + } + return true; + } + }; + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java deleted file mode 100755 index 5ea75dbb0f..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardStrategy.java +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
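// advance() above resumes each reader exactly where the previous shard stopped:
// every file span in the next FilePointer is trimmed against the last recorded
// position via removeContentsBefore(). A toy model of that contract using plain
// integer offsets in place of BGZF file spans (Span is illustrative only):

final class Span {
    final long start;
    final long end;

    Span(final long start, final long end) {
        this.start = start;
        this.end = end;
    }

    /** Analogous to SAMFileSpan.removeContentsBefore: drop already-consumed content. */
    Span removeContentsBefore(final long position) {
        return new Span(Math.max(start, position), end);
    }

    boolean isEmpty() {
        return start >= end;
    }
}

// A pointer covering offsets [0, 4096) after reads up to 1024 were consumed:
// new Span(0, 4096).removeContentsBefore(1024) yields [1024, 4096), so the next
// shard begins at 1024 instead of re-reading the first block. When trimming leaves
// every reader's span empty, advance() moves on to the next FilePointer.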
- */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.samtools.SAMFileSpan; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; - -import java.util.*; - -/** - * The sharding strategy for reads using a simple counting mechanism. Each read shard - * has a specific number of reads (default to 10K) which is configured in the constructor. - * @author aaron - * @version 1.0 - * @date Apr 14, 2009 - */ -public class ReadShardStrategy implements ShardStrategy { - /** - * What is the maximum number of reads which should go into a read shard. - */ - protected static final int MAX_READS = 10000; - - /** - * The data source used to shard. - */ - private final SAMDataSource dataSource; - - /** - * The intervals to be processed. - */ - private final GenomeLocSortedSet locations; - - /** - * The cached shard to be returned next. Prefetched in the peekable iterator style. - */ - private Shard nextShard = null; - - /** our storage of the genomic locations they'd like to shard over */ - private final List filePointers = new ArrayList(); - - /** - * Iterator over the list of file pointers. - */ - private final Iterator filePointerIterator; - - /** - * The file pointer currently being processed. - */ - private FilePointer currentFilePointer; - - /** - * Ending position of the last shard in the file. - */ - private Map position; - - /** - * An indicator whether the strategy has sharded into the unmapped region. - */ - private boolean isIntoUnmappedRegion = false; - - private final GenomeLocParser parser; - - /** - * Create a new read shard strategy, loading read shards from the given BAM file. - * @param dataSource Data source from which to load shards. - * @param locations intervals to use for sharding. - */ - public ReadShardStrategy(GenomeLocParser parser, SAMDataSource dataSource, GenomeLocSortedSet locations) { - this.dataSource = dataSource; - this.parser = parser; - this.position = this.dataSource.getCurrentPosition(); - this.locations = locations; - - if(locations != null) - filePointerIterator = dataSource.isLowMemoryShardingEnabled() ? new LowMemoryIntervalSharder(this.dataSource,locations) : IntervalSharder.shardIntervals(this.dataSource,locations); - else - filePointerIterator = filePointers.iterator(); - - if(filePointerIterator.hasNext()) - currentFilePointer = filePointerIterator.next(); - - advance(); - } - - /** - * do we have another read shard? - * @return True if any more data is available. False otherwise. - */ - public boolean hasNext() { - return nextShard != null; - } - - /** - * Retrieves the next shard, if available. - * @return The next shard, if available. - * @throws java.util.NoSuchElementException if no such shard is available. 
- */ - public Shard next() { - if(!hasNext()) - throw new NoSuchElementException("No next read shard available"); - Shard currentShard = nextShard; - advance(); - return currentShard; - } - - public void advance() { - Map shardPosition = new HashMap(); - nextShard = null; - - if(locations != null) { - Map selectedReaders = new HashMap(); - while(selectedReaders.size() == 0 && currentFilePointer != null) { - shardPosition = currentFilePointer.fileSpans; - - for(SAMReaderID id: shardPosition.keySet()) { - SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id)); - if(!fileSpan.isEmpty()) - selectedReaders.put(id,fileSpan); - } - - if(selectedReaders.size() > 0) { - Shard shard = new ReadShard(parser, dataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped); - dataSource.fillShard(shard); - - if(!shard.isBufferEmpty()) { - nextShard = shard; - break; - } - } - - selectedReaders.clear(); - currentFilePointer = filePointerIterator.hasNext() ? filePointerIterator.next() : null; - } - } - else { - // todo -- this nulling of intervals is a bit annoying since readwalkers without - // todo -- any -L values need to be special cased throughout the code. - Shard shard = new ReadShard(parser,dataSource,position,null,false); - dataSource.fillShard(shard); - nextShard = !shard.isBufferEmpty() ? shard : null; - } - - this.position = dataSource.getCurrentPosition(); - } - - /** - * @throws UnsupportedOperationException always. - */ - public void remove() { - throw new UnsupportedOperationException("Remove not supported"); - } - - /** - * Convenience method for using ShardStrategy in an foreach loop. - * @return A iterator over shards. - */ - public Iterator iterator() { - return this; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReaderBin.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReaderBin.java deleted file mode 100644 index c76c1d8ae9..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReaderBin.java +++ /dev/null @@ -1,33 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.samtools.Bin; -import net.sf.samtools.BrowseableBAMIndex; - -/** - * Created by IntelliJ IDEA. - * User: mhanna - * Date: Feb 2, 2011 - * Time: 4:36:40 PM - * To change this template use File | Settings | File Templates. 
- */ -class ReaderBin { - public final SAMReaderID id; - public final BrowseableBAMIndex index; - public final int referenceSequence; - public final Bin bin; - - public ReaderBin(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Bin bin) { - this.id = id; - this.index = index; - this.referenceSequence = referenceSequence; - this.bin = bin; - } - - public int getStart() { - return index.getFirstLocusInBin(bin); - } - - public int getStop() { - return index.getLastLocusInBin(bin); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 8452aadfd9..2e243b8473 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -37,8 +37,11 @@ import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.iterators.*; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.SimpleTimer; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.baq.BAQSamIterator; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -49,6 +52,7 @@ import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.*; +import java.util.concurrent.*; /** * User: aaron @@ -60,6 +64,9 @@ public class SAMDataSource { final private static GATKSamRecordFactory factory = new GATKSamRecordFactory(); + /** If true, we will load SAMReaders in parallel */ + final private static boolean USE_PARALLEL_LOADING = false; + /** Backing support for reads. */ protected final ReadProperties readProperties; @@ -71,7 +78,7 @@ public class SAMDataSource { /** * Tools for parsing GenomeLocs, for verifying BAM ordering against general ordering. */ - private final GenomeLocParser genomeLocParser; + protected final GenomeLocParser genomeLocParser; /** * Identifiers for the readers driving this data source. @@ -91,13 +98,18 @@ public class SAMDataSource { /** * How far along is each reader? */ - private final Map readerPositions = new HashMap(); + private final Map readerPositions = new HashMap(); /** * The merged header. */ private final SAMFileHeader mergedHeader; + /** + * The constituent headers of the unmerged files. + */ + private final Map headers = new HashMap(); + /** * The sort order of the BAM files. Files without a sort order tag are assumed to be * in coordinate order. @@ -131,17 +143,24 @@ public class SAMDataSource { private final SAMResourcePool resourcePool; /** - * Whether to enable the new low-memory sharding mechanism. + * Asynchronously loads BGZF blocks. */ - private boolean enableLowMemorySharding = false; + private final BGZFBlockLoadingDispatcher dispatcher; + + /** + * How are threads allocated. + */ + private final ThreadAllocation threadAllocation; /** * Create a new SAM data source given the supplied read metadata. * @param samFiles list of reads files. 
*/ - public SAMDataSource(Collection samFiles,GenomeLocParser genomeLocParser) { + public SAMDataSource(Collection samFiles, ThreadAllocation threadAllocation, Integer numFileHandles, GenomeLocParser genomeLocParser) { this( samFiles, + threadAllocation, + numFileHandles, genomeLocParser, false, SAMFileReader.ValidationStringency.STRICT, @@ -150,8 +169,7 @@ public SAMDataSource(Collection samFiles,GenomeLocParser genomeLocP new ValidationExclusion(), new ArrayList(), false, - false, - true); + false); } /** @@ -159,6 +177,8 @@ public SAMDataSource(Collection samFiles,GenomeLocParser genomeLocP */ public SAMDataSource( Collection samFiles, + ThreadAllocation threadAllocation, + Integer numFileHandles, GenomeLocParser genomeLocParser, boolean useOriginalBaseQualities, SAMFileReader.ValidationStringency strictness, @@ -167,9 +187,10 @@ public SAMDataSource( ValidationExclusion exclusionList, Collection supplementalFilters, boolean includeReadsWithDeletionAtLoci, - boolean generateExtendedEvents, - boolean enableLowMemorySharding) { + boolean generateExtendedEvents) { this( samFiles, + threadAllocation, + numFileHandles, genomeLocParser, useOriginalBaseQualities, strictness, @@ -182,9 +203,8 @@ public SAMDataSource( BAQ.CalculationMode.OFF, BAQ.QualityMode.DONT_MODIFY, null, // no BAQ - (byte) -1, - enableLowMemorySharding); - } + (byte) -1); + } /** * Create a new SAM data source given the supplied read metadata. @@ -205,6 +225,8 @@ public SAMDataSource( */ public SAMDataSource( Collection samFiles, + ThreadAllocation threadAllocation, + Integer numFileHandles, GenomeLocParser genomeLocParser, boolean useOriginalBaseQualities, SAMFileReader.ValidationStringency strictness, @@ -217,28 +239,45 @@ public SAMDataSource( BAQ.CalculationMode cmode, BAQ.QualityMode qmode, IndexedFastaSequenceFile refReader, - byte defaultBaseQualities, - boolean enableLowMemorySharding) { - this.enableLowMemorySharding(enableLowMemorySharding); + byte defaultBaseQualities) { this.readMetrics = new ReadMetrics(); this.genomeLocParser = genomeLocParser; readerIDs = samFiles; + + this.threadAllocation = threadAllocation; + // TODO: Consider a borrowed-thread dispatcher implementation. + if(this.threadAllocation.getNumIOThreads() > 0) { + logger.info("Running in asynchronous I/O mode; number of threads = " + this.threadAllocation.getNumIOThreads()); + dispatcher = new BGZFBlockLoadingDispatcher(this.threadAllocation.getNumIOThreads(), numFileHandles != null ? numFileHandles : 1); + } + else + dispatcher = null; + validationStringency = strictness; - for (SAMReaderID readerID : samFiles) { - if (!readerID.samFile.canRead()) - throw new UserException.CouldNotReadInputFile(readerID.samFile,"file is not present or user does not have appropriate permissions. " + - "Please check that the file is present and readable and try again."); + if(readBufferSize != null) + ReadShard.setReadBufferSize(readBufferSize); + else { + // Choose a sensible default for the read buffer size. For the moment, we're picking 1000 reads per BAM per shard (which effectively + // will mean per-thread once ReadWalkers are parallelized) with a max cap of 250K reads in memory at once. + ReadShard.setReadBufferSize(Math.min(1000*samFiles.size(),250000)); } resourcePool = new SAMResourcePool(Integer.MAX_VALUE); SAMReaders readers = resourcePool.getAvailableReaders(); // Determine the sort order. - for(SAMFileReader reader: readers.values()) { + for(SAMReaderID readerID: readerIDs) { + if (! 
readerID.samFile.canRead() ) + throw new UserException.CouldNotReadInputFile(readerID.samFile,"file is not present or user does not have appropriate permissions. " + + "Please check that the file is present and readable and try again."); + // Get the sort order, forcing it to coordinate if unsorted. + SAMFileReader reader = readers.getReader(readerID); SAMFileHeader header = reader.getFileHeader(); + headers.put(readerID,header); + if ( header.getReadGroups().isEmpty() ) { throw new UserException.MalformedBAM(readers.getReaderID(reader).samFile, "SAM file doesn't have any read groups defined in the header. The GATK no longer supports SAM files without read groups"); @@ -256,16 +295,14 @@ public SAMDataSource( initializeReaderPositions(readers); - SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,readers.headers(),true); - mergedHeader = headerMerger.getMergedHeader(); - hasReadGroupCollisions = headerMerger.hasReadGroupCollisions(); + mergedHeader = readers.getMergedHeader(); + hasReadGroupCollisions = readers.hasReadGroupCollisions(); readProperties = new ReadProperties( samFiles, mergedHeader, useOriginalBaseQualities, strictness, - readBufferSize, downsamplingMethod, exclusionList, supplementalFilters, @@ -275,7 +312,7 @@ public SAMDataSource( qmode, refReader, defaultBaseQualities); - + // cache the read group id (original) -> read group id (merged) // and read group id (merged) -> read group id (original) mappings. for(SAMReaderID id: readerIDs) { @@ -284,9 +321,9 @@ public SAMDataSource( List readGroups = reader.getFileHeader().getReadGroups(); for(SAMReadGroupRecord readGroup: readGroups) { - if(headerMerger.hasReadGroupCollisions()) { - mappingToMerged.put(readGroup.getReadGroupId(),headerMerger.getReadGroupId(reader,readGroup.getReadGroupId())); - mergedToOriginalReadGroupMappings.put(headerMerger.getReadGroupId(reader,readGroup.getReadGroupId()),readGroup.getReadGroupId()); + if(hasReadGroupCollisions) { + mappingToMerged.put(readGroup.getReadGroupId(),readers.getReadGroupId(id,readGroup.getReadGroupId())); + mergedToOriginalReadGroupMappings.put(readers.getReadGroupId(id,readGroup.getReadGroupId()),readGroup.getReadGroupId()); } else { mappingToMerged.put(readGroup.getReadGroupId(),readGroup.getReadGroupId()); mergedToOriginalReadGroupMappings.put(readGroup.getReadGroupId(),readGroup.getReadGroupId()); @@ -296,12 +333,10 @@ public SAMDataSource( originalToMergedReadGroupMappings.put(id,mappingToMerged); } - if(enableLowMemorySharding) { - for(SAMReaderID id: readerIDs) { - File indexFile = findIndexFile(id.samFile); - if(indexFile != null) - bamIndices.put(id,new GATKBAMIndex(indexFile)); - } + for(SAMReaderID id: readerIDs) { + File indexFile = findIndexFile(id.samFile); + if(indexFile != null) + bamIndices.put(id,new GATKBAMIndex(indexFile)); } resourcePool.releaseReaders(readers); @@ -314,22 +349,6 @@ public SAMDataSource( */ public ReadProperties getReadsInfo() { return readProperties; } - /** - * Enable experimental low-memory sharding. - * @param enable True to enable sharding. False otherwise. - */ - public void enableLowMemorySharding(final boolean enable) { - enableLowMemorySharding = enable; - } - - /** - * Returns whether low-memory sharding is enabled. - * @return True if enabled, false otherwise. - */ - public boolean isLowMemoryShardingEnabled() { - return enableLowMemorySharding; - } - /** * Checks to see whether any reads files are supplying data. 
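// The bamIndices map consulted by hasIndex()/getIndex() above is populated via
// findIndexFile(), whose body is outside this hunk. By SAM/BAM convention an index
// sits either at <name>.bam.bai or at <name>.bai; a sketch of that conventional
// lookup (the real findIndexFile may differ):

import java.io.File;

static File findIndexFileSketch(final File bamFile) {
    final File appended = new File(bamFile.getPath() + ".bai");
    if (appended.exists())
        return appended;
    final String path = bamFile.getPath();
    if (path.endsWith(".bam")) {
        final File swapped = new File(path.substring(0, path.length() - 4) + ".bai");
        if (swapped.exists())
            return swapped;
    }
    return null;  // No index found; this reader will be reported as unindexed.
}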
* @return True if no reads files are supplying data to the traversal; false otherwise. @@ -368,7 +387,7 @@ public SAMReaderID getReaderID(SAMRecord read) { * Retrieves the current position within the BAM file. * @return A mapping of reader to current position. */ - public Map getCurrentPosition() { + public Map getCurrentPosition() { return readerPositions; } @@ -381,7 +400,7 @@ public SAMFileHeader getHeader() { } public SAMFileHeader getHeader(SAMReaderID id) { - return resourcePool.getReadersWithoutLocking().getReader(id).getFileHeader(); + return headers.get(id); } /** @@ -404,45 +423,21 @@ public String getOriginalReadGroupId(final String mergedReadGroupId) { return mergedToOriginalReadGroupMappings.get(mergedReadGroupId); } - /** - * No read group collisions at this time because only one SAM file is currently supported. - * @return False always. - */ - public boolean hasReadGroupCollisions() { - return hasReadGroupCollisions; - } - /** * True if all readers have an index. * @return True if all readers have an index. */ public boolean hasIndex() { - if(enableLowMemorySharding) - return readerIDs.size() == bamIndices.size(); - else { - for(SAMFileReader reader: resourcePool.getReadersWithoutLocking()) { - if(!reader.hasIndex()) - return false; - } - return true; - } + return readerIDs.size() == bamIndices.size(); } /** * Gets the index for a particular reader. Always preloaded. - * TODO: Should return object of type GATKBAMIndex, but cannot because there - * TODO: is no parent class of both BAMIndex and GATKBAMIndex. Change when new - * TODO: sharding system goes live. * @param id Id of the reader. * @return The index. Will preload the index if necessary. */ - public Object getIndex(final SAMReaderID id) { - if(enableLowMemorySharding) - return bamIndices.get(id); - else { - SAMReaders readers = resourcePool.getReadersWithoutLocking(); - return readers.getReader(id).getBrowseableIndex(); - } + public GATKBAMIndex getIndex(final SAMReaderID id) { + return bamIndices.get(id); } /** @@ -454,7 +449,7 @@ public SAMFileHeader.SortOrder getSortOrder() { } /** - * Gets the cumulative read metrics for shards already processed. + * Gets the cumulative read metrics for shards already processed. * @return Cumulative read metrics. */ public ReadMetrics getCumulativeReadMetrics() { @@ -486,10 +481,13 @@ public void fillShard(Shard shard) { // Cache the most recently viewed read so that we can check whether we've reached the end of a pair. SAMRecord read = null; + Map positionUpdates = new IdentityHashMap(); + CloseableIterator iterator = getIterator(readers,shard,sortOrder == SAMFileHeader.SortOrder.coordinate); while(!shard.isBufferFull() && iterator.hasNext()) { read = iterator.next(); - addReadToBufferingShard(shard,getReaderID(readers,read),read); + shard.addRead(read); + noteFilePositionUpdate(positionUpdates,read); } // If the reads are sorted in queryname order, ensure that all reads @@ -499,18 +497,24 @@ public void fillShard(Shard shard) { SAMRecord nextRead = iterator.next(); if(read == null || !read.getReadName().equals(nextRead.getReadName())) break; - addReadToBufferingShard(shard,getReaderID(readers,nextRead),nextRead); + shard.addRead(nextRead); + noteFilePositionUpdate(positionUpdates,nextRead); } } iterator.close(); + + // Make the updates specified by the reader. 
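+            // Note: positionUpdates is keyed on reader identity (an IdentityHashMap), so each reader retains +            // only the file span following the most recent read drawn from it; the loop below then records +            // where each reader's iteration left off.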
+ for(Map.Entry positionUpdate: positionUpdates.entrySet()) + readerPositions.put(readers.getReaderID(positionUpdate.getKey()),positionUpdate.getValue()); } - public StingSAMIterator seek(Shard shard) { - // todo: refresh monolithic sharding implementation - if(shard instanceof MonolithicShard) - return seekMonolithic(shard); + private void noteFilePositionUpdate(Map positionMapping, SAMRecord read) { + GATKBAMFileSpan endChunk = new GATKBAMFileSpan(read.getFileSource().getFilePointer().getContentsFollowing()); + positionMapping.put(read.getFileSource().getReader(),endChunk); + } + public StingSAMIterator seek(Shard shard) { if(shard.buffersReads()) { return shard.iterator(); } @@ -540,7 +544,7 @@ private SAMReaderID getReaderID(SAMReaders readers, SAMRecord read) { */ private void initializeReaderPositions(SAMReaders readers) { for(SAMReaderID id: getReaderIDs()) - readerPositions.put(id,readers.getReader(id).getFilePointerSpanningReads()); + readerPositions.put(id,new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads())); } /** @@ -548,25 +552,26 @@ private void initializeReaderPositions(SAMReaders readers) { * @param readers Readers from which to load data. * @param shard The shard specifying the data limits. * @param enableVerification True to verify. For compatibility with old sharding strategy. - * TODO: Collapse this flag when the two sharding systems are merged. * @return An iterator over the selected data. */ private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) { - SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,readers.headers(),true); - // Set up merging to dynamically merge together multiple BAMs. - MergingSamRecordIterator mergingIterator = new MergingSamRecordIterator(headerMerger,readers.values(),true); + MergingSamRecordIterator mergingIterator = readers.createMergingIterator(); for(SAMReaderID id: getReaderIDs()) { CloseableIterator iterator = null; - if(!shard.isUnmapped() && shard.getFileSpans().get(id) == null) - continue; - iterator = shard.getFileSpans().get(id) != null ? - readers.getReader(id).iterator(shard.getFileSpans().get(id)) : - readers.getReader(id).queryUnmapped(); - if(readProperties.getReadBufferSize() != null) - iterator = new BufferingReadIterator(iterator,readProperties.getReadBufferSize()); - if(shard.getGenomeLocs() != null) + + // TODO: null used to be the signal for unmapped, but we've replaced that with a simple index query for the last bin. + // TODO: Kill this check once we've proven that the design elements are gone. + if(shard.getFileSpans().get(id) == null) + throw new ReviewedStingException("SAMDataSource: received null location for reader " + id + ", but null locations are no longer supported."); + + if(threadAllocation.getNumIOThreads() > 0) { + BlockInputStream inputStream = readers.getInputStream(id); + inputStream.submitAccessPlan(new SAMReaderPosition(id,inputStream,(GATKBAMFileSpan)shard.getFileSpans().get(id))); + } + iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); + if(shard.getGenomeLocs().size() > 0) iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs()); mergingIterator.addIterator(readers.getReader(id),iterator); } @@ -584,45 +589,6 @@ private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean en readProperties.defaultBaseQualities()); } - /** - * A stopgap measure to handle monolithic sharding - * @param shard the (monolithic) shard. 
- * @return An iterator over the monolithic shard. - */ - private StingSAMIterator seekMonolithic(Shard shard) { - SAMReaders readers = resourcePool.getAvailableReaders(); - - // Set up merging and filtering to dynamically merge together multiple BAMs and filter out records not in the shard set. - SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,readers.headers(),true); - MergingSamRecordIterator mergingIterator = new MergingSamRecordIterator(headerMerger,readers.values(),true); - for(SAMReaderID id: getReaderIDs()) - mergingIterator.addIterator(readers.getReader(id),readers.getReader(id).iterator()); - - return applyDecoratingIterators(shard.getReadMetrics(), - shard instanceof ReadShard, - readProperties.useOriginalBaseQualities(), - new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(mergingIterator)), - readProperties.getDownsamplingMethod().toFraction, - readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), - readProperties.getSupplementalFilters(), - readProperties.getBAQCalculationMode(), - readProperties.getBAQQualityMode(), - readProperties.getRefReader(), - readProperties.defaultBaseQualities()); - } - - /** - * Adds this read to the given shard. - * @param shard The shard to which to add the read. - * @param id The id of the given reader. - * @param read The read to add to the shard. - */ - private void addReadToBufferingShard(Shard shard,SAMReaderID id,SAMRecord read) { - SAMFileSpan endChunk = read.getFileSource().getFilePointer().getContentsFollowing(); - shard.addRead(read); - readerPositions.put(id,endChunk); - } - /** * Filter reads based on user-specified criteria. * @@ -689,19 +655,6 @@ public SAMResourcePool(final int maxEntries) { this.maxEntries = maxEntries; } - /** - * Dangerous internal method; retrieves any set of readers, whether in iteration or not. - * Used to handle non-exclusive, stateless operations, such as index queries. - * @return Any collection of SAMReaders, whether in iteration or not. - */ - protected SAMReaders getReadersWithoutLocking() { - synchronized(this) { - if(allResources.size() == 0) - createNewResource(); - } - return allResources.get(0); - } - /** * Choose a set of readers from the pool to use for this query. When complete, * @return @@ -748,31 +701,154 @@ private synchronized void createNewResource() { * A collection of readers derived from a reads metadata structure. */ private class SAMReaders implements Iterable { + /** + * Cached representation of the merged header used to generate a merging iterator. + */ + private final SamFileHeaderMerger headerMerger; + /** * Internal storage for a map of id -> reader. */ private final Map readers = new LinkedHashMap(); + /** + * The input streams backing each reader; populated only when asynchronous I/O is enabled. + */ + private final Map inputStreams = new LinkedHashMap(); + /** * Derive a new set of readers from the Reads metadata. * @param readerIDs reads to load. * @param validationStringency validation stringency.
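     *      *      * Note on the construction that follows: when USE_PARALLEL_LOADING is set, one ReaderInitializer task      * per file is submitted to a fixed pool of 8 threads and polled until complete, sleeping roughly half a      * second per outstanding file divided across the pool between polls (clamped to 1-30s); otherwise the      * files are opened serially, with progress logged every 50 files.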
*/ public SAMReaders(Collection readerIDs, SAMFileReader.ValidationStringency validationStringency) { - for(SAMReaderID readerID: readerIDs) { - SAMFileReader reader = new SAMFileReader(readerID.samFile); - reader.setSAMRecordFactory(factory); - reader.enableFileSource(true); - reader.enableIndexMemoryMapping(false); - if(!enableLowMemorySharding) - reader.enableIndexCaching(true); - reader.setValidationStringency(validationStringency); - - final SAMFileHeader header = reader.getFileHeader(); - logger.debug(String.format("Sort order is: " + header.getSortOrder())); - - readers.put(readerID,reader); + final int totalNumberOfFiles = readerIDs.size(); + int readerNumber = 1; + final SimpleTimer timer = new SimpleTimer().start(); + + if ( totalNumberOfFiles > 0 ) logger.info("Initializing SAMRecords " + (USE_PARALLEL_LOADING ? "in parallel" : "in serial")); + if ( ! USE_PARALLEL_LOADING ) { + final int tickSize = 50; + int nExecutedTotal = 0; + long lastTick = timer.currentTime(); + for(final SAMReaderID readerID: readerIDs) { + final ReaderInitializer init = new ReaderInitializer(readerID).call(); + if (threadAllocation.getNumIOThreads() > 0) { + inputStreams.put(init.readerID, init.blockInputStream); // get from initializer + } + + logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, readerID.samFile)); + readers.put(init.readerID,init.reader); + if ( ++nExecutedTotal % tickSize == 0) { + double tickInSec = (timer.currentTime() - lastTick) / 1000.0; + printReaderPerformance(nExecutedTotal, tickSize, totalNumberOfFiles, timer, tickInSec); + lastTick = timer.currentTime(); + } + } + } else { + final int N_THREADS = 8; + + final ExecutorService executor = Executors.newFixedThreadPool(N_THREADS); + final List inits = new ArrayList(totalNumberOfFiles); + Queue> futures = new LinkedList>(); + for (final SAMReaderID readerID: readerIDs) { + logger.debug("Enqueuing for initialization: " + readerID.samFile); + final ReaderInitializer init = new ReaderInitializer(readerID); + inits.add(init); + futures.add(executor.submit(init)); + } + + try { + final int MAX_WAIT = 30 * 1000; + final int MIN_WAIT = 1 * 1000; + + while ( ! 
futures.isEmpty() ) { + final int prevSize = futures.size(); + final double waitTime = prevSize * (0.5 / N_THREADS); // about 0.5 seconds to load each file + final int waitTimeInMS = Math.min(MAX_WAIT, Math.max((int) (waitTime * 1000), MIN_WAIT)); + Thread.sleep(waitTimeInMS); + + Queue> pending = new LinkedList>(); + for ( final Future initFuture : futures ) { + if ( initFuture.isDone() ) { + final ReaderInitializer init = initFuture.get(); + if (threadAllocation.getNumIOThreads() > 0) { + inputStreams.put(init.readerID, init.blockInputStream); // get from initializer + } + logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, init.readerID)); + readers.put(init.readerID, init.reader); + } else { + pending.add(initFuture); + } + } + + final int nExecutedTotal = totalNumberOfFiles - pending.size(); + final int nExecutedInTick = prevSize - pending.size(); + printReaderPerformance(nExecutedTotal, nExecutedInTick, totalNumberOfFiles, timer, waitTimeInMS / 1000.0); + futures = pending; + } + } catch ( InterruptedException e ) { + throw new ReviewedStingException("Interrupted SAMReader initialization", e); + } catch ( ExecutionException e ) { + throw new ReviewedStingException("Execution exception during SAMReader initialization", e); + } + + executor.shutdown(); } + + if ( totalNumberOfFiles > 0 ) logger.info(String.format("Done initializing BAM readers: total time %.2f", timer.getElapsedTime())); + + Collection headers = new LinkedList(); + for(SAMFileReader reader: readers.values()) + headers.add(reader.getFileHeader()); + headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,headers,true); + } + + final private void printReaderPerformance(final int nExecutedTotal, + final int nExecutedInTick, + final int totalNumberOfFiles, + final SimpleTimer timer, + final double tickDurationInSec) { + final int pendingSize = totalNumberOfFiles - nExecutedTotal; + final double totalTimeInSeconds = timer.getElapsedTime(); + final double nTasksPerSecond = nExecutedTotal / (1.0*totalTimeInSeconds); + final int nRemaining = pendingSize; + final double estTimeToComplete = pendingSize / nTasksPerSecond; + logger.info(String.format("Init %d BAMs in last %.2f s, %d of %d in %.2f s / %.2f m (%.2f tasks/s). %d remaining with est. completion in %.2f s / %.2f m", + nExecutedInTick, tickDurationInSec, + nExecutedTotal, totalNumberOfFiles, totalTimeInSeconds, totalTimeInSeconds / 60, nTasksPerSecond, + nRemaining, estTimeToComplete, estTimeToComplete / 60)); + } + + /** + * Return the header derived from the merging of these BAM files. + * @return the merged header. + */ + public SAMFileHeader getMergedHeader() { + return headerMerger.getMergedHeader(); + } + + /** + * Do multiple read groups collide in this dataset? + * @return True if multiple read groups collide; false otherwise. + */ + public boolean hasReadGroupCollisions() { + return headerMerger.hasReadGroupCollisions(); + } + + /** + * Get the newly mapped read group ID for the given read group. + * @param readerID Reader for which to discern the transformed ID. + * @param originalReadGroupID Original read group. + * @return Remapped read group.
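+     * +     * A usage sketch (read group names hypothetical): if two input BAMs both declare read group "rg1", +     * the merged header disambiguates them, and {@code getReadGroupId(readerID, "rg1")} returns the +     * remapped ID for that reader's copy.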
+ */ + public String getReadGroupId(final SAMReaderID readerID, final String originalReadGroupID) { + SAMFileHeader header = readers.get(readerID).getFileHeader(); + return headerMerger.getReadGroupId(header,originalReadGroupID); + } + + public MergingSamRecordIterator createMergingIterator() { + return new MergingSamRecordIterator(headerMerger,readers.values(),true); } /** @@ -786,6 +862,15 @@ public SAMFileReader getReader(SAMReaderID id) { return readers.get(id); } + /** + * Retrieve the input stream backing a reader. + * @param id The ID of the reader to retrieve. + * @return the input stream associated with the given id. + */ + public BlockInputStream getInputStream(final SAMReaderID id) { + return inputStreams.get(id); + } + /** * Searches for the reader id of this reader. * @param reader Reader for which to search. @@ -815,24 +900,29 @@ public Iterator iterator() { public boolean isEmpty() { return readers.isEmpty(); } + } - /** - * Gets all the actual readers out of this data structure. - * @return A collection of the readers. - */ - public Collection values() { - return readers.values(); + class ReaderInitializer implements Callable { + final SAMReaderID readerID; + BlockInputStream blockInputStream = null; + SAMFileReader reader; + + public ReaderInitializer(final SAMReaderID readerID) { + this.readerID = readerID; } - /** - * Gets all the actual readers out of this data structure. - * @return A collection of the readers. - */ - public Collection headers() { - ArrayList headers = new ArrayList(readers.size()); - for (SAMFileReader reader : values()) - headers.add(reader.getFileHeader()); - return headers; + public ReaderInitializer call() { + final File indexFile = findIndexFile(readerID.samFile); + if (threadAllocation.getNumIOThreads() > 0) { + blockInputStream = new BlockInputStream(dispatcher,readerID,false); + reader = new SAMFileReader(blockInputStream,indexFile,false); + } + else + reader = new SAMFileReader(readerID.samFile,indexFile,false); + reader.setSAMRecordFactory(factory); + reader.enableFileSource(true); + reader.setValidationStringency(validationStringency); + return this; } } @@ -883,7 +973,7 @@ private class ReadGroupMapping extends HashMap {} * Filters out reads that do not overlap the current GenomeLoc. * Note the custom implementation: BAM index querying returns all reads that could * possibly overlap the given region (and quite a few extras). In order not to drag - * down performance, this implementation is highly customized to its task. + * down performance, this implementation is highly customized to its task. */ private class IntervalOverlapFilteringIterator implements CloseableIterator { /** @@ -903,7 +993,7 @@ private class IntervalOverlapFilteringIterator implements CloseableIterator iterator, L i++; } } - + advance(); } @@ -1018,12 +1108,12 @@ private boolean readEndsOnOrAfterStartingBound(final SAMRecord read) { return // Read ends on a later contig, or... read.getReferenceIndex() > intervalContigIndices[currentBound] || - // Read ends of this contig... - (read.getReferenceIndex() == intervalContigIndices[currentBound] && - // either after this location, or... - (read.getAlignmentEnd() >= intervalStarts[currentBound] || - // read is unmapped but positioned and alignment start is on or after this start point. - (read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound]))); + // Read ends on this contig... + (read.getReferenceIndex() == intervalContigIndices[currentBound] && + // either after this location, or...
+ (read.getAlignmentEnd() >= intervalStarts[currentBound] || + // read is unmapped but positioned and alignment start is on or after this start point. + (read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound]))); } /** @@ -1035,8 +1125,8 @@ private boolean readStartsOnOrBeforeEndingBound(final SAMRecord read) { return // Read starts on a prior contig, or... read.getReferenceIndex() < intervalContigIndices[currentBound] || - // Read starts on this contig and the alignment start is registered before this end point. - (read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]); + // Read starts on this contig and the alignment start is registered before this end point. + (read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]); } } @@ -1070,6 +1160,40 @@ private File findIndexFile(File bamFile) { return indexFile; } + + /** + * Creates a BAM schedule over all reads in the BAM file, both mapped and unmapped. The outgoing stream + * will be as granular as possible given our current knowledge of the best ways to split up BAM files. + * @return An iterator that spans all reads in all BAM files. + */ + public Iterable createShardIteratorOverAllReads(final ShardBalancer shardBalancer) { + shardBalancer.initialize(this,IntervalSharder.shardOverAllReads(this,genomeLocParser),genomeLocParser); + return shardBalancer; + } + + /** + * Creates a BAM schedule over all mapped reads in the BAM file, where a 'mapped' read is defined as any + * read that has been assigned a position on a reference contig. + * @return An iterator that spans all mapped reads in all BAM files. + */ + public Iterable createShardIteratorOverMappedReads(final SAMSequenceDictionary sequenceDictionary, final ShardBalancer shardBalancer) { + shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,sequenceDictionary,genomeLocParser),genomeLocParser); + return shardBalancer; + } + + /** + * Create a schedule for processing the initialized BAM file using the given interval list. + * The returned schedule should be as granular as possible. + * @param intervals The list of intervals for which to create the schedule. + * @return A granular iterator over file pointers. + */ + public Iterable createShardIteratorOverIntervals(final GenomeLocSortedSet intervals,final ShardBalancer shardBalancer) { + if(intervals == null) + throw new ReviewedStingException("Unable to create schedule from intervals; no intervals were provided."); + shardBalancer.initialize(this,IntervalSharder.shardOverIntervals(SAMDataSource.this,intervals),genomeLocParser); + return shardBalancer; + } } + diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderID.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderID.java index c84db7770d..5eba5d84f6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderID.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderID.java @@ -67,6 +67,7 @@ public Tags getTags() { * @param other The other identifier. * @return True iff the two readers point to the same file. */ + @Override public boolean equals(Object other) { if(other == null) return false; if(!(other instanceof SAMReaderID)) return false; @@ -79,10 +80,20 @@ public boolean equals(Object other) { * Generate a hash code for this object. * @return A hash code, based solely on the file name at this point.
*/ + @Override public int hashCode() { return samFile.hashCode(); } + /** + * Best string representation for a SAM file reader is the path of the source file. + */ + @Override + public String toString() { + return getSamFilePath(); + } + + @Override public int compareTo(Object other) { return this.samFile.getAbsolutePath().compareTo(((SAMReaderID)other).samFile.getAbsolutePath()); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java new file mode 100644 index 0000000000..f9f6539a7f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.picard.util.PeekableIterator; +import net.sf.samtools.GATKBAMFileSpan; +import net.sf.samtools.GATKChunk; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.List; + +/** +* Tracks a reader's progress through the BGZF blocks of a BAM file span, handing +* the block-loading dispatcher the address of the next block to read. +*/ +class SAMReaderPosition { + private final SAMReaderID reader; + private final BlockInputStream inputStream; + + private final List positions; + private PeekableIterator positionIterator; + + /** + * Stores the next block address to read, or -1 if no such block is available. + */ + private long nextBlockAddress; + + + SAMReaderPosition(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) { + this.reader = reader; + this.inputStream = inputStream; + + this.positions = fileSpan.getGATKChunks(); + initialize(); + } + + public SAMReaderID getReader() { + return reader; + } + + public BlockInputStream getInputStream() { + return inputStream; + } + + /** + * Retrieves the next block address to be read. + * @return Next block address to be read. + */ + public long getBlockAddress() { + return nextBlockAddress; + } + + public void reset() { + initialize(); + } + + /** + * Resets the SAM reader position to its original state.
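+     * (For example, given a span whose chunks begin at block addresses 100 and 500, initialization +     * leaves nextBlockAddress at 100; an empty span yields the -1 sentinel.)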
+ */ + private void initialize() { + this.positionIterator = new PeekableIterator(positions.iterator()); + if(positionIterator.hasNext()) + nextBlockAddress = positionIterator.peek().getBlockStart(); + else + nextBlockAddress = -1; + } + + /** + * Advances the current position to the next block to read, given the current position in the file. + * @param filePosition The current position within the file. + */ + void advancePosition(final long filePosition) { + nextBlockAddress = filePosition; + + // Check the current file position against the iterator; if the iterator is before the current file position, + // draw the iterator forward. Remember when performing the check that coordinates are half-open! + try { + while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek())) { + positionIterator.next(); + // Check to see if the iterator has more data available. + if(positionIterator.hasNext() && filePosition < positionIterator.peek().getBlockStart()) { + nextBlockAddress = positionIterator.peek().getBlockStart(); + break; + } + } + } + catch(Exception ex) { + throw new ReviewedStingException("Unable to advance the position iterator past file position " + filePosition, ex); + } + } + + private boolean isFilePositionPastEndOfChunk(final long filePosition, final GATKChunk chunk) { + return (filePosition > chunk.getBlockEnd() || (filePosition == chunk.getBlockEnd() && chunk.getBlockOffsetEnd() == 0)); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java new file mode 100644 index 0000000000..962208086d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java @@ -0,0 +1,21 @@ +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.picard.util.PeekableIterator; +import org.broadinstitute.sting.utils.GenomeLocParser; + +import java.util.Iterator; + +/** + * Balances maximally granular file pointers into shards of reasonable size. + */ +public abstract class ShardBalancer implements Iterable { + protected SAMDataSource readsDataSource; + protected PeekableIterator filePointers; + protected GenomeLocParser parser; + + public void initialize(final SAMDataSource readsDataSource, final Iterator filePointers, final GenomeLocParser parser) { + this.readsDataSource = readsDataSource; + this.filePointers = new PeekableIterator(filePointers); + this.parser = parser; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategy.java deleted file mode 100644 index 989cf9fce1..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategy.java +++ /dev/null @@ -1,31 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import java.util.Iterator; -/** - * - * User: aaron - * Date: Apr 10, 2009 - * Time: 4:55:37 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - -/** - * @author aaron - * @version 1.0 - * @date Apr 10, 2009 - *
- * Interface ShardStrategy - *
- * The base interface for the sharding strategy; before we had a base abstract - * class, but not this will be an interface to accomidate read based sharding - */ -public interface ShardStrategy extends Iterator, Iterable { -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategyFactory.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategyFactory.java deleted file mode 100644 index 780b41ef72..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardStrategyFactory.java +++ /dev/null @@ -1,117 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMSequenceDictionary; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -/** - * - * User: aaron - * Date: Apr 6, 2009 - * Time: 7:09:22 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - - -/** - * @author aaron - * @version 1.0 - * @date Apr 6, 2009 - *
- * Class ShardStrategyFactory - *
- * The Shard Strategy Factory, use this class to create and transfer shard strategies - * between different approaches. - */ -public class ShardStrategyFactory { - public enum SHATTER_STRATEGY { - MONOLITHIC, // Put all of the available data into one shard. - LOCUS_EXPERIMENTAL, - READS_EXPERIMENTAL - } - - /** - * get a new shatter strategy - * - * @param readsDataSource File pointer to BAM. - * @param referenceDataSource File pointer to reference. - * @param strat what's our strategy - SHATTER_STRATEGY type - * @param dic the seq dictionary - * @param startingSize the starting size - * @return a shard strategy capable of dividing input data into shards. - */ - static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser) { - return ShardStrategyFactory.shatter(readsDataSource, referenceDataSource, strat, dic, startingSize, genomeLocParser, -1L); - } - - /** - * get a new shatter strategy - * - * @param readsDataSource File pointer to BAM. - * @param referenceDataSource File pointer to reference. - * @param strat what's our strategy - SHATTER_STRATEGY type - * @param dic the seq dictionary - * @param startingSize the starting size - * @return a shard strategy capable of dividing input data into shards. - */ - static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, long limitByCount) { - switch (strat) { - case LOCUS_EXPERIMENTAL: - return new LocusShardStrategy(readsDataSource,referenceDataSource,genomeLocParser,null); - case READS_EXPERIMENTAL: - return new ReadShardStrategy(genomeLocParser,readsDataSource,null); - default: - throw new ReviewedStingException("Strategy: " + strat + " isn't implemented for this type of shatter request"); - } - - } - - - /** - * get a new shatter strategy - * - * @param readsDataSource File pointer to BAM. - * @param referenceDataSource File pointer to reference. - * @param strat what's our strategy - SHATTER_STRATEGY type - * @param dic the seq dictionary - * @param startingSize the starting size - * @return a shard strategy capable of dividing input data into shards. - */ - static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, GenomeLocSortedSet lst) { - return ShardStrategyFactory.shatter(readsDataSource, referenceDataSource, strat, dic, startingSize, genomeLocParser, lst, -1l); - - } - - /** - * get a new shatter strategy - * - * @param readsDataSource The reads used to shatter this file. - * @param referenceDataSource The reference used to shatter this file. - * @param strat what's our strategy - SHATTER_STRATEGY type - * @param dic the seq dictionary - * @param startingSize the starting size - * @return A strategy for shattering this data. 
- */ - static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, GenomeLocSortedSet lst, long limitDataCount) { - switch (strat) { - case LOCUS_EXPERIMENTAL: - return new LocusShardStrategy(readsDataSource,referenceDataSource,genomeLocParser,lst); - case READS_EXPERIMENTAL: - return new ReadShardStrategy(genomeLocParser, readsDataSource,lst); - default: - throw new ReviewedStingException("Strategy: " + strat + " isn't implemented"); - } - - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java index 673df6dfa4..577db0965f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java @@ -30,10 +30,12 @@ import org.broadinstitute.sting.commandline.CommandLineProgram; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.datasources.reads.BAMScheduler; import org.broadinstitute.sting.gatk.datasources.reads.FilePointer; -import org.broadinstitute.sting.gatk.datasources.reads.LowMemoryIntervalSharder; +import org.broadinstitute.sting.gatk.datasources.reads.IntervalSharder; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; @@ -92,7 +94,7 @@ public int execute() throws IOException { // initialize reads List bamReaders = ListFileUtils.unpackBAMFileList(samFiles,parser); - SAMDataSource dataSource = new SAMDataSource(bamReaders,genomeLocParser); + SAMDataSource dataSource = new SAMDataSource(bamReaders,new ThreadAllocation(),null,genomeLocParser); // intervals GenomeLocSortedSet intervalSortedSet = null; @@ -106,7 +108,7 @@ public int execute() throws IOException { logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize")); - LowMemoryIntervalSharder sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet); + IntervalSharder sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet); while(sharder.hasNext()) { FilePointer filePointer = sharder.next(); @@ -135,7 +137,7 @@ public int execute() throws IOException { logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize")); out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n"); - sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet); + sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet); while(sharder.hasNext()) { FilePointer filePointer = sharder.next(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java index c8c79bb14c..4ecfe472df 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java @@ -29,6 +29,13 @@ import net.sf.picard.reference.FastaSequenceIndexBuilder; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.sam.CreateSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import org.broadinstitute.sting.gatk.datasources.reads.LocusShard; +import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; +import org.broadinstitute.sting.gatk.datasources.reads.Shard; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; @@ -36,13 +43,17 @@ import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException; import java.io.File; +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; /** * Loads reference data from fasta file * Looks for fai and dict files, and tries to create them if they don't exist */ public class ReferenceDataSource { - private IndexedFastaSequenceFile index; + private IndexedFastaSequenceFile reference; /** our log, which we want to capture anything from this class */ protected static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class); @@ -173,7 +184,7 @@ public ReferenceDataSource(File fastaFile) { logger.info("Treating existing index file as complete."); } - index = new CachingIndexedFastaSequenceFile(fastaFile); + reference = new CachingIndexedFastaSequenceFile(fastaFile); } catch (IllegalArgumentException e) { throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read reference sequence. The FASTA must have either a .fasta or .fa extension", e); @@ -192,6 +203,103 @@ public ReferenceDataSource(File fastaFile) { * @return IndexedFastaSequenceFile that was created from file */ public IndexedFastaSequenceFile getReference() { - return this.index; + return this.reference; } + + /** + * Creates an iterator for processing the entire reference. + * @param readsDataSource the reads datasource to embed in the locus shard. + * @param parser used to generate/regenerate intervals. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources. + * @param maxShardSize The maximum shard size which can be used to create this list. + * @return Creates a schedule for performing a traversal over the entire reference. 
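+     * (A worked example with hypothetical numbers: a 16,000bp contig tiled with maxShardSize = 10,000 +     * yields two locus shards, spanning 1-10,000 and 10,001-16,000.)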
+ */ + public Iterable createShardsOverEntireReference(final SAMDataSource readsDataSource, final GenomeLocParser parser, final int maxShardSize) { + List shards = new ArrayList(); + for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) { + for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) { + final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength()); + shards.add(new LocusShard(parser, + readsDataSource, + Collections.singletonList(parser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)), + null)); + } + } + return shards; + } + + + public Iterable createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int maxShardSize) { + List shards = new ArrayList(); + + for(GenomeLoc interval: intervals) { + while(interval.size() > maxShardSize) { + shards.add(new LocusShard(intervals.getGenomeLocParser(), + readsDataSource, + Collections.singletonList(intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)), + null)); + interval = intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop()); + } + shards.add(new LocusShard(intervals.getGenomeLocParser(), + readsDataSource, + Collections.singletonList(interval), + null)); + } + + return shards; + } + + + /** + * Creates shards for processing the given list of intervals, greedily merging small adjacent intervals. + * @param readsDataSource the reads datasource to embed in the locus shard. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources. + * @param intervals the list of intervals to use when processing the reference. + * @param targetShardSize the suggested (and maximum) shard size which can be used to create this list; we will merge intervals greedily so that we generate shards up to but not greater than the target size. + * @return A schedule for performing a traversal over the given intervals.
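+     * (A worked example with a hypothetical targetShardSize of 100: intervals [1,250] and [300,320] +     * come out as shards [1,100], [101,200], [201,250] and [300,320], while small neighbors such as +     * [1,40] and [50,80] would be merged into a single shard.)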
+ */ +/* + public Iterable createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int targetShardSize) { + final List shards = new ArrayList(); + final GenomeLocParser parser = intervals.getGenomeLocParser(); + LinkedList currentIntervals = new LinkedList(); + + for(GenomeLoc interval: intervals) { + // if the next interval is too big, we can safely shard currentInterval and then break down this one + if (interval.size() > targetShardSize) { + if (!currentIntervals.isEmpty()) + shards.add(createShardFromInterval(currentIntervals, readsDataSource, parser)); + while(interval.size() > targetShardSize) { + final GenomeLoc partialInterval = parser.createGenomeLoc(interval.getContig(), interval.getStart(), interval.getStart()+targetShardSize-1); + shards.add(createShardFromInterval(Collections.singletonList(partialInterval), readsDataSource, parser)); + interval = parser.createGenomeLoc(interval.getContig(), interval.getStart() + targetShardSize, interval.getStop()); + } + currentIntervals = new LinkedList(); + currentIntervals.add(interval); + } + // otherwise, we need to check whether we can merge this interval with currentInterval (and either shard currentInterval or merge accordingly) + else { + if (currentIntervals.isEmpty()) { + currentIntervals.add(interval); + } + else { + if (currentIntervals.getLast().compareContigs(interval) != 0 || interval.getStop() - currentIntervals.getLast().getStart() + 1 > targetShardSize) { + shards.add(createShardFromInterval(currentIntervals, readsDataSource, parser)); + currentIntervals = new LinkedList(); + } + currentIntervals.add(interval); + } + } + } + if (!currentIntervals.isEmpty()) + shards.add(createShardFromInterval(currentIntervals, readsDataSource, parser)); + return shards; + } + + private static Shard createShardFromInterval(final List intervals, final SAMDataSource readsDataSource, final GenomeLocParser parser) { + //logger.debug("Adding shard " + interval); + return new LocusShard(parser, + readsDataSource, + intervals, + null); + } +*/ } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java index 18679dd770..5b4be2fc63 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java @@ -26,7 +26,6 @@ import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.commandline.Tags; -import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; @@ -154,16 +153,6 @@ public boolean matchesNameAndRecordType(String name, Type type) { return (name.equals(fileDescriptor.getName()) && (type.getClass().isAssignableFrom(getType().getClass()))); } - /** - * Seek to the specified position and return an iterator through the data. - * @param shard Shard that points to the selected position. - * @return Iterator through the data. - */ - public LocationAwareSeekableRODIterator seek( Shard shard ) { - DataStreamSegment dataStreamSegment = shard.getGenomeLocs().size() != 0 ? 
new MappedStreamSegment(shard.getGenomeLocs().get(0)) : new EntireStream(); - return iteratorPool.iterator(dataStreamSegment); - } - /** * Seek to the specified position and return an iterator through the data. * diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 162baed001..39e1bdc726 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -5,7 +5,6 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker; @@ -16,6 +15,7 @@ import org.broadinstitute.sting.utils.threading.ThreadPoolMonitor; import java.util.Collection; +import java.util.Iterator; import java.util.LinkedList; import java.util.Queue; import java.util.concurrent.ExecutorService; @@ -42,7 +42,6 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar */ private ThreadLocalOutputTracker outputTracker = new ThreadLocalOutputTracker(); - private final Queue traverseTasks = new LinkedList(); private final Queue reduceTasks = new LinkedList(); /** @@ -50,6 +49,11 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar */ private Throwable error = null; + /** + * Queue of incoming shards. + */ + private Iterator traversalTasks; + /** * Keep a queue of shard traversals, and constantly monitor it to see what output * merge tasks remain. @@ -57,9 +61,6 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar */ private final Queue outputMergeTasks = new LinkedList(); - /** How many total tasks were in the queue at the start of run. */ - private int totalTraversals = 0; - /** How many shard traversals have run to date? */ private int totalCompletedTraversals = 0; @@ -88,18 +89,16 @@ protected HierarchicalMicroScheduler(GenomeAnalysisEngine engine, Walker walker, this.threadPool = Executors.newFixedThreadPool(nThreadsToUse); } - public Object execute( Walker walker, ShardStrategy shardStrategy ) { + public Object execute( Walker walker, Iterable shardStrategy ) { // Fast fail for walkers not supporting TreeReducible interface. if (!( walker instanceof TreeReducible )) throw new IllegalArgumentException("The GATK can currently run in parallel only with TreeReducible walkers"); + this.traversalTasks = shardStrategy.iterator(); + ReduceTree reduceTree = new ReduceTree(this); initializeWalker(walker); - for (Shard shard : shardStrategy) - traverseTasks.add(shard); - totalTraversals = traverseTasks.size(); - while (isShardTraversePending() || isTreeReducePending()) { // Check for errors during execution. if(hasTraversalErrorOccurred()) @@ -191,7 +190,7 @@ public OutputTracker getOutputTracker() { * @return true if a shard traversal is waiting; false otherwise. 
*/ protected boolean isShardTraversePending() { - return traverseTasks.size() > 0; + return traversalTasks.hasNext(); } /** @@ -284,10 +283,10 @@ protected void mergeExistingOutput( boolean wait ) { * @param reduceTree Tree of reduces to which to add this shard traverse. */ protected void queueNextShardTraverse( Walker walker, ReduceTree reduceTree ) { - if (traverseTasks.size() == 0) + if (!traversalTasks.hasNext()) throw new IllegalStateException("Cannot traverse; no pending traversals exist."); - Shard shard = traverseTasks.remove(); + Shard shard = traversalTasks.next(); // todo -- add ownership claim here @@ -399,16 +398,6 @@ synchronized void reportTreeReduceTime( long treeReduceTime ) { } - /** {@inheritDoc} */ - public int getTotalNumberOfShards() { - return totalTraversals; - } - - /** {@inheritDoc} */ - public int getRemainingNumberOfShards() { - return traverseTasks.size(); - } - /** {@inheritDoc} */ public int getNumberOfTasksInReduceQueue() { return reduceTasks.size(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java index 21a87963b9..530285db02 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java @@ -17,18 +17,6 @@ * microscheduler is behaving. */ public interface HierarchicalMicroSchedulerMBean extends MicroSchedulerMBean { - /** - * What is the total number of shards assigned to this microscheduler? - * @return Total number of shards to process. - */ - public int getTotalNumberOfShards(); - - /** - * How many shards are remaining for this microscheduler to process? - * @return Remaining number of shards to process. - */ - public int getRemainingNumberOfShards(); - /** * How many tree reduces are waiting in the tree reduce queue? * @return Total number of reduces waiting in the tree reduce queue? diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index deafcd0cc0..ff5e1064bd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -7,7 +7,6 @@ import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker; @@ -44,7 +43,7 @@ protected LinearMicroScheduler(GenomeAnalysisEngine engine, Walker walker, SAMDa * @param walker Computation to perform over dataset. * @param shardStrategy A strategy for sharding the data. 
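     *      * (With this change, any Iterable of shards can drive the traversal, for example the shard iterables      * produced by SAMDataSource.createShardIteratorOverIntervals, rather than a dedicated ShardStrategy      * implementation.)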
*/ - public Object execute(Walker walker, ShardStrategy shardStrategy) { + public Object execute(Walker walker, Iterable shardStrategy) { walker.initialize(); Accumulator accumulator = Accumulator.create(engine,walker); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index e731b9864d..d013db7e84 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -30,11 +30,11 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.iterators.NullSAMIterator; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.traversals.*; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -87,20 +87,20 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * @param reads the informations associated with the reads * @param reference the reference file * @param rods the rods to include in the traversal - * @param nThreadsToUse Number of threads to utilize. + * @param threadAllocation Number of threads to utilize. * * @return The best-fit microscheduler. */ - public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, int nThreadsToUse) { - if (walker instanceof TreeReducible && nThreadsToUse > 1) { + public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { + if (walker instanceof TreeReducible && threadAllocation.getNumCPUThreads() > 1) { if(walker.isReduceByInterval()) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); if(walker instanceof ReadWalker) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s is a read walker. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. 
Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); - logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",nThreadsToUse)); - return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, nThreadsToUse); + logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads())); + return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads()); } else { - if(nThreadsToUse > 1) + if(threadAllocation.getNumCPUThreads() > 1) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); return new LinearMicroScheduler(engine, walker, reads, reference, rods); } @@ -156,7 +156,7 @@ protected MicroScheduler(GenomeAnalysisEngine engine, Walker walker, SAMDataSour * * @return the return type of the walker */ - public abstract Object execute(Walker walker, ShardStrategy shardStrategy); + public abstract Object execute(Walker walker, Iterable shardStrategy); /** * Retrieves the object responsible for tracking and managing output. diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java index 11bbf9e4c5..37a6cfc36d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java @@ -50,19 +50,20 @@ public void initialize(GenomeAnalysisEngine engine) { public boolean filterOut(SAMRecord read) { // slowly changing the behavior to blow up first and filtering out if a parameter is explicitly provided - if (!checkMismatchingBasesAndQuals(read)) { - if (!filterMismatchingBaseAndQuals) - throw new UserException.MalformedBAM(read, "BAM file has a read with mismatching number of bases and base qualities. Offender: " + read.getReadName() +" [" + read.getReadLength() + " bases] [" +read.getBaseQualities().length +"] quals"); - else - return true; - } - return !checkInvalidAlignmentStart(read) || !checkInvalidAlignmentEnd(read) || !checkAlignmentDisagreesWithHeader(this.header,read) || + !checkHasReadGroup(read) || + !checkMismatchingBasesAndQuals(read, filterMismatchingBaseAndQuals) || !checkCigarDisagreesWithAlignment(read); } + private static boolean checkHasReadGroup(SAMRecord read) { + if ( read.getReadGroup() == null ) + throw new UserException.ReadMissingReadGroup(read); + return true; + } + /** * Check for the case in which the alignment start is inconsistent with the read unmapped flag. * @param read The read to validate. @@ -127,7 +128,15 @@ private static boolean checkCigarDisagreesWithAlignment(SAMRecord read) { * @param read the read to validate * @return true if they have the same number. False otherwise. 
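     *      * (Behavior after this change, per the body below: equal lengths pass; unequal lengths are filtered      * out when filterMismatchingBaseAndQuals is set, and otherwise raise a UserException.MalformedBAM      * identifying the offending read.)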
*/ - private static boolean checkMismatchingBasesAndQuals(SAMRecord read) { - return (read.getReadLength() == read.getBaseQualities().length); + private static boolean checkMismatchingBasesAndQuals(SAMRecord read, boolean filterMismatchingBaseAndQuals) { + boolean result; + if (read.getReadLength() == read.getBaseQualities().length) + result = true; + else if (filterMismatchingBaseAndQuals) + result = false; + else + throw new UserException.MalformedBAM(read, String.format("BAM file has a read with mismatching number of bases and base qualities. Offender: %s [%d bases] [%d quals]", read.getReadName(), read.getReadLength(), read.getBaseQualities().length)); + + return result; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/BufferingReadIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/BufferingReadIterator.java deleted file mode 100644 index 7eaf4be410..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/BufferingReadIterator.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.iterators; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.util.CloseableIterator; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.LinkedList; -import java.util.NoSuchElementException; -import java.util.Queue; - -/** - * Buffers access to a large stream of reads, replenishing the buffer only when the reads - * - * @author mhanna - * @version 0.1 - */ -public class BufferingReadIterator implements CloseableIterator { - private final CloseableIterator wrappedIterator; - private final Queue buffer; - private final int bufferSize; - - public BufferingReadIterator(final CloseableIterator readIterator, final int bufferSize) { - this.wrappedIterator = readIterator; - this.buffer = new LinkedList(); - this.bufferSize = bufferSize; - } - - public boolean hasNext() { - assureBufferFull(); - return !buffer.isEmpty(); - } - - public SAMRecord next() { - assureBufferFull(); - if(!hasNext()) throw new NoSuchElementException("No next element available"); - return buffer.remove(); - } - - public void close() { - wrappedIterator.close(); - } - - public void remove() { - throw new ReviewedStingException("Unable to remove from a BufferingReadIterator"); - } - - /** - * If the buffer is empty but there are more elements in the iterator, - */ - private void assureBufferFull() { - if(!buffer.isEmpty()) - return; - while(buffer.size() < bufferSize && wrappedIterator.hasNext()) - buffer.add(wrappedIterator.next()); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index ee3ea63ebe..75e787e05a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -39,7 +39,6 @@ import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.ReservoirDownsampler; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -432,7 +431,7 @@ private void lazyLoadNextAlignmentContext() { while(iterator.hasNext()) { SAMRecordState state = iterator.next(); if ( state.getCurrentCigarOperator() != CigarOperator.D && state.getCurrentCigarOperator() != CigarOperator.N ) { - if ( filterBaseInRead(state.getRead(), location.getStart()) ) { + if ( filterBaseInRead((GATKSAMRecord) state.getRead(), location.getStart()) ) { //discarded_bases++; //printStatus("Adaptor bases", discarded_adaptor_bases); continue; @@ -481,8 +480,8 @@ private boolean readIsPastCurrentPosition(SAMRecord read) { * @param pos * @return */ - private static boolean filterBaseInRead(SAMRecord rec, long pos) { - return ReadUtils.readPairBaseOverlapType(rec, pos) == ReadUtils.OverlapType.IN_ADAPTOR; + private static boolean filterBaseInRead(GATKSAMRecord rec, long pos) { + return ReadUtils.isBaseInsideAdaptor(rec, pos); } private void updateReadStates() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index 7bf518fd5e..09ae02bd91 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -6,10 +6,9 @@ import org.broad.tribble.dbsnp.OldDbSNPFeature; import org.broad.tribble.gelitext.GeliTextFeature; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.codecs.hapmap.RawHapMapFeature; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @@ -187,30 +186,23 @@ public VariantContext convert(String name, Object input, ReferenceContext ref) { } Map attributes = new HashMap(); - attributes.put(VariantContext.ID_KEY, dbsnp.getRsID()); int index = dbsnp.getStart() - ref.getWindow().getStart() - 1; if ( index < 0 ) return null; // we weren't given enough reference context to create the VariantContext Byte refBaseForIndel = new Byte(ref.getBases()[index]); - Map genotypes = null; - VariantContext vc = new VariantContext(name, dbsnp.getChr(), dbsnp.getStart() - (sawNullAllele ? 1 : 0), dbsnp.getEnd() - (refAllele.isNull() ? 1 : 0), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attributes, refBaseForIndel); - return vc; + final VariantContextBuilder builder = new VariantContextBuilder(); + builder.source(name).id(dbsnp.getRsID()); + builder.loc(dbsnp.getChr(), dbsnp.getStart() - (sawNullAllele ? 1 : 0), dbsnp.getEnd() - (refAllele.isNull() ? 1 : 0)); + builder.alleles(alleles); + builder.referenceBaseForIndel(refBaseForIndel); + return builder.make(); } else return null; // can't handle anything else } } - public static VCFHeader createVCFHeader(Set hInfo, VariantContext vc) { - HashSet names = new LinkedHashSet(); - for ( Genotype g : vc.getGenotypesSortedByName() ) { - names.add(g.getSampleName()); - } - - return new VCFHeader(hInfo == null ? 
new HashSet() : hInfo, names); - } - // -------------------------------------------------------------------------------------------------------------- // // GELI to VariantContext @@ -257,20 +249,15 @@ public VariantContext convert(String name, Object input, ReferenceContext ref) { else genotypeAlleles.add(refAllele); } - Map attributes = new HashMap(); + Map attributes = new HashMap(); Collection genotypes = new ArrayList(); - MutableGenotype call = new MutableGenotype(name, genotypeAlleles); - - // set the likelihoods, depth, and RMS mapping quality values - //call.putAttribute(CalledGenotype.POSTERIORS_ATTRIBUTE_KEY,geli.getLikelihoods()); - //call.putAttribute(GeliTextWriter.MAXIMUM_MAPPING_QUALITY_ATTRIBUTE_KEY,geli.getMaximumMappingQual()); - //call.putAttribute(GeliTextWriter.READ_COUNT_ATTRIBUTE_KEY,geli.getDepthOfCoverage()); + Genotype call = new Genotype(name, genotypeAlleles); // add the call to the genotype list, and then use this list to create a VariantContext genotypes.add(call); alleles.add(refAllele); - VariantContext vc = VariantContextUtils.toVC(name, ref.getGenomeLocParser().createGenomeLoc(geli.getChr(),geli.getStart()), alleles, genotypes, geli.getLODBestToReference(), null, attributes); - return vc; + GenomeLoc loc = ref.getGenomeLocParser().createGenomeLoc(geli.getChr(),geli.getStart()); + return new VariantContextBuilder(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles).genotypes(genotypes).log10PError(-1 * geli.getLODBestToReference()).attributes(attributes).make(); } else return null; // can't handle anything else } @@ -329,7 +316,7 @@ public VariantContext convert(String name, Object input, ReferenceContext ref) { String[] samples = hapmap.getSampleIDs(); String[] genotypeStrings = hapmap.getGenotypes(); - Map genotypes = new HashMap(samples.length); + GenotypesContext genotypes = GenotypesContext.create(samples.length); for ( int i = 0; i < samples.length; i++ ) { // ignore bad genotypes if ( genotypeStrings[i].contains("N") ) @@ -358,16 +345,13 @@ public VariantContext convert(String name, Object input, ReferenceContext ref) { } Genotype g = new Genotype(samples[i], myAlleles); - genotypes.put(samples[i], g); + genotypes.add(g); } - HashMap attrs = new HashMap(1); - attrs.put(VariantContext.ID_KEY, hapmap.getName()); - long end = hapmap.getEnd(); if ( deletionLength > 0 ) end += deletionLength; - VariantContext vc = new VariantContext(name, hapmap.getChr(), hapmap.getStart(), end, alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attrs, refBaseForIndel); + VariantContext vc = new VariantContextBuilder(name, hapmap.getChr(), hapmap.getStart(), end, alleles).id(hapmap.getName()).genotypes(genotypes).referenceBaseForIndel(refBaseForIndel).make(); return vc; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java index c99aea254d..fcd85fd1d4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java @@ -30,16 +30,12 @@ import org.broad.tribble.FeatureCodec; import org.broad.tribble.NameAwareCodec; import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; -import org.broadinstitute.sting.gatk.refdata.SelfScopingFeatureCodec; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.utils.GenomeLocParser; -import 
org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.help.GATKDocUtils; -import org.broadinstitute.sting.utils.help.HelpUtils; -import javax.mail.Header; import java.io.File; import java.util.*; @@ -159,10 +155,8 @@ public FeatureDescriptor getByName(String name) { public FeatureDescriptor getByFiletype(File file) { List canParse = new ArrayList(); for ( FeatureDescriptor descriptor : featureDescriptors ) - if ( descriptor.getCodec() instanceof SelfScopingFeatureCodec ) { - if ( ((SelfScopingFeatureCodec) descriptor.getCodec()).canDecode(file) ) { - canParse.add(descriptor); - } + if ( descriptor.getCodec().canDecode(file.getPath()) ) { + canParse.add(descriptor); } if ( canParse.size() == 0 ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java index 347e870c88..5a6490afe7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java @@ -1,28 +1,34 @@ package org.broadinstitute.sting.gatk.report; -import java.util.TreeMap; +import org.apache.commons.lang.math.NumberUtils; + +import java.util.*; /** * Holds values for a column in a GATK report table */ public class GATKReportColumn extends TreeMap { - private String columnName; - private Object defaultValue; - private boolean display; + final private String columnName; + final private Object defaultValue; + final private String format; + final private boolean display; /** * Construct the column object, specifying the column name, default value, and whether or not the column should be displayed * * @param columnName the name of the column * @param defaultValue the default value of the column - * @param display if true, the column will be displayed in the final output + * @param display if true, the column will be displayed in the final output + * @param format format string */ - public GATKReportColumn(String columnName, Object defaultValue, boolean display) { + public GATKReportColumn(String columnName, Object defaultValue, boolean display, String format) { this.columnName = columnName; this.defaultValue = defaultValue; this.display = display; + this.format = format == null ? null : (format.equals("") ? null : format); } + /** * Initialize an element in the column with a default value * @@ -55,7 +61,7 @@ private Object getWithoutSideEffects(Object primaryKey) { * @return the string value at the specified position in the column, or the default value if the element is not set */ public String getStringValue(Object primaryKey) { - return toString(getWithoutSideEffects(primaryKey)); + return formatValue(getWithoutSideEffects(primaryKey)); } /** @@ -70,22 +76,47 @@ public boolean isDisplayable() { /** * Get the display width for this column. This allows the entire column to be displayed with the appropriate, fixed width. 
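The formatValue method further down in these hunks resolves how a cell is rendered: an explicit per-column format string (the new addColumn parameter) takes precedence over the built-in floating-point defaults, which in turn fall back to plain toString(). A hedged sketch of that precedence; the Double branch is truncated in the hunk below, so its %.8f default is an assumption here:

final class ColumnFormatSketch {
    static String formatValue(final Object obj, final String format) {
        if (obj == null)           return "null";                          // matches isRightAlign's "null" token
        if (format != null)        return String.format(format, obj);      // explicit column format wins
        if (obj instanceof Float)  return String.format("%.8f", (Float) obj);
        if (obj instanceof Double) return String.format("%.8f", (Double) obj); // assumed default
        return obj.toString();
    }
}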
- * @return the width of this column + * @return the format string for this column */ - public int getColumnWidth() { + public GATKReportColumnFormat getColumnFormat() { int maxWidth = columnName.length(); + GATKReportColumnFormat.Alignment alignment = GATKReportColumnFormat.Alignment.RIGHT; for (Object obj : this.values()) { if (obj != null) { - int width = toString(obj).length(); + String formatted = formatValue(obj); + int width = formatted.length(); if (width > maxWidth) { maxWidth = width; } + + if (alignment == GATKReportColumnFormat.Alignment.RIGHT) { + if (!isRightAlign(formatted)) { + alignment = GATKReportColumnFormat.Alignment.LEFT; + } + } } } - return maxWidth; + return new GATKReportColumnFormat(maxWidth, alignment); + } + + private static final Collection RIGHT_ALIGN_STRINGS = Arrays.asList( + "null", + "NA", + String.valueOf(Double.POSITIVE_INFINITY), + String.valueOf(Double.NEGATIVE_INFINITY), + String.valueOf(Double.NaN)); + + /** + * Check if the value can be right aligned. Does not trim the values before checking if numeric since it assumes + * the spaces mean that the value is already padded. + * @param value to check + * @return true if the value is a right alignable + */ + protected static boolean isRightAlign(String value) { + return value == null || RIGHT_ALIGN_STRINGS.contains(value) || NumberUtils.isNumber(value); } /** @@ -93,10 +124,12 @@ public int getColumnWidth() { * @param obj The object to convert to a string * @return The string representation of the column */ - private static String toString(Object obj) { + private String formatValue(Object obj) { String value; if (obj == null) { value = "null"; + } else if ( format != null ) { + value = String.format(format, obj); } else if (obj instanceof Float) { value = String.format("%.8f", (Float) obj); } else if (obj instanceof Double) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/SelfScopingFeatureCodec.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumnFormat.java similarity index 55% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/SelfScopingFeatureCodec.java rename to public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumnFormat.java index de781b8391..6d19a83aa4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/SelfScopingFeatureCodec.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumnFormat.java @@ -22,27 +22,41 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.refdata; - -import java.io.File; +package org.broadinstitute.sting.gatk.report; /** - * An interface marking that a given Tribble codec can look at the file and determine whether the - * codec specifically parsing the contents of the file. + * Column width and left/right alignment. */ -public interface SelfScopingFeatureCodec { - /** - * This function returns true iff the File potentialInput can be parsed by this - * codec. - * - * The GATK assumes that there's never a situation where two SelfScopingFeaetureCodecs - * return true for the same file. If this occurs the GATK splits out an error. - * - * Note this function must never throw an error. All errors should be trapped - * and false returned. 
- * - * @param potentialInput the file to test for parsiability with this codec - * @return true if potentialInput can be parsed, false otherwise - */ - public boolean canDecode(final File potentialInput); +public class GATKReportColumnFormat { + public static enum Alignment { LEFT, RIGHT } + public int width; + public Alignment alignment; + + public GATKReportColumnFormat(int width, Alignment alignment) { + this.width = width; + this.alignment = alignment; + } + + public int getWidth() { + return width; + } + + public Alignment getAlignment() { + return alignment; + } + + public String getNameFormat() { + return "%-" + width + "s"; + } + + public String getValueFormat() { + switch (alignment) { + case LEFT: + return "%-" + width + "s"; + case RIGHT: + return "%" + width + "s"; + default: + throw new UnsupportedOperationException("Unknown alignment: " + alignment); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java index 2fd5ad7e32..b72b20e0b7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java @@ -250,13 +250,12 @@ private Object findPrimaryKey(Object[] columnValues) { * @param defaultValue the default value for the column */ public void addColumn(String columnName, Object defaultValue) { - if (!isValidName(columnName)) { - throw new ReviewedStingException("Attempted to set a GATKReportTable column name of '" + columnName + "'. GATKReportTable column names must be purely alphanumeric - no spaces or special characters are allowed."); - } - - addColumn(columnName, defaultValue, true); + addColumn(columnName, defaultValue, null); } + public void addColumn(String columnName, Object defaultValue, String format) { + addColumn(columnName, defaultValue, true, format); + } /** * Add a column to the report, specify the default column value, and specify whether the column should be displayed in the final output (useful when intermediate columns are necessary for later calculations, but are not required to be in the output file. * @@ -265,7 +264,14 @@ public void addColumn(String columnName, Object defaultValue) { * @param display if true - the column will be displayed; if false - the column will be hidden */ public void addColumn(String columnName, Object defaultValue, boolean display) { - columns.put(columnName, new GATKReportColumn(columnName, defaultValue, display)); + addColumn(columnName, defaultValue, display, null); + } + + public void addColumn(String columnName, Object defaultValue, boolean display, String format) { + if (!isValidName(columnName)) { + throw new ReviewedStingException("Attempted to set a GATKReportTable column name of '" + columnName + "'. 
GATKReportTable column names must be purely alphanumeric - no spaces or special characters are allowed."); + } + columns.put(columnName, new GATKReportColumn(columnName, defaultValue, display, format)); } /** @@ -602,12 +608,9 @@ public int getPrimaryKeyColumnWidth() { */ public void write(PrintStream out) { // Get the column widths for everything - HashMap columnWidths = new HashMap(); + HashMap columnFormats = new HashMap(); for (String columnName : columns.keySet()) { - int width = columns.get(columnName).getColumnWidth(); - String format = "%-" + String.valueOf(width) + "s"; - - columnWidths.put(columnName, format); + columnFormats.put(columnName, columns.get(columnName).getColumnFormat()); } String primaryKeyFormat = "%-" + getPrimaryKeyColumnWidth() + "s"; @@ -624,7 +627,7 @@ public void write(PrintStream out) { for (String columnName : columns.keySet()) { if (columns.get(columnName).isDisplayable()) { if (needsPadding) { out.printf(" "); } - out.printf(columnWidths.get(columnName), columnName); + out.printf(columnFormats.get(columnName).getNameFormat(), columnName); needsPadding = true; } @@ -644,7 +647,7 @@ public void write(PrintStream out) { if (columns.get(columnName).isDisplayable()) { if (needsPadding) { out.printf(" "); } String value = columns.get(columnName).getStringValue(primaryKey); - out.printf(columnWidths.get(columnName), value); + out.printf(columnFormats.get(columnName).getValueFormat(), value); needsPadding = true; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java new file mode 100644 index 0000000000..0c81af07bb --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.resourcemanagement; + +import org.broadinstitute.sting.utils.exceptions.UserException; + +/** + * Models how threads are distributed between various components of the GATK. + */ +public class ThreadAllocation { + /** + * The number of CPU threads to be used by the GATK. + */ + private final int numCPUThreads; + + /** + * Number of threads to devote exclusively to IO. Default is 0. 
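The constructor further down resolves the three user-facing knobs (total, CPU, IO) into a concrete split. A few worked examples of those rules; the resulting values follow directly from the constructor logic below:

import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;

final class ThreadAllocationExamples {
    public static void main(final String[] args) {
        new ThreadAllocation(8, null, null); // 8 CPU threads, 0 dedicated IO threads
        new ThreadAllocation(8, 6, null);    // 6 CPU threads, remainder (2) goes to IO
        new ThreadAllocation(8, null, 7);    // 7 IO threads, max(1, 8 - 7) = 1 CPU thread
        new ThreadAllocation(8, 6, 2);       // exact split: 6 + 2 == 8, accepted
        new ThreadAllocation(8, 6, 3);       // throws UserException: 6 + 3 != 8
    }
}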
+ */ + private final int numIOThreads; + + public int getNumCPUThreads() { + return numCPUThreads; + } + + public int getNumIOThreads() { + return numIOThreads; + } + + /** + * Construct the default thread allocation. + */ + public ThreadAllocation() { + this(1,null,null); + } + + /** + * Set up the thread allocation. Default allocation is 1 CPU thread, 0 IO threads. + * (0 IO threads means that no threads are devoted exclusively to IO; they're inline on the CPU thread). + * @param totalThreads Complete number of threads to allocate. + * @param numCPUThreads Total number of threads allocated to the traversal. + * @param numIOThreads Total number of threads allocated exclusively to IO. + */ + public ThreadAllocation(final int totalThreads, final Integer numCPUThreads, final Integer numIOThreads) { + // If no allocation information is present, allocate all threads to CPU + if(numCPUThreads == null && numIOThreads == null) { + this.numCPUThreads = totalThreads; + this.numIOThreads = 0; + } + // If only CPU threads are specified, allocate remainder to IO (minimum 0 dedicated IO threads). + else if(numIOThreads == null) { + if(numCPUThreads > totalThreads) + throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) is higher than the total threads",totalThreads,numCPUThreads)); + this.numCPUThreads = numCPUThreads; + this.numIOThreads = totalThreads - numCPUThreads; + } + // If only IO threads are specified, allocate remainder to CPU (minimum 1 dedicated CPU thread). + else if(numCPUThreads == null) { + if(numIOThreads > totalThreads) + throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of io threads (%d) is higher than the total threads",totalThreads,numIOThreads)); + this.numCPUThreads = Math.max(1,totalThreads-numIOThreads); + this.numIOThreads = numIOThreads; + } + else { + if(numCPUThreads + numIOThreads != totalThreads) + throw new UserException(String.format("Invalid thread allocation. 
User requested %d threads in total, but the count of cpu threads (%d) + the count of io threads (%d) does not match",totalThreads,numCPUThreads,numIOThreads)); + this.numCPUThreads = numCPUThreads; + this.numIOThreads = numIOThreads; + } + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java index b39fdd79d6..a14d999eac 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java @@ -3,6 +3,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; +import java.util.ArrayList; import java.util.HashMap; import java.util.Map; @@ -110,6 +111,17 @@ public Sample getFather() { return infoDB.getSample(paternalID); } + public ArrayList getParents(){ + ArrayList parents = new ArrayList(2); + Sample parent = getMother(); + if(parent != null) + parents.add(parent); + parent = getFather(); + if(parent != null) + parents.add(parent); + return parents; + } + /** * Get gender of the sample * @return property of key "gender" - must be of type Gender diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java index ee0873c6ed..a6f6b34816 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java @@ -142,20 +142,75 @@ public final Set getFamilyIDs() { * @return */ public final Map> getFamilies() { + return getFamilies(null); + } + + /** + * Returns a map from family ID -> set of family members for all samples in sampleIds with + * non-null family ids + * + * @param sampleIds - all samples to include. If null is passed then all samples are returned. + * @return a map from family ID to the set of family members in that family + */ + public final Map> getFamilies(Collection sampleIds) { final Map> families = new TreeMap>(); for ( final Sample sample : samples.values() ) { - final String famID = sample.getFamilyID(); - if ( famID != null ) { - if ( ! families.containsKey(famID) ) - families.put(famID, new TreeSet()); - families.get(famID).add(sample); + if(sampleIds == null || sampleIds.contains(sample.getID())){ + final String famID = sample.getFamilyID(); + if ( famID != null ) { + if ( ! families.containsKey(famID) ) + families.put(famID, new TreeSet()); + families.get(famID).add(sample); + } } } - return families; } + + /** + * Returns the set of all children that have both of their parents. + * Note that if a family is composed of more than 1 child, each child is + * returned. + * @return - all the children that have both of their parents + */ + public final Set getChildrenWithParents(){ + return getChildrenWithParents(false); + } + + /** + * Returns the set of all children that have both of their parents. + * Note that if triosOnly = false and a family is composed of more than 1 child, each child is + * returned.
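Together with Sample.getParents() above, the method implemented just below makes trio-based analyses straightforward. A hypothetical usage fragment, assuming it runs inside a walker where getSampleDB() is available:

// Hypothetical walker fragment: collect strict mother/father/child trios.
final SampleDB db = getSampleDB();
for (final Sample child : db.getChildrenWithParents(true)) {   // strict trios only
    final Sample mother = child.getMother();
    final Sample father = child.getFather();
    // both parents are guaranteed present for every child returned here
}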
+ * + * This method can be used wherever trios are needed + * + * @param triosOnly - if set to true, only strict trios are returned + * @return - all the children that have both of their parents + */ + public final Set getChildrenWithParents(boolean triosOnly) { + + Map> families = getFamilies(); + final Set childrenWithParents = new HashSet(); + Iterator sampleIterator; + + for ( Set familyMembers: families.values() ) { + if(triosOnly && familyMembers.size() != 3) + continue; + + sampleIterator = familyMembers.iterator(); + Sample sample; + while(sampleIterator.hasNext()){ + sample = sampleIterator.next(); + if(sample.getParents().size() == 2 && familyMembers.containsAll(sample.getParents())) + childrenWithParents.add(sample); + } + + } + return childrenWithParents; + } + /** * Return all samples with a given family ID * @param familyId diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index fd691735f7..4ef2555243 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -121,7 +121,7 @@ public String toString() { private static final int PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES = 1000; private int printProgressCheckCounter = 0; private long lastProgressPrintTime = -1; // When was the last time we printed progress log? - private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 120 * 1000; // in milliseconds + private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 30 * 1000; // in milliseconds private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds private final double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0; private final double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java index d1148cbd5c..74d8a81801 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java @@ -38,12 +38,11 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.clipreads.ClippingOp; -import org.broadinstitute.sting.utils.clipreads.ClippingRepresentation; -import org.broadinstitute.sting.utils.clipreads.ReadClipper; +import org.broadinstitute.sting.utils.clipping.ClippingOp; +import org.broadinstitute.sting.utils.clipping.ClippingRepresentation; +import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; import java.io.File; import java.io.PrintStream; @@ -299,9 +298,8 @@ private void addSeqToClip(String name, byte[] bases) { */ public ReadClipperWithData map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { if ( onlyDoRead == null || read.getReadName().equals(onlyDoRead) ) { - if ( clippingRepresentation == ClippingRepresentation.HARDCLIP_BASES ) { - read = ReadUtils.replaceSoftClipsWithMatches(read); - } + if ( clippingRepresentation == ClippingRepresentation.HARDCLIP_BASES ) + read = ReadClipper.revertSoftClippedBases(read); ReadClipperWithData clipper = new 
ReadClipperWithData(read, sequencesToClip); // diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/SplitSamFileWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/SplitSamFileWalker.java index 31ea637255..baaaf9e28f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/SplitSamFileWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/SplitSamFileWalker.java @@ -81,7 +81,7 @@ public Map reduceInit() { for ( SAMReadGroupRecord readGroup : this.getToolkit().getSAMFileHeader().getReadGroups()) { final String sample = readGroup.getSample(); if ( ! headers.containsKey(sample) ) { - SAMFileHeader header = ReadUtils.copySAMFileHeader(this.getToolkit().getSAMFileHeader()); + SAMFileHeader header = duplicateSAMFileHeader(this.getToolkit().getSAMFileHeader()); logger.debug(String.format("Creating BAM header for sample %s", sample)); ArrayList readGroups = new ArrayList(); header.setReadGroups(readGroups); @@ -121,4 +121,20 @@ public Map reduce(SAMRecord read, Map e : toCopy.getAttributes()) + copy.setAttribute(e.getKey(), e.getValue()); + + return copy; + } + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java index 792fef9c32..6264808f4b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java @@ -88,7 +88,7 @@ protected SAMSequenceDictionary getMasterSequenceDictionary() { return getToolkit().getMasterSequenceDictionary(); } - protected SampleDB getSampleDB() { + public SampleDB getSampleDB() { return getToolkit().getSampleDB(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java index e5f75f06d9..833107bd36 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java @@ -35,6 +35,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; @@ -54,22 +55,22 @@ public Map annotate(RefMetaDataTracker tracker, AnnotatorCompati if ( !vc.isBiallelic() ) return null; - final Map genotypes = vc.getGenotypes(); + final GenotypesContext genotypes = vc.getGenotypes(); if ( !vc.hasGenotypes() ) return null; double ratio = 0.0; double totalWeights = 0.0; - for ( Map.Entry genotype : genotypes.entrySet() ) { + for ( Genotype genotype : genotypes ) { // we care only about het calls - if ( !genotype.getValue().isHet() ) + if ( !genotype.isHet() ) continue; - AlignmentContext context = stratifiedContexts.get(genotype.getKey()); + AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context == null ) continue; - if ( vc.isSNP() ) { + if ( vc.isSNP() && context.hasBasePileup() ) { final String bases = new String(context.getBasePileup().getBases()); if ( bases.length() == 0 ) return null; @@ -84,8 +85,8 @@ public Map annotate(RefMetaDataTracker tracker, AnnotatorCompati continue; // weight the allele balance by genotype quality so that e.g. 
mis-called homs don't affect the ratio too much - ratio += genotype.getValue().getNegLog10PError() * ((double)refCount / (double)(refCount + altCount)); - totalWeights += genotype.getValue().getNegLog10PError(); + ratio += genotype.getLog10PError() * ((double)refCount / (double)(refCount + altCount)); + totalWeights += genotype.getLog10PError(); } else if ( vc.isIndel() && context.hasExtendedEventPileup() ) { final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); if ( indelPileup == null ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java index 820fd248ae..06e91bf264 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java @@ -51,6 +51,9 @@ private Double annotateSNP(AlignmentContext stratifiedContext, VariantContext vc if ( altAlleles.size() == 0 ) return null; + if ( !stratifiedContext.hasBasePileup() ) + return null; + final String bases = new String(stratifiedContext.getBasePileup().getBases()); if ( bases.length() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java index 46aa6d0f30..7612512590 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java @@ -59,6 +59,8 @@ public Map annotate(RefMetaDataTracker tracker, AnnotatorCompati int[] counts = new int[4]; for ( Map.Entry sample : stratifiedContexts.entrySet() ) { + if ( !sample.getValue().hasBasePileup() ) + continue; for (byte base : sample.getValue().getBasePileup().getBases() ) { int index = BaseUtils.simpleBaseToBaseIndex(base); if ( index != -1 ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java index 6cab6d95f4..312b505ec1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java @@ -14,7 +14,8 @@ /** - * The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele) + * The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele). + * Note that the base quality rank sum test can not be calculated for homozygous sites. 
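The homozygous-site caveat added to these rank-sum annotations follows from the test itself: it compares a ref-supporting group of observations against an alt-supporting group, and at a homozygous site one of the two groups is empty. A sketch of the standard u-based z-approximation; the GATK's exact tie handling and p-value computation may differ:

final class RankSumSketch {
    static double rankSumZ(final double[] refValues, final double[] altValues) {
        final int n1 = refValues.length, n2 = altValues.length;
        double u = 0;
        for (final double r : refValues)
            for (final double a : altValues)
                u += r > a ? 1.0 : (r == a ? 0.5 : 0.0);  // pairwise wins; ties count half
        final double mean = n1 * n2 / 2.0;
        final double sd = Math.sqrt(n1 * n2 * (n1 + n2 + 1) / 12.0);
        return (u - mean) / sd;  // z-score; phred-scaling of the p-value happens downstream
    }
}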
*/ public class BaseQualityRankSumTest extends RankSumTest { public List getKeyNames() { return Arrays.asList("BaseQRankSum"); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java index 5ed2a67613..0acd3e8418 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java @@ -59,10 +59,8 @@ public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnn public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( ! vc.hasGenotypes() ) return null; - - Map map = new HashMap(); - VariantContextUtils.calculateChromosomeCounts(vc, map, true); - return map; + + return VariantContextUtils.calculateChromosomeCounts(vc, new HashMap(), true); } public List getKeyNames() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index 8098de5b1b..ab38b69cda 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -49,5 +49,5 @@ public Map annotate(RefMetaDataTracker tracker, AnnotatorCompati public List getKeyNames() { return Arrays.asList(VCFConstants.DEPTH_KEY); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Integer, "Filtered Depth")); } + public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 2d1d1978c8..c4025a25c2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -46,7 +46,8 @@ /** * Phred-scaled p-value using Fisher's Exact Test to detect strand bias (the variation * being seen on only the forward or only the reverse strand) in the reads? More bias is - * indicative of false positive calls. + * indicative of false positive calls. Note that the fisher strand test may not be + * calculated for certain complex indel cases or for multi-allelic sites. */ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation { private static final String FS = "FS"; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index c142109faa..551f8e2cf4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -52,6 +52,8 @@ /** * Consistency of the site with two (and only two) segregating haplotypes. Higher scores * are indicative of regions with bad alignments, often leading to artifactual SNP and indel calls. 
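Stepping back to the FisherStrand hunk above: the test is run on a 2x2 table of (ref, alt) x (forward, reverse) read counts, and the new caveat reflects cases (complex indels, multi-allelic sites) where that table cannot be filled in unambiguously. A sketch of one term of the exact test, the hypergeometric probability of an observed table with fixed margins (illustrative names; the annotation phred-scales the summed tail probability):

final class StrandBiasSketch {
    //                    forward  reverse
    //   ref-supporting      a        b
    //   alt-supporting      c        d
    static double tableProbability(final int a, final int b, final int c, final int d) {
        return Math.exp(logChoose(a + b, a) + logChoose(c + d, c)
                        - logChoose(a + b + c + d, a + c));
    }

    static double logChoose(final int n, final int k) {
        double sum = 0;
        for (int i = 0; i < k; i++)
            sum += Math.log(n - i) - Math.log(i + 1);  // builds ln C(n, k) incrementally
        return sum;
    }
}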
+ * Note that the Haplotype Score is only calculated for sites with read coverage; also, for SNPs, the + * site must be bi-allelic. */ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnotation { private final static boolean DEBUG = false; @@ -87,9 +89,8 @@ else if (context.hasBasePileup()) final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage(); if (haplotypes != null) { - final Set> genotypes = vc.getGenotypes().entrySet(); - for ( final Map.Entry genotype : genotypes ) { - final AlignmentContext thisContext = stratifiedContexts.get(genotype.getKey()); + for ( final Genotype genotype : vc.getGenotypes()) { + final AlignmentContext thisContext = stratifiedContexts.get(genotype.getSampleName()); if ( thisContext != null ) { final ReadBackedPileup thisPileup; if (thisContext.hasExtendedEventPileup()) @@ -180,12 +181,12 @@ private List computeHaplotypes(final ReadBackedPileup pileup, final i final Haplotype haplotype1 = consensusHaplotypeQueue.poll(); List hlist = new ArrayList(); - hlist.add(new Haplotype(haplotype1.getBasesAsBytes(), 60)); + hlist.add(new Haplotype(haplotype1.getBases(), 60)); for (int k=1; k < haplotypesToCompute; k++) { Haplotype haplotype2 = consensusHaplotypeQueue.poll(); if(haplotype2 == null ) { haplotype2 = haplotype1; } // Sometimes only the reference haplotype can be found - hlist.add(new Haplotype(haplotype2.getBasesAsBytes(), 20)); + hlist.add(new Haplotype(haplotype2.getBases(), 20)); } return hlist; } else @@ -229,8 +230,8 @@ private Haplotype getHaplotypeFromRead(final PileupElement p, final int contextS } private Haplotype getConsensusHaplotype(final Haplotype haplotypeA, final Haplotype haplotypeB) { - final byte[] a = haplotypeA.getBasesAsBytes(); - final byte[] b = haplotypeB.getBasesAsBytes(); + final byte[] a = haplotypeA.getBases(); + final byte[] b = haplotypeB.getBases(); if (a.length != b.length) { throw new ReviewedStingException("Haplotypes a and b must be of same length"); @@ -313,7 +314,7 @@ private double scoreReadAgainstHaplotype(final PileupElement p, final int contex // actually be a miscall in a matching direction, which would happen at an e / 3 rate. If b != c, then // the chance that it is actually a mismatch is 1 - e, since any of the other 3 options would be a mismatch. // so the probability-weighted mismatch rate is sum_i ( matched ? e_i / 3 : 1 - e_i ) for i = 1 ...
n - final byte[] haplotypeBases = haplotype.getBasesAsBytes(); + final byte[] haplotypeBases = haplotype.getBases(); final SAMRecord read = p.getRead(); byte[] readBases = read.getReadBases(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java index f068ed895e..795cdbeb54 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java @@ -11,6 +11,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; @@ -26,20 +27,18 @@ public class HardyWeinberg extends InfoFieldAnnotation implements WorkInProgress private static final int MIN_SAMPLES = 10; private static final int MIN_GENOTYPE_QUALITY = 10; - private static final int MIN_NEG_LOG10_PERROR = MIN_GENOTYPE_QUALITY / 10; + private static final int MIN_LOG10_PERROR = -MIN_GENOTYPE_QUALITY / 10; public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - final Map genotypes = vc.getGenotypes(); + final GenotypesContext genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() < MIN_SAMPLES ) return null; int refCount = 0; int hetCount = 0; int homCount = 0; - for ( Map.Entry genotype : genotypes.entrySet() ) { - Genotype g = genotype.getValue(); - + for ( final Genotype g : genotypes ) { if ( g.isNoCall() ) continue; @@ -47,7 +46,7 @@ public Map annotate(RefMetaDataTracker tracker, AnnotatorCompati // Right now we just ignore genotypes that are not confident, but this throws off // our HW ratios. More analysis is needed to determine the right thing to do when // the genotyper cannot decide whether a given sample is het or hom var. - if ( g.getNegLog10PError() < MIN_NEG_LOG10_PERROR ) + if ( g.getLog10PError() > MIN_LOG10_PERROR ) continue; if ( g.isHomRef() ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java index a14007147d..640ab036b1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java @@ -10,6 +10,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; @@ -23,7 +24,8 @@ * * A continuous generalization of the Hardy-Weinberg test for disequilibrium that works * well with limited coverage per sample. See the 1000 Genomes Phase I release for - * more information. + * more information. Note that the Inbreeding Coefficient will not be calculated for files + * with fewer than a minimum (generally 10) number of samples.
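The minimum-sample caveat exists because the inbreeding coefficient is a population statistic: with few samples, the expected heterozygosity is too noisy to be meaningful. A simplified moment-estimator sketch from hard genotype calls; the GATK works from genotype likelihoods, so this is illustrative only and assumes a polymorphic site (p, q > 0):

final class InbreedingSketch {
    static double inbreedingCoefficient(final int homRef, final int het, final int homVar) {
        final int n = homRef + het + homVar;               // called samples
        final double p = (2.0 * homRef + het) / (2.0 * n); // ref allele frequency
        final double q = 1.0 - p;
        final double expectedHets = 2.0 * p * q * n;       // Hardy-Weinberg expectation
        return 1.0 - het / expectedHets;                   // F = 1 - observed/expected
    }
}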
*/ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation { @@ -31,7 +33,7 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - final Map genotypes = vc.getGenotypes(); + final GenotypesContext genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() < MIN_SAMPLES ) return null; @@ -50,8 +52,7 @@ public Map annotate(RefMetaDataTracker tracker, AnnotatorCompati double hetCount = 0.0; double homCount = 0.0; int N = 0; // number of samples that have likelihoods - for ( final Map.Entry genotypeMap : genotypes.entrySet() ) { - Genotype g = genotypeMap.getValue(); + for ( final Genotype g : genotypes ) { if ( g.isNoCall() || !g.hasLikelihoods() ) continue; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java index bd0d4e3fba..b9e6a5b2bc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java @@ -3,22 +3,18 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.samples.Sample; +import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.MendelianViolation; -import org.broadinstitute.sting.utils.codecs.vcf.VCFFilterHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** * Created by IntelliJ IDEA. 
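A mechanical change repeated across these annotation hunks is the migration from Map<String, Genotype> to the new GenotypesContext: iterate the genotypes directly and recover the sample key from the genotype itself. Schematically, as a fragment mirroring the surrounding hunks:

// Old shape (Map-based):
//   for (final Map.Entry<String, Genotype> e : vc.getGenotypes().entrySet())
//       final AlignmentContext ctx = stratifiedContexts.get(e.getKey());
// New shape (GenotypesContext is directly iterable):
for (final Genotype g : vc.getGenotypes()) {
    final AlignmentContext ctx = stratifiedContexts.get(g.getSampleName());
    if (ctx == null)
        continue;  // no pileup for this sample at the site
    // ... per-sample work ...
}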
@@ -30,23 +26,26 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation { private MendelianViolation mendelianViolation = null; + private String motherId; + private String fatherId; + private String childId; public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( mendelianViolation == null ) { - if ( walker instanceof VariantAnnotator && ((VariantAnnotator) walker).familyStr != null) { - mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).familyStr, ((VariantAnnotator)walker).minGenotypeQualityP ); + if (walker instanceof VariantAnnotator && checkAndSetSamples(((VariantAnnotator) walker).getSampleDB())) { + mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP ); } else { - throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid Family String file (-family) on the command line."); + throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid PED file (-ped) from the command line containing only 1 trio."); } } Map toRet = new HashMap(1); - boolean hasAppropriateGenotypes = vc.hasGenotype(mendelianViolation.getSampleChild()) && vc.getGenotype(mendelianViolation.getSampleChild()).hasLikelihoods() && - vc.hasGenotype(mendelianViolation.getSampleDad()) && vc.getGenotype(mendelianViolation.getSampleDad()).hasLikelihoods() && - vc.hasGenotype(mendelianViolation.getSampleMom()) && vc.getGenotype(mendelianViolation.getSampleMom()).hasLikelihoods(); + boolean hasAppropriateGenotypes = vc.hasGenotype(motherId) && vc.getGenotype(motherId).hasLikelihoods() && + vc.hasGenotype(fatherId) && vc.getGenotype(fatherId).hasLikelihoods() && + vc.hasGenotype(childId) && vc.getGenotype(childId).hasLikelihoods(); if ( hasAppropriateGenotypes ) - toRet.put("MVLR",mendelianViolation.violationLikelihoodRatio(vc)); + toRet.put("MVLR",mendelianViolation.violationLikelihoodRatio(vc,motherId,fatherId,childId)); return toRet; } @@ -55,4 +54,27 @@ public Map annotate(RefMetaDataTracker tracker, AnnotatorCompati public List getKeyNames() { return Arrays.asList("MVLR"); } public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MVLR", 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); } + + private boolean checkAndSetSamples(SampleDB db){ + Set families = db.getFamilyIDs(); + if(families.size() != 1) + return false; + + Set family = db.getFamily(families.iterator().next()); + if(family.size() != 3) + return false; + + Iterator sampleIter = family.iterator(); + while(sampleIter.hasNext()){ + Sample sample = sampleIter.next(); + if(sample.getParents().size()==2){ + motherId = sample.getMaternalID(); + fatherId = sample.getPaternalID(); + childId = sample.getID(); + return true; + } + } + return false; + } + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index 157c615d70..9857c339f3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -16,6 +16,7 @@ /** * The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank
Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele) + * Note that the mapping quality rank sum test can not be calculated for homozygous sites. */ public class MappingQualityRankSumTest extends RankSumTest { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index ffc8529034..6638fc7a8e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -9,6 +9,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; @@ -19,7 +20,8 @@ /** * Variant confidence (given as (AB+BB)/AA from the PLs) / unfiltered depth. * - * Low scores are indicative of false positive calls and artifacts. + * Low scores are indicative of false positive calls and artifacts. Note that QualByDepth requires sequencing + * reads associated with the samples with polymorphic genotypes. */ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation { @@ -27,19 +29,19 @@ public Map annotate(RefMetaDataTracker tracker, AnnotatorCompati if ( stratifiedContexts.size() == 0 ) return null; - final Map genotypes = vc.getGenotypes(); + final GenotypesContext genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() == 0 ) return null; int depth = 0; - for ( Map.Entry genotype : genotypes.entrySet() ) { + for ( final Genotype genotype : genotypes ) { // we care only about variant calls with likelihoods - if ( genotype.getValue().isHomRef() ) + if ( !genotype.isHet() && !genotype.isHomVar() ) continue; - AlignmentContext context = stratifiedContexts.get(genotype.getKey()); + AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context == null ) continue; @@ -49,7 +51,7 @@ public Map annotate(RefMetaDataTracker tracker, AnnotatorCompati if ( depth == 0 ) return null; - double QD = 10.0 * vc.getNegLog10PError() / (double)depth; + double QD = -10.0 * vc.getLog10PError() / (double)depth; Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%.2f", QD)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index 93e0932487..c5a2df1fd5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -13,6 +13,7 @@ import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; @@ -32,7 +33,7 @@ public Map annotate(RefMetaDataTracker tracker, AnnotatorCompati if ( stratifiedContexts.size() == 0 ) return null; - final Map genotypes = vc.getGenotypes(); + final GenotypesContext genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() 
== 0 ) return null; @@ -42,8 +43,8 @@ public Map annotate(RefMetaDataTracker tracker, AnnotatorCompati if (vc.isSNP() && vc.isBiallelic()) { // todo - no current support for multiallelic snps - for ( final Map.Entry genotype : genotypes.entrySet() ) { - final AlignmentContext context = stratifiedContexts.get(genotype.getKey()); + for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { + final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context == null ) { continue; } @@ -52,8 +53,8 @@ public Map annotate(RefMetaDataTracker tracker, AnnotatorCompati } else if (vc.isIndel() || vc.isMixed()) { - for ( final Map.Entry genotype : genotypes.entrySet() ) { - final AlignmentContext context = stratifiedContexts.get(genotype.getKey()); + for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { + final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context == null ) { continue; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index 27a9306d45..d762af4284 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -20,6 +20,7 @@ /** * The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele (if the alternate allele is only seen near the ends of reads, this is indicative of error). + * Note that the read position rank sum test cannot be calculated for homozygous sites. */ public class ReadPosRankSumTest extends RankSumTest { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java index ee08cfa3b8..cbf536e4f5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java @@ -47,11 +47,11 @@ public class SampleList extends InfoFieldAnnotation { public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( vc.isMonomorphic() || !vc.hasGenotypes() ) + if ( vc.isMonomorphicInSamples() || !vc.hasGenotypes() ) return null; StringBuffer samples = new StringBuffer(); - for ( Genotype genotype : vc.getGenotypesSortedByName() ) { + for ( Genotype genotype : vc.getGenotypesOrderedByName() ) { if ( genotype.isCalled() && !genotype.isHomRef() ){ if ( samples.length() > 0 ) samples.append(","); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java index 85977bf8e9..5d215603a0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java @@ -56,7 +56,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio // We refuse to parse SnpEff output files generated by unsupported versions, or // lacking a SnpEff version number in the VCF header: - public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.2" }; + public static final String[] 
SUPPORTED_SNPEFF_VERSIONS = { "2.0.4" }; public static final String SNPEFF_VCF_HEADER_VERSION_LINE_KEY = "SnpEffVersion"; public static final String SNPEFF_VCF_HEADER_COMMAND_LINE_KEY = "SnpEffCmd"; @@ -77,13 +77,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio public enum InfoFieldKey { EFFECT_KEY ("SNPEFF_EFFECT", -1), IMPACT_KEY ("SNPEFF_IMPACT", 0), - CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 1), - AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 2), - GENE_NAME_KEY ("SNPEFF_GENE_NAME", 3), - GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 4), - TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 6), - EXON_ID_KEY ("SNPEFF_EXON_ID", 7), - FUNCTIONAL_CLASS_KEY ("SNPEFF_FUNCTIONAL_CLASS", -1); + FUNCTIONAL_CLASS_KEY ("SNPEFF_FUNCTIONAL_CLASS", 1), + CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 2), + AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 3), + GENE_NAME_KEY ("SNPEFF_GENE_NAME", 4), + GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 5), + TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 7), + EXON_ID_KEY ("SNPEFF_EXON_ID", 8); // Actual text of the key private final String keyName; @@ -110,70 +110,53 @@ public int getFieldIndex() { // are validated against this list. public enum EffectType { // High-impact effects: - FRAME_SHIFT (EffectFunctionalClass.NONE, false), - STOP_GAINED (EffectFunctionalClass.NONSENSE, false), - START_LOST (EffectFunctionalClass.NONE, false), - SPLICE_SITE_ACCEPTOR (EffectFunctionalClass.NONE, false), - SPLICE_SITE_DONOR (EffectFunctionalClass.NONE, false), - EXON_DELETED (EffectFunctionalClass.NONE, false), - STOP_LOST (EffectFunctionalClass.NONE, false), + SPLICE_SITE_ACCEPTOR, + SPLICE_SITE_DONOR, + START_LOST, + EXON_DELETED, + FRAME_SHIFT, + STOP_GAINED, + STOP_LOST, // Moderate-impact effects: - NON_SYNONYMOUS_CODING (EffectFunctionalClass.MISSENSE, false), - CODON_CHANGE (EffectFunctionalClass.NONE, false), - CODON_INSERTION (EffectFunctionalClass.NONE, false), - CODON_CHANGE_PLUS_CODON_INSERTION (EffectFunctionalClass.NONE, false), - CODON_DELETION (EffectFunctionalClass.NONE, false), - CODON_CHANGE_PLUS_CODON_DELETION (EffectFunctionalClass.NONE, false), - UTR_5_DELETED (EffectFunctionalClass.NONE, false), - UTR_3_DELETED (EffectFunctionalClass.NONE, false), + NON_SYNONYMOUS_CODING, + CODON_CHANGE, + CODON_INSERTION, + CODON_CHANGE_PLUS_CODON_INSERTION, + CODON_DELETION, + CODON_CHANGE_PLUS_CODON_DELETION, + UTR_5_DELETED, + UTR_3_DELETED, // Low-impact effects: - SYNONYMOUS_CODING (EffectFunctionalClass.SILENT, false), - SYNONYMOUS_START (EffectFunctionalClass.SILENT, false), - NON_SYNONYMOUS_START (EffectFunctionalClass.SILENT, false), - SYNONYMOUS_STOP (EffectFunctionalClass.SILENT, false), - NON_SYNONYMOUS_STOP (EffectFunctionalClass.SILENT, false), - START_GAINED (EffectFunctionalClass.NONE, false), + SYNONYMOUS_START, + NON_SYNONYMOUS_START, + START_GAINED, + SYNONYMOUS_CODING, + SYNONYMOUS_STOP, + NON_SYNONYMOUS_STOP, // Modifiers: - NONE (EffectFunctionalClass.NONE, true), - CHROMOSOME (EffectFunctionalClass.NONE, true), - INTERGENIC (EffectFunctionalClass.NONE, true), - UPSTREAM (EffectFunctionalClass.NONE, true), - UTR_5_PRIME (EffectFunctionalClass.NONE, true), - CDS (EffectFunctionalClass.NONE, true), - GENE (EffectFunctionalClass.NONE, true), - TRANSCRIPT (EffectFunctionalClass.NONE, true), - EXON (EffectFunctionalClass.NONE, true), - INTRON (EffectFunctionalClass.NONE, true), - UTR_3_PRIME (EffectFunctionalClass.NONE, true), - DOWNSTREAM (EffectFunctionalClass.NONE, true), - INTRON_CONSERVED (EffectFunctionalClass.NONE, 
true), - INTERGENIC_CONSERVED (EffectFunctionalClass.NONE, true), - REGULATION (EffectFunctionalClass.NONE, true), - CUSTOM (EffectFunctionalClass.NONE, true), - WITHIN_NON_CODING_GENE (EffectFunctionalClass.NONE, true); - - private final EffectFunctionalClass functionalClass; - private final boolean isModifier; - - EffectType ( EffectFunctionalClass functionalClass, boolean isModifier ) { - this.functionalClass = functionalClass; - this.isModifier = isModifier; - } - - public EffectFunctionalClass getFunctionalClass() { - return functionalClass; - } - - public boolean isModifier() { - return isModifier; - } + NONE, + CHROMOSOME, + CUSTOM, + CDS, + GENE, + TRANSCRIPT, + EXON, + INTRON_CONSERVED, + UTR_5_PRIME, + UTR_3_PRIME, + DOWNSTREAM, + INTRAGENIC, + INTERGENIC, + INTERGENIC_CONSERVED, + UPSTREAM, + REGULATION, + INTRON } - // SnpEff labels each effect as either LOW, MODERATE, or HIGH impact. We take the additional step of - // classifying some of the LOW impact effects as MODIFIERs. + // SnpEff labels each effect as either LOW, MODERATE, or HIGH impact, or as a MODIFIER. public enum EffectImpact { MODIFIER (0), LOW (1), @@ -202,7 +185,7 @@ public enum EffectCoding { UNKNOWN } - // We assign a functional class to each SnpEff effect. + // SnpEff assigns a functional class to each effect. public enum EffectFunctionalClass { NONE (0), SILENT (1), @@ -221,6 +204,11 @@ public boolean isHigherPriorityThan ( EffectFunctionalClass other ) { } public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit, Set headerLines ) { + throw new UserException("SnpEff support is currently disabled in the GATK until SnpEff 2.0.4 is officially released " + + "due to a serious issue with SnpEff versions prior to 2.0.4. Please see this page for more details: " + + "http://www.broadinstitute.org/gsa/wiki/index.php/Adding_Genomic_Annotations_Using_SnpEff_and_VariantAnnotator"); + + /* // Make sure that we actually have a valid SnpEff rod binding (just in case the user specified -A SnpEff // without providing a SnpEff rod via --snpEffFile): validateRodBinding(walker.getSnpEffRodBinding()); @@ -240,6 +228,7 @@ public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine // mistaken in the future for a SnpEff output file: headerLines.add(new VCFHeaderLine(OUTPUT_VCF_HEADER_VERSION_LINE_KEY, snpEffVersionLine.getValue())); headerLines.add(new VCFHeaderLine(OUTPUT_VCF_HEADER_COMMAND_LINE_KEY, snpEffCommandLine.getValue())); + */ } public Map annotate ( RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc ) { @@ -379,13 +368,13 @@ private SnpEffEffect getMostSignificantEffect ( List effects ) { public List getKeyNames() { return Arrays.asList( InfoFieldKey.EFFECT_KEY.getKeyName(), InfoFieldKey.IMPACT_KEY.getKeyName(), + InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), InfoFieldKey.GENE_NAME_KEY.getKeyName(), InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), - InfoFieldKey.EXON_ID_KEY.getKeyName(), - InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName() + InfoFieldKey.EXON_ID_KEY.getKeyName() ); } @@ -393,13 +382,13 @@ public List getDescriptions() { return Arrays.asList( new VCFInfoHeaderLine(InfoFieldKey.EFFECT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"), 
new VCFInfoHeaderLine(InfoFieldKey.IMPACT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())), + new VCFInfoHeaderLine(InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Functional class of the highest-impact effect resulting from the current variant: " + Arrays.toString(EffectFunctionalClass.values())), new VCFInfoHeaderLine(InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant (in HGVS style)"), new VCFInfoHeaderLine(InfoFieldKey.GENE_NAME_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"), new VCFInfoHeaderLine(InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"), new VCFInfoHeaderLine(InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Functional class of the highest-impact effect resulting from the current variant: " + Arrays.toString(EffectFunctionalClass.values())) + new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant") ); } @@ -409,6 +398,7 @@ public List getDescriptions() { protected static class SnpEffEffect { private EffectType effect; private EffectImpact impact; + private EffectFunctionalClass functionalClass; private String codonChange; private String aminoAcidChange; private String geneName; @@ -420,16 +410,21 @@ protected static class SnpEffEffect { private String parseError = null; private boolean isWellFormed = true; - private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 8; - private static final int NUMBER_OF_METADATA_FIELDS_UPON_WARNING = 9; - private static final int NUMBER_OF_METADATA_FIELDS_UPON_ERROR = 10; + private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 9; + private static final int NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR = 10; + private static final int NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR = 11; - // Note that contrary to the description for the EFF field layout that SnpEff adds to the VCF header, - // errors come after warnings, not vice versa: - private static final int SNPEFF_WARNING_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_WARNING - 1; - private static final int SNPEFF_ERROR_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_ERROR - 1; + // If there is either a warning OR an error, it will be in the last field. If there is both + // a warning AND an error, the warning will be in the second-to-last field, and the error will + // be in the last field. 
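To make the field layout described in the comment above concrete, here is a small self-contained sketch of how a single SnpEff 2.0.4-style EFF entry breaks down under the index scheme this parser assumes; the effect, gene, transcript, and exon values below are invented purely for illustration and are not taken from the patch:

public class SnpEffFieldLayoutDemo {
    public static void main(String[] args) {
        // Hypothetical EFF entry: an effect name followed by 9 pipe-separated metadata fields
        // (indices 0-8): 0=impact, 1=functional class, 2=codon change, 3=amino acid change,
        // 4=gene name, 5=gene biotype, 6=coding flag, 7=transcript ID, 8=exon ID.
        String eff = "NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|aCg/aTg|T143M|MYGENE|protein_coding|CODING|TR0001|exon_3)";
        String effectName = eff.substring(0, eff.indexOf('('));
        // split with limit -1 so trailing empty fields (e.g. a blank coding flag) are preserved
        String[] metadata = eff.substring(eff.indexOf('(') + 1, eff.length() - 1).split("\\|", -1);
        // metadata.length is 9 normally, 10 with a single warning OR error appended at the end,
        // and 11 when both are present (warning second-to-last, error last).
        System.out.println(effectName + ": " + metadata.length + " fields, impact=" + metadata[0]
                + ", functional class=" + metadata[1]);
    }
}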
+ private static final int SNPEFF_WARNING_OR_ERROR_FIELD_UPON_SINGLE_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR - 1; + private static final int SNPEFF_WARNING_FIELD_UPON_BOTH_WARNING_AND_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR - 2; + private static final int SNPEFF_ERROR_FIELD_UPON_BOTH_WARNING_AND_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR - 1; - private static final int SNPEFF_CODING_FIELD_INDEX = 5; + // Position of the field indicating whether the effect is coding or non-coding. This field is used + // in selecting the most significant effect, but is not included in the annotations we return + // since it can be deduced from the SNPEFF_GENE_BIOTYPE field. + private static final int SNPEFF_CODING_FIELD_INDEX = 6; public SnpEffEffect ( String effectName, String[] effectMetadata ) { parseEffectName(effectName); @@ -447,11 +442,14 @@ private void parseEffectName ( String effectName ) { private void parseEffectMetadata ( String[] effectMetadata ) { if ( effectMetadata.length != EXPECTED_NUMBER_OF_METADATA_FIELDS ) { - if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_WARNING ) { - parseError(String.format("SnpEff issued the following warning: %s", effectMetadata[SNPEFF_WARNING_FIELD_INDEX])); + if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR ) { + parseError(String.format("SnpEff issued the following warning or error: \"%s\"", + effectMetadata[SNPEFF_WARNING_OR_ERROR_FIELD_UPON_SINGLE_ERROR])); } - else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_ERROR ) { - parseError(String.format("SnpEff issued the following error: %s", effectMetadata[SNPEFF_ERROR_FIELD_INDEX])); + else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR ) { + parseError(String.format("SnpEff issued the following warning: \"%s\", and the following error: \"%s\"", + effectMetadata[SNPEFF_WARNING_FIELD_UPON_BOTH_WARNING_AND_ERROR], + effectMetadata[SNPEFF_ERROR_FIELD_UPON_BOTH_WARNING_AND_ERROR])); } else { parseError(String.format("Wrong number of effect metadata fields. 
Expected %d but found %d", @@ -461,23 +459,33 @@ else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_ERROR ) { return; } - if ( effect != null && effect.isModifier() ) { - impact = EffectImpact.MODIFIER; + // The impact field will never be empty, and should always contain one of the enumerated values: + try { + impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]); } - else { + catch ( IllegalArgumentException e ) { + parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()])); + } + + // The functional class field will be empty when the effect has no functional class associated with it: + if ( effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()].trim().length() > 0 ) { try { - impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]); + functionalClass = EffectFunctionalClass.valueOf(effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()]); } catch ( IllegalArgumentException e ) { - parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()])); + parseError(String.format("Unrecognized value for effect functional class: %s", effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()])); } } + else { + functionalClass = EffectFunctionalClass.NONE; + } codonChange = effectMetadata[InfoFieldKey.CODON_CHANGE_KEY.getFieldIndex()]; aminoAcidChange = effectMetadata[InfoFieldKey.AMINO_ACID_CHANGE_KEY.getFieldIndex()]; geneName = effectMetadata[InfoFieldKey.GENE_NAME_KEY.getFieldIndex()]; geneBiotype = effectMetadata[InfoFieldKey.GENE_BIOTYPE_KEY.getFieldIndex()]; + // The coding field will be empty when SnpEff has no coding info for the effect: if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) { try { coding = EffectCoding.valueOf(effectMetadata[SNPEFF_CODING_FIELD_INDEX]); @@ -534,7 +542,7 @@ else if ( ! 
isCoding() && other.isCoding() ) { return true; } else if ( impact.isSameImpactAs(other.impact) ) { - return effect.getFunctionalClass().isHigherPriorityThan(other.effect.getFunctionalClass()); + return functionalClass.isHigherPriorityThan(other.functionalClass); } return false; @@ -545,13 +553,13 @@ public Map getAnnotations() { addAnnotation(annotations, InfoFieldKey.EFFECT_KEY.getKeyName(), effect.toString()); addAnnotation(annotations, InfoFieldKey.IMPACT_KEY.getKeyName(), impact.toString()); + addAnnotation(annotations, InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), functionalClass.toString()); addAnnotation(annotations, InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), codonChange); addAnnotation(annotations, InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), aminoAcidChange); addAnnotation(annotations, InfoFieldKey.GENE_NAME_KEY.getKeyName(), geneName); addAnnotation(annotations, InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), geneBiotype); addAnnotation(annotations, InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), transcriptID); addAnnotation(annotations, InfoFieldKey.EXON_ID_KEY.getKeyName(), exonID); - addAnnotation(annotations, InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), effect.getFunctionalClass().toString()); return annotations; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java new file mode 100644 index 0000000000..ecdde1e4fe --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java @@ -0,0 +1,90 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.samples.Sample; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.MendelianViolation; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; +import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. 
+ * User: rpoplin + * Date: 11/14/11 + */ + +public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation { + + private Set trios = null; + private final static int REF = 0; + private final static int HET = 1; + private final static int HOM = 2; + + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + if ( trios == null ) { + if ( walker instanceof VariantAnnotator ) { + trios = ((VariantAnnotator) walker).getSampleDB().getChildrenWithParents(); + } else { + throw new UserException("Transmission disequilibrium test annotation can only be used from the Variant Annotator and requires that a valid ped file be passed in."); + } + } + + final Map toRet = new HashMap(1); + final HashSet triosToTest = new HashSet(); + + for( final Sample child : trios) { + final boolean hasAppropriateGenotypes = vc.hasGenotype(child.getID()) && vc.getGenotype(child.getID()).hasLikelihoods() && + vc.hasGenotype(child.getPaternalID()) && vc.getGenotype(child.getPaternalID()).hasLikelihoods() && + vc.hasGenotype(child.getMaternalID()) && vc.getGenotype(child.getMaternalID()).hasLikelihoods(); + if ( hasAppropriateGenotypes ) { + triosToTest.add(child); + } + } + + toRet.put("TDT", calculateTDT( vc, triosToTest )); + + return toRet; + } + + // return the descriptions used for the VCF INFO meta field + public List getKeyNames() { return Arrays.asList("TDT"); } + + public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("TDT", 1, VCFHeaderLineType.Float, "Test statistic from the Wittkowski transmission disequilibrium test.")); } + + // Following derivation in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT + private double calculateTDT( final VariantContext vc, final Set triosToTest ) { + + final double nABGivenABandBB = calculateNChildren(vc, triosToTest, HET, HET, HOM) + calculateNChildren(vc, triosToTest, HET, HOM, HET); + final double nBBGivenABandBB = calculateNChildren(vc, triosToTest, HOM, HET, HOM) + calculateNChildren(vc, triosToTest, HOM, HOM, HET); + final double nAAGivenABandAB = calculateNChildren(vc, triosToTest, REF, HET, HET); + final double nBBGivenABandAB = calculateNChildren(vc, triosToTest, HOM, HET, HET); + final double nAAGivenAAandAB = calculateNChildren(vc, triosToTest, REF, REF, HET) + calculateNChildren(vc, triosToTest, REF, HET, REF); + final double nABGivenAAandAB = calculateNChildren(vc, triosToTest, HET, REF, HET) + calculateNChildren(vc, triosToTest, HET, HET, REF); + + final double numer = (nABGivenABandBB - nBBGivenABandBB) + 2.0 * (nAAGivenABandAB - nBBGivenABandAB) + (nAAGivenAAandAB - nABGivenAAandAB); + final double denom = (nABGivenABandBB + nBBGivenABandBB) + 4.0 * (nAAGivenABandAB + nBBGivenABandAB) + (nAAGivenAAandAB + nABGivenAAandAB); + return (numer * numer) / denom; + } + + private double calculateNChildren( final VariantContext vc, final Set triosToTest, final int childIdx, final int parent1Idx, final int parent2Idx ) { + final double likelihoodVector[] = new double[triosToTest.size()]; + int iii = 0; + for( final Sample child : triosToTest ) { + final double[] momGL = vc.getGenotype(child.getMaternalID()).getLikelihoods().getAsVector(); + final double[] dadGL = vc.getGenotype(child.getPaternalID()).getLikelihoods().getAsVector(); + final double[] childGL = vc.getGenotype(child.getID()).getLikelihoods().getAsVector(); + likelihoodVector[iii++] = momGL[parent1Idx] + 
dadGL[parent2Idx] + childGL[childIdx]; } return MathUtils.sumLog10(likelihoodVector); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index c9937f3d65..69560c7cb1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -32,11 +32,9 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationType; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.classloader.PluginManager; @@ -71,8 +69,9 @@ * -T VariantAnnotator \ * -I input.bam \ * -o output.vcf \ - * -A DepthOfCoverage + * -A DepthOfCoverage \ * --variant input.vcf \ + * -L input.vcf \ * --dbsnp dbsnp.vcf * * @@ -164,35 +163,32 @@ public class VariantAnnotator extends RodWalker implements Ann @Argument(fullName="list", shortName="ls", doc="List the available annotations and exit") protected Boolean LIST = false; - @Hidden - @Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false) - protected String ASSUME_SINGLE_SAMPLE = null; - @Hidden @Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false) protected boolean indelsOnly = false; - @Argument(fullName="family_string",shortName="family",required=false,doc="A family string of the form mom+dad=child for use with the mendelian violation ratio annotation") - public String familyStr = null; - @Argument(fullName="MendelViolationGenotypeQualityThreshold",shortName="mvq",required=false,doc="The genotype quality threshold in order to annotate mendelian violation ratio") public double minGenotypeQualityP = 0.0; + @Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided, only comp tracks that exactly match both reference and alternate alleles will be counted as concordant", required=false) + private boolean requireStrictAlleleMatch = false; + private VariantAnnotatorEngine engine; private Collection indelBufferContext; private void listAnnotationsAndExit() { + System.out.println("\nStandard annotations in the list below are marked with a '*'."); List> infoAnnotationClasses = new PluginManager(InfoFieldAnnotation.class).getPlugins(); System.out.println("\nAvailable annotations for the VCF INFO field:"); for (int i = 0; i < infoAnnotationClasses.size(); i++) - System.out.println("\t" + infoAnnotationClasses.get(i).getSimpleName()); + System.out.println("\t" + 
(StandardAnnotation.class.isAssignableFrom(infoAnnotationClasses.get(i)) ? "*" : "") + infoAnnotationClasses.get(i).getSimpleName()); System.out.println(); List> genotypeAnnotationClasses = new PluginManager(GenotypeAnnotation.class).getPlugins(); System.out.println("\nAvailable annotations for the VCF FORMAT field:"); for (int i = 0; i < genotypeAnnotationClasses.size(); i++) - System.out.println("\t" + genotypeAnnotationClasses.get(i).getSimpleName()); + System.out.println("\t" + (StandardAnnotation.class.isAssignableFrom(genotypeAnnotationClasses.get(i)) ? "*" : "") + genotypeAnnotationClasses.get(i).getSimpleName()); System.out.println(); System.out.println("\nAvailable classes/groups of annotations:"); for ( Class c : new PluginManager(AnnotationType.class).getInterfaces() ) @@ -213,16 +209,12 @@ public void initialize() { List rodName = Arrays.asList(variantCollection.variants.getName()); Set samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName); - // if there are no valid samples, warn the user - if ( samples.size() == 0 ) { - logger.warn("There are no samples input at all; use the --sampleName argument to specify one if desired."); - } - if ( USE_ALL_ANNOTATIONS ) engine = new VariantAnnotatorEngine(annotationsToExclude, this, getToolkit()); else engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, annotationsToExclude, this, getToolkit()); engine.initializeExpressions(expressionsToUse); + engine.setRequireStrictAlleleMatch(requireStrictAlleleMatch); // setup the header fields // note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones @@ -232,8 +224,33 @@ public void initialize() { if ( isUniqueHeaderLine(line, hInfo) ) hInfo.add(line); } - for ( String expression : expressionsToUse ) - hInfo.add(new VCFInfoHeaderLine(expression, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Value transferred from another external VCF resource")); + // for the expressions, pull the info header line from the header of the resource rod + for ( VariantAnnotatorEngine.VAExpression expression : engine.getRequestedExpressions() ) { + // special case the ID field + if ( expression.fieldName.equals("ID") ) { + hInfo.add(new VCFInfoHeaderLine(expression.fullName, 1, VCFHeaderLineType.String, "ID field transferred from external VCF resource")); + continue; + } + VCFInfoHeaderLine targetHeaderLine = null; + for ( VCFHeaderLine line : VCFUtils.getHeaderFields(getToolkit(), Arrays.asList(expression.binding.getName())) ) { + if ( line instanceof VCFInfoHeaderLine ) { + VCFInfoHeaderLine infoline = (VCFInfoHeaderLine)line; + if ( infoline.getName().equals(expression.fieldName) ) { + targetHeaderLine = infoline; + break; + } + } + } + + if ( targetHeaderLine != null ) { + if ( targetHeaderLine.getCountType() == VCFHeaderLineCount.INTEGER ) + hInfo.add(new VCFInfoHeaderLine(expression.fullName, targetHeaderLine.getCount(), targetHeaderLine.getType(), targetHeaderLine.getDescription())); + else + hInfo.add(new VCFInfoHeaderLine(expression.fullName, targetHeaderLine.getCountType(), targetHeaderLine.getType(), targetHeaderLine.getDescription())); + } else { + hInfo.add(new VCFInfoHeaderLine(expression.fullName, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Value transferred from another external VCF resource")); + } + } engine.invokeAnnotationInitializationMethods(hInfo); @@ -301,9 +318,9 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo Map stratifiedContexts; if ( 
BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) { if ( ! context.hasExtendedEventPileup() ) { - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getBasePileup(), ASSUME_SINGLE_SAMPLE); + stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getBasePileup()); } else { - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getExtendedEventPileup(), ASSUME_SINGLE_SAMPLE); + stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getExtendedEventPileup()); } if ( stratifiedContexts != null ) { annotatedVCs = new ArrayList(VCs.size()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index e4bc0d5d5c..98d2fe17b1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -34,7 +34,9 @@ import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import java.util.*; @@ -45,24 +47,26 @@ public class VariantAnnotatorEngine { private List requestedGenotypeAnnotations; private List requestedExpressions = new ArrayList(); - private HashMap, String> dbAnnotations = new HashMap, String>(); - private AnnotatorCompatibleWalker walker; - private GenomeAnalysisEngine toolkit; + private final HashMap, String> dbAnnotations = new HashMap, String>(); + private final AnnotatorCompatibleWalker walker; + private final GenomeAnalysisEngine toolkit; - private static class VAExpression { + private boolean requireStrictAlleleMatch = false; + + protected static class VAExpression { public String fullName, fieldName; public RodBinding binding; - public VAExpression(String fullEpression, List> bindings) { - int indexOfDot = fullEpression.lastIndexOf("."); + public VAExpression(String fullExpression, List> bindings) { + int indexOfDot = fullExpression.lastIndexOf("."); if ( indexOfDot == -1 ) - throw new UserException.BadArgumentValue(fullEpression, "it should be in rodname.value format"); + throw new UserException.BadArgumentValue(fullExpression, "it should be in rodname.value format"); - fullName = fullEpression; - fieldName = fullEpression.substring(indexOfDot+1); + fullName = fullExpression; + fieldName = fullExpression.substring(indexOfDot+1); - String bindingName = fullEpression.substring(0, indexOfDot); + String bindingName = fullExpression.substring(0, indexOfDot); for ( RodBinding rod : bindings ) { if ( rod.getName().equals(bindingName) ) { binding = rod; @@ -97,6 +101,8 @@ public void initializeExpressions(List expressionsToUse) { requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings())); } + protected List getRequestedExpressions() { return requestedExpressions; } + private void initializeAnnotations(List annotationGroupsToUse, List annotationsToUse, List annotationsToExclude) { AnnotationInterfaceManager.validateAnnotations(annotationGroupsToUse, annotationsToUse); requestedInfoAnnotations = 
AnnotationInterfaceManager.createInfoFieldAnnotations(annotationGroupsToUse, annotationsToUse); @@ -159,12 +165,15 @@ public Set getVCFAnnotationDescriptions() { return descriptions; } - public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public void setRequireStrictAlleleMatch( final boolean requireStrictAlleleMatch ) { + this.requireStrictAlleleMatch = requireStrictAlleleMatch; + } + public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); // annotate db occurrences - annotateDBs(tracker, ref, vc, infoAnnotations); + vc = annotateDBs(tracker, ref, vc, infoAnnotations); // annotate expressions where available annotateExpressions(tracker, ref, infoAnnotations); @@ -177,24 +186,24 @@ public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceConte } // generate a new annotated VC - final VariantContext annotatedVC = VariantContext.modifyAttributes(vc, infoAnnotations); + VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations); // annotate genotypes, creating another new VC in the process - return VariantContext.modifyGenotypes(annotatedVC, annotateGenotypes(tracker, ref, stratifiedContexts, vc)); + return builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc)).make(); } - private void annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map infoAnnotations) { + private VariantContext annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map infoAnnotations) { for ( Map.Entry, String> dbSet : dbAnnotations.entrySet() ) { if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) { String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType()); infoAnnotations.put(VCFConstants.DBSNP_KEY, rsID != null); // annotate dbsnp id if available and not already there - if ( rsID != null && (!vc.hasID() || vc.getID().equals(VCFConstants.EMPTY_ID_FIELD)) ) - infoAnnotations.put(VariantContext.ID_KEY, rsID); + if ( rsID != null && vc.emptyID() ) + vc = new VariantContextBuilder(vc).id(rsID).make(); } else { boolean overlapsComp = false; for ( VariantContext comp : tracker.getValues(dbSet.getKey(), ref.getLocus()) ) { - if ( !comp.isFiltered() ) { + if ( !comp.isFiltered() && ( !requireStrictAlleleMatch || comp.getAlleles().equals(vc.getAlleles()) ) ) { overlapsComp = true; break; } @@ -202,6 +211,8 @@ private void annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, Varia infoAnnotations.put(dbSet.getValue(), overlapsComp); } } + + return vc; } private void annotateExpressions(RefMetaDataTracker tracker, ReferenceContext ref, Map infoAnnotations) { @@ -211,21 +222,25 @@ private void annotateExpressions(RefMetaDataTracker tracker, ReferenceContext re continue; VariantContext vc = VCs.iterator().next(); - if ( vc.hasAttribute(expression.fieldName) ) + // special-case the ID field + if ( expression.fieldName.equals("ID") ) { + if ( vc.hasID() ) + infoAnnotations.put(expression.fullName, vc.getID()); + } else if ( vc.hasAttribute(expression.fieldName) ) { infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName)); + } } } - private Map annotateGenotypes(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + private GenotypesContext 
annotateGenotypes(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( requestedGenotypeAnnotations.size() == 0 ) return vc.getGenotypes(); - Map genotypes = new HashMap(vc.getNSamples()); - for ( Map.Entry g : vc.getGenotypes().entrySet() ) { - Genotype genotype = g.getValue(); - AlignmentContext context = stratifiedContexts.get(g.getKey()); + GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype genotype : vc.getGenotypes() ) { + AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context == null ) { - genotypes.put(g.getKey(), genotype); + genotypes.add(genotype); continue; } @@ -235,7 +250,7 @@ private Map annotateGenotypes(RefMetaDataTracker tracker, Refe if ( result != null ) genotypeAnnotations.putAll(result); } - genotypes.put(g.getKey(), new Genotype(g.getKey(), genotype.getAlleles(), genotype.getNegLog10PError(), genotype.getFilters(), genotypeAnnotations, genotype.isPhased())); + genotypes.add(new Genotype(genotype.getSampleName(), genotype.getAlleles(), genotype.getLog10PError(), genotype.getFilters(), genotypeAnnotations, genotype.isPhased())); } return genotypes; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java index 7f6dabeecb..f827856be9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java @@ -36,10 +36,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @@ -125,7 +122,7 @@ public class BeagleOutputToVCFWalker extends RodWalker { protected static String line = null; private final double MIN_PROB_ERROR = 0.000001; - private final double MAX_GENOTYPE_QUALITY = 6.0; + private final double MAX_GENOTYPE_QUALITY = -6.0; public void initialize() { @@ -181,8 +178,8 @@ public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentC // ignore places where we don't have a variant if ( beagleR2Feature == null || beagleProbsFeature == null || beaglePhasedFeature == null) { - vcfWriter.add(vc_input); - return 1; + vcfWriter.add(vc_input); + return 1; } @@ -190,8 +187,7 @@ public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentC byte refByte = ref.getBase(); // make new Genotypes based on Beagle results - Map genotypes = new HashMap(vc_input.getGenotypes().size()); - + GenotypesContext genotypes = GenotypesContext.create(vc_input.getGenotypes().size()); // for each genotype, create a new object with Beagle information on it @@ -200,15 +196,13 @@ public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentC Double alleleFrequencyH = 0.0; int beagleVarCounts = 0; - Map hapmapGenotypes = null; + GenotypesContext hapmapGenotypes = null; if (vc_comp != null) { hapmapGenotypes = vc_comp.getGenotypes(); } - for ( Map.Entry originalGenotypes : vc_input.getGenotypes().entrySet() ) { - - 
Genotype g = originalGenotypes.getValue(); + for ( final Genotype g : vc_input.getGenotypes() ) { Set filters = new LinkedHashSet(g.getFilters()); boolean genotypeIsPhased = true; @@ -218,7 +212,7 @@ public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentC // use sample as key into genotypes structure if (vc_comp != null) { - if (vc_input.getGenotypes().containsKey(sample) && hapmapGenotypes.containsKey(sample)) { + if (vc_input.getGenotypes().containsSample(sample) && hapmapGenotypes.containsSample(sample)) { Genotype hapmapGenotype = hapmapGenotypes.get(sample); if (hapmapGenotype.isCalled()){ @@ -255,9 +249,9 @@ public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentC Allele bglAlleleA, bglAlleleB; if (alleleA.matches(refString)) - bglAlleleA = Allele.create(alleleA,true); + bglAlleleA = Allele.create(alleleA,true); else - bglAlleleA = Allele.create(alleleA,false); + bglAlleleA = Allele.create(alleleA,false); if (alleleB.matches(refString)) bglAlleleB = Allele.create(alleleB,true); @@ -286,7 +280,7 @@ else if ((bglAlleleB.isReference() && bglAlleleA.isNonReference()) || (bglAllele // deal with numerical errors coming from limited formatting value on Beagle output files if (probWrongGenotype > 1 - MIN_PROB_ERROR) probWrongGenotype = 1 - MIN_PROB_ERROR; - + if (1-probWrongGenotype < noCallThreshold) { // quality is bad: don't call genotype alleles.clear(); @@ -298,7 +292,7 @@ else if ((bglAlleleB.isReference() && bglAlleleA.isNonReference()) || (bglAllele if (probWrongGenotype < MIN_PROB_ERROR) genotypeQuality = MAX_GENOTYPE_QUALITY; else - genotypeQuality = -log10(probWrongGenotype); + genotypeQuality = log10(probWrongGenotype); HashMap originalAttributes = new HashMap(g.getAttributes()); @@ -329,47 +323,40 @@ else if (originalAlleleB.isReference()) else { originalAttributes.put("OG","."); } - Genotype imputedGenotype = new Genotype(originalGenotypes.getKey(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased); + Genotype imputedGenotype = new Genotype(g.getSampleName(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased); if ( imputedGenotype.isHet() || imputedGenotype.isHomVar() ) { beagleVarCounts++; } - genotypes.put(originalGenotypes.getKey(), imputedGenotype); - + genotypes.add(imputedGenotype); } - VariantContext filteredVC; - if ( beagleVarCounts > 0 || DONT_FILTER_MONOMORPHIC_SITES ) - filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.filtersWereApplied() ? vc_input.getFilters() : null, vc_input.getAttributes()); - else { + final VariantContextBuilder builder = new VariantContextBuilder(vc_input).source("outputvcf").genotypes(genotypes); + if ( ! ( beagleVarCounts > 0 || DONT_FILTER_MONOMORPHIC_SITES ) ) { Set removedFilters = vc_input.filtersWereApplied() ? 
new HashSet(vc_input.getFilters()) : new HashSet(1); removedFilters.add(String.format("BGL_RM_WAS_%s",vc_input.getAlternateAllele(0))); - filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), new HashSet(Arrays.asList(vc_input.getReference())), genotypes, vc_input.getNegLog10PError(), removedFilters, vc_input.getAttributes()); + builder.alleles(new HashSet(Arrays.asList(vc_input.getReference()))).filters(removedFilters); } - HashMap attributes = new HashMap(filteredVC.getAttributes()); // re-compute chromosome counts - VariantContextUtils.calculateChromosomeCounts(filteredVC, attributes, false); + VariantContextUtils.calculateChromosomeCounts(builder, false); // Get Hapmap AC and AF if (vc_comp != null) { - attributes.put("ACH", alleleCountH.toString() ); - attributes.put("ANH", chrCountH.toString() ); - attributes.put("AFH", String.format("%4.2f", (double)alleleCountH/chrCountH) ); + builder.attribute("ACH", alleleCountH.toString() ); + builder.attribute("ANH", chrCountH.toString() ); + builder.attribute("AFH", String.format("%4.2f", (double)alleleCountH/chrCountH) ); } - attributes.put("NumGenotypesChanged", numGenotypesChangedByBeagle ); + builder.attribute("NumGenotypesChanged", numGenotypesChangedByBeagle ); if( !beagleR2Feature.getR2value().equals(Double.NaN) ) { - attributes.put("R2", beagleR2Feature.getR2value().toString() ); + builder.attribute("R2", beagleR2Feature.getR2value().toString() ); } - - vcfWriter.add(VariantContext.modifyAttributes(filteredVC,attributes)); - + vcfWriter.add(builder.make()); return 1; - } public Integer reduceInit() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index b722220f9c..aa71f4399a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ -39,10 +39,7 @@ import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.File; import java.io.PrintStream; @@ -204,7 +201,7 @@ public boolean useValidation(VariantContext validation, ReferenceContext ref) { logger.debug(String.format("boot: %d, test: %d, total: %d", bootstrapSetSize, testSetSize, bootstrapSetSize+testSetSize+1)); if ( (bootstrapSetSize+1.0)/(1.0+bootstrapSetSize+testSetSize) <= bootstrap ) { if ( bootstrapVCFOutput != null ) { - bootstrapVCFOutput.add(VariantContext.modifyFilters(validation, BOOTSTRAP_FILTER)); + bootstrapVCFOutput.add(new VariantContextBuilder(validation).filters(BOOTSTRAP_FILTER).make()); } bootstrapSetSize++; return true; @@ -245,18 +242,18 @@ public void writeBeagleOutput(VariantContext preferredVC, VariantContext otherVC } if ( markers != null ) markers.append("\n"); - Map preferredGenotypes = preferredVC.getGenotypes(); - Map otherGenotypes = goodSite(otherVC) ? 
otherVC.getGenotypes() : null; + GenotypesContext preferredGenotypes = preferredVC.getGenotypes(); + GenotypesContext otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null; for ( String sample : samples ) { boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE; Genotype genotype; boolean isValidation; // use sample as key into genotypes structure - if ( preferredGenotypes.keySet().contains(sample) ) { + if ( preferredGenotypes.containsSample(sample) ) { genotype = preferredGenotypes.get(sample); isValidation = isValidationSite; - } else if ( otherGenotypes != null && otherGenotypes.keySet().contains(sample) ) { + } else if ( otherGenotypes != null && otherGenotypes.containsSample(sample) ) { genotype = otherGenotypes.get(sample); isValidation = ! isValidationSite; } else { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/GATKReportDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/GATKReportDiffableReader.java index ef47ee33c5..41b17cc7b6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/GATKReportDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/GATKReportDiffableReader.java @@ -31,7 +31,6 @@ import java.io.File; import java.io.FileReader; import java.io.IOException; -import java.util.Map; /** @@ -68,7 +67,8 @@ private DiffNode tableToNode(GATKReportTable table, DiffNode root) { for ( GATKReportColumn column : table.getColumns().values() ) { DiffNode columnRoot = DiffNode.empty(column.getColumnName(), tableRoot); - columnRoot.add("Width", column.getColumnWidth()); + columnRoot.add("Width", column.getColumnFormat().getWidth()); + // NOTE: as the values are trimmed during parsing, left/right alignment is not currently preserved columnRoot.add("Displayable", column.isDisplayable()); int n = 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java index a447d17afe..3c0da8e9d1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.gatk.walkers.diffengine; +import org.apache.log4j.Logger; import org.broad.tribble.readers.AsciiLineReader; import org.broad.tribble.readers.LineReader; import org.broadinstitute.sting.utils.codecs.vcf.*; @@ -32,7 +33,6 @@ import java.io.File; import java.io.FileInputStream; -import java.io.FileReader; import java.io.IOException; import java.util.Map; @@ -46,6 +46,8 @@ * Class implementing diffnode reader for VCF */ public class VCFDiffableReader implements DiffableReader { + private static Logger logger = Logger.getLogger(VCFDiffableReader.class); + @Override public String getName() { return "VCF"; } @@ -68,7 +70,10 @@ public DiffElement readFromFile(File file, int maxElementsToRead) { String key = headerLine.getKey(); if ( headerLine instanceof VCFNamedHeaderLine ) key += "_" + ((VCFNamedHeaderLine) headerLine).getName(); - root.add(key, headerLine.toString()); + if ( root.hasElement(key) ) + logger.warn("Skipping duplicate header line: file=" + file + " line=" + headerLine.toString()); + else + root.add(key, headerLine.toString()); } String line = lineReader.readLine(); @@ -90,22 +95,22 @@ public DiffElement readFromFile(File file, int maxElementsToRead) { // 
add fields vcRoot.add("CHROM", vc.getChr()); vcRoot.add("POS", vc.getStart()); - vcRoot.add("ID", vc.hasID() ? vc.getID() : VCFConstants.MISSING_VALUE_v4); + vcRoot.add("ID", vc.getID()); vcRoot.add("REF", vc.getReference()); vcRoot.add("ALT", vc.getAlternateAlleles()); - vcRoot.add("QUAL", vc.hasNegLog10PError() ? vc.getNegLog10PError() * 10 : VCFConstants.MISSING_VALUE_v4); + vcRoot.add("QUAL", vc.hasLog10PError() ? vc.getLog10PError() * -10 : VCFConstants.MISSING_VALUE_v4); vcRoot.add("FILTER", vc.getFilters()); // add info fields for (Map.Entry attribute : vc.getAttributes().entrySet()) { - if ( ! attribute.getKey().startsWith("_") && ! attribute.getKey().equals(VariantContext.ID_KEY)) + if ( ! attribute.getKey().startsWith("_") ) vcRoot.add(attribute.getKey(), attribute.getValue()); } - for (Genotype g : vc.getGenotypes().values() ) { + for (Genotype g : vc.getGenotypes() ) { DiffNode gRoot = DiffNode.empty(g.getSampleName(), vcRoot); gRoot.add("GT", g.getGenotypeString()); - gRoot.add("GQ", g.hasNegLog10PError() ? g.getNegLog10PError() * 10 : VCFConstants.MISSING_VALUE_v4 ); + gRoot.add("GQ", g.hasLog10PError() ? g.getLog10PError() * -10 : VCFConstants.MISSING_VALUE_v4 ); for (Map.Entry attribute : g.getAttributes().entrySet()) { if ( ! attribute.getKey().startsWith("_") ) @@ -129,6 +134,6 @@ public DiffElement readFromFile(File file, int maxElementsToRead) { @Override public boolean canRead(File file) { - return AbstractVCFCodec.canDecodeFile(file, VCFCodec.VCF4_MAGIC_HEADER); + return AbstractVCFCodec.canDecodeFile(file.getPath(), VCFCodec.VCF4_MAGIC_HEADER); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java index bf3606b54d..8278dbab76 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java @@ -36,9 +36,7 @@ import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @@ -224,7 +222,7 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo (vc.getFilters() == null || !vc.getFilters().contains(MASK_NAME)) ) { // the filter hasn't already been applied Set filters = new LinkedHashSet(vc.getFilters()); filters.add(MASK_NAME); - vc = VariantContext.modifyFilters(vc, filters); + vc = new VariantContextBuilder(vc).filters(filters).make(); } FiltrationContext varContext = new FiltrationContext(ref, vc); @@ -267,7 +265,7 @@ private VariantContext checkMaskForPreviousLocation(VariantContext vc, GenomeLoc (vc.getFilters() == null || !vc.getFilters().contains(MASK_NAME)) ) { // the filter hasn't already been applied Set filters = new LinkedHashSet(vc.getFilters()); filters.add(MASK_NAME); - vc = VariantContext.modifyFilters(vc, filters); + vc = new VariantContextBuilder(vc).filters(filters).make(); } return vc; @@ -279,20 +277,15 @@ private void filter() { if ( context == null ) return; - VariantContext vc = context.getVariantContext(); + final VariantContext vc = 
context.getVariantContext(); + final VariantContextBuilder builder = new VariantContextBuilder(vc); // make new Genotypes based on filters - Map genotypes; - if ( genotypeFilterExps.size() == 0 ) { - genotypes = null; - } else { - genotypes = new HashMap(vc.getGenotypes().size()); + if ( genotypeFilterExps.size() > 0 ) { + GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size()); // for each genotype, check filters then create a new object - for ( Map.Entry genotype : vc.getGenotypes().entrySet() ) { - - Genotype g = genotype.getValue(); - + for ( final Genotype g : vc.getGenotypes() ) { if ( g.isCalled() ) { Set filters = new LinkedHashSet(g.getFilters()); @@ -300,11 +293,13 @@ private void filter() { if ( VariantContextUtils.match(vc, g, exp) ) filters.add(exp.name); } - genotypes.put(genotype.getKey(), new Genotype(genotype.getKey(), g.getAlleles(), g.getNegLog10PError(), filters, g.getAttributes(), g.isPhased())); + genotypes.add(new Genotype(g.getSampleName(), g.getAlleles(), g.getLog10PError(), filters, g.getAttributes(), g.isPhased())); } else { - genotypes.put(genotype.getKey(), g); + genotypes.add(g); } } + + builder.genotypes(genotypes); } // make a new variant context based on filters @@ -324,14 +319,9 @@ private void filter() { filters.add(exp.name); } } + builder.filters(filters); - VariantContext filteredVC; - if ( genotypes == null ) - filteredVC = VariantContext.modifyFilters(vc, filters); - else - filteredVC = new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), filters, vc.getAttributes()); - - writer.add(filteredVC); + writer.add(builder.make()); } public Integer reduce(Integer value, Integer sum) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index 35a9fe31d3..681cc1fa68 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -26,16 +26,11 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import java.io.PrintStream; import java.util.List; -import java.util.Map; -import java.util.Set; /** @@ -47,8 +42,6 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { public enum Model { /** The default model with the best performance in all cases */ EXACT, - /** For posterity we have kept around the older GRID_SEARCH model, but this gives inferior results and shouldn't be used. 
*/ - GRID_SEARCH } protected int N; @@ -58,7 +51,7 @@ public enum Model { protected enum GenotypeType { AA, AB, BB } - protected static final double VALUE_NOT_CALCULATED = -1.0 * Double.MAX_VALUE; + protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY; protected AlleleFrequencyCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { this.N = N; @@ -68,24 +61,12 @@ protected AlleleFrequencyCalculationModel(UnifiedArgumentCollection UAC, int N, /** * Must be overridden by concrete subclasses - * @param GLs genotype likelihoods - * @param Alleles Alleles corresponding to GLs - * @param log10AlleleFrequencyPriors priors - * @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results + * @param GLs genotype likelihoods + * @param Alleles Alleles corresponding to GLs + * @param log10AlleleFrequencyPriors priors + * @param result (pre-allocated) object to store likelihoods results */ - protected abstract void getLog10PNonRef(Map GLs, List Alleles, - double[] log10AlleleFrequencyPriors, - double[] log10AlleleFrequencyPosteriors); - - /** - * Can be overridden by concrete subclasses - * @param vc variant context with genotype likelihoods - * @param log10AlleleFrequencyPosteriors allele frequency results - * @param AFofMaxLikelihood allele frequency of max likelihood - * - * @return calls - */ - protected abstract Map assignGenotypes(VariantContext vc, - double[] log10AlleleFrequencyPosteriors, - int AFofMaxLikelihood); + protected abstract void getLog10PNonRef(GenotypesContext GLs, List Alleles, + double[][] log10AlleleFrequencyPriors, + AlleleFrequencyCalculationResult result); } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java new file mode 100644 index 0000000000..9c4af8512f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +/** + * Created by IntelliJ IDEA. 
+ * User: ebanks + * Date: Dec 14, 2011 + * + * Useful helper class to communicate the results of the allele frequency calculation + */ +public class AlleleFrequencyCalculationResult { + + // IMPORTANT NOTE: + // These 2 arrays are intended to contain the likelihoods/posterior probabilities for each alternate allele over each possible frequency (from 0 to 2N). + // For any given alternate allele and frequency, the likelihoods are marginalized over values for all other alternate alleles. What this means is that + // the likelihoods at cell index zero (AF=0) in the array are actually those of the site's being polymorphic (because although this alternate allele may + // be at AF=0, it is marginalized over all other alternate alleles which are not necessarily at AF=0). + // In the bi-allelic case (where there are no other alternate alleles over which to marginalize), + // the value at cell index zero will be equal to AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED. + final double[][] log10AlleleFrequencyLikelihoods; + final double[][] log10AlleleFrequencyPosteriors; + + // These 2 variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles) + double log10LikelihoodOfAFzero = 0.0; + double log10PosteriorOfAFzero = 0.0; + + public AlleleFrequencyCalculationResult(int maxAltAlleles, int numChr) { + log10AlleleFrequencyLikelihoods = new double[maxAltAlleles][numChr+1]; + log10AlleleFrequencyPosteriors = new double[maxAltAlleles][numChr+1]; + } + + public double getLog10LikelihoodOfAFzero() { + return log10LikelihoodOfAFzero; + } + + public double getLog10PosteriorOfAFzero() { + return log10PosteriorOfAFzero; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BiallelicGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BiallelicGenotypeLikelihoods.java deleted file mode 100644 index fbd9c1dbf4..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BiallelicGenotypeLikelihoods.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
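The two matrices in the new result class are indexed [alternate allele][allele count], with the count running from 0 to 2N chromosomes. A small sketch of that shape, assuming two alternate alleles and three diploid samples; the stored value is a placeholder, not a real likelihood:

    public final class AFResultShapeSketch {
        public static void main(String[] args) {
            final int maxAltAlleles = 2;
            final int numChr = 6; // three diploid samples
            // row i = i-th alternate allele; column k = log10 L(AC_i = k),
            // marginalized over the counts of every other alternate allele
            double[][] log10Likelihoods = new double[maxAltAlleles][numChr + 1];
            log10Likelihoods[1][3] = -2.5; // placeholder value
            System.out.println(log10Likelihoods.length + " x " + log10Likelihoods[0].length); // 2 x 7
        }
    }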
- */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.utils.variantcontext.Allele; - -public class BiallelicGenotypeLikelihoods { - - private String sample; - private double[] GLs; - private Allele A, B; - private int depth; - - /** - * Create a new object for sample with given alleles and genotype likelihoods - * - * @param sample sample name - * @param A allele A - * @param B allele B - * @param log10AALikelihoods AA likelihoods - * @param log10ABLikelihoods AB likelihoods - * @param log10BBLikelihoods BB likelihoods - * @param depth the read depth used in creating the likelihoods - */ - public BiallelicGenotypeLikelihoods(String sample, - Allele A, - Allele B, - double log10AALikelihoods, - double log10ABLikelihoods, - double log10BBLikelihoods, - int depth) { - this.sample = sample; - this.A = A; - this.B = B; - this.GLs = new double[]{log10AALikelihoods, log10ABLikelihoods, log10BBLikelihoods}; - this.depth = depth; - } - - public String getSample() { - return sample; - } - - public double getAALikelihoods() { - return GLs[0]; - } - - public double getABLikelihoods() { - return GLs[1]; - } - - public double getBBLikelihoods() { - return GLs[2]; - } - - public double[] getLikelihoods() { - return GLs; - } - - public Allele getAlleleA() { - return A; - } - - public Allele getAlleleB() { - return B; - } - - public int getDepth() { - return depth; - } -} - diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java index 106bb19825..09936c1128 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java @@ -27,13 +27,6 @@ import org.broadinstitute.sting.utils.BaseUtils; -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: Aug 4, 2009 - * Time: 6:46:09 PM - * To change this template use File | Settings | File Templates. 
- */ public enum DiploidGenotype { AA ('A', 'A'), AC ('A', 'C'), @@ -110,6 +103,20 @@ public static DiploidGenotype createDiploidGenotype(byte base1, byte base2) { return conversionMatrix[index1][index2]; } + /** + * create a diploid genotype, given 2 base indexes which may not necessarily be ordered correctly + * @param baseIndex1 base1 + * @param baseIndex2 base2 + * @return the diploid genotype + */ + public static DiploidGenotype createDiploidGenotype(int baseIndex1, int baseIndex2) { + if ( baseIndex1 == -1 ) + throw new IllegalArgumentException(baseIndex1 + " does not represent a valid base character"); + if ( baseIndex2 == -1 ) + throw new IllegalArgumentException(baseIndex2 + " does not represent a valid base character"); + return conversionMatrix[baseIndex1][baseIndex2]; + } + private static final DiploidGenotype[][] conversionMatrix = { { DiploidGenotype.AA, DiploidGenotype.AC, DiploidGenotype.AG, DiploidGenotype.AT }, { DiploidGenotype.AC, DiploidGenotype.CC, DiploidGenotype.CG, DiploidGenotype.CT }, diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index 666fe88a36..295cf86884 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -28,7 +28,6 @@ import net.sf.samtools.SAMUtils; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.fragments.FragmentCollection; -import org.broadinstitute.sting.utils.fragments.FragmentUtils; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -275,19 +274,20 @@ public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQ public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { byte obsBase = elt.getBase(); + byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); if ( elt.isReducedRead() ) { // reduced read representation - byte qual = elt.getQual(); - if ( BaseUtils.isRegularBase( elt.getBase() )) { + if ( BaseUtils.isRegularBase( obsBase )) { add(obsBase, qual, (byte)0, (byte)0, elt.getRepresentativeCount()); // fast calculation of n identical likelihoods return elt.getRepresentativeCount(); // we added nObs bases here - } else // odd bases or deletions => don't use them - return 0; - } else { - byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); - return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0; + } + + // odd bases or deletions => don't use them + return 0; } + + return qual > 0 ? 
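The qualToUse cleanup below flattens nested branches into guard-style early returns without changing the policy: irregular bases are discarded, base quality is capped at the mapping quality, and anything under the minimum becomes zero. A standalone sketch of that policy on bare bytes, assuming the mapping-quality cap is always enabled:

    public final class QualSelectionSketch {
        // Discard irregular (non-ACGT) bases, cap base quality at mapping
        // quality, and zero out anything below the minimum threshold.
        static byte qualToUse(byte baseQual, byte mappingQual, int minBaseQual, boolean regularBase) {
            if (!regularBase)
                return 0;
            byte qual = (byte) Math.min((int) baseQual, (int) mappingQual);
            return qual < minBaseQual ? (byte) 0 : qual;
        }
        public static void main(String[] args) {
            System.out.println(qualToUse((byte) 30, (byte) 20, 17, true)); // 20: capped at MQ
            System.out.println(qualToUse((byte) 10, (byte) 20, 17, true)); // 0: below minimum
        }
    }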
add(obsBase, qual, (byte)0, (byte)0, 1) : 0; } public int add(List overlappingPair, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { @@ -511,20 +511,19 @@ protected double log10PofObservingBaseGivenChromosome(byte observedBase, byte ch * @return */ private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { - if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) ) { + if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) ) return 0; - } else { - byte qual = p.getQual(); - if ( qual > SAMUtils.MAX_PHRED_SCORE ) - throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName())); - if ( capBaseQualsAtMappingQual ) - qual = (byte)Math.min((int)p.getQual(), p.getMappingQual()); - if ( (int)qual < minBaseQual ) - qual = (byte)0; + byte qual = p.getQual(); - return qual; - } + if ( qual > SAMUtils.MAX_PHRED_SCORE ) + throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName())); + if ( capBaseQualsAtMappingQual ) + qual = (byte)Math.min((int)p.getQual(), p.getMappingQual()); + if ( (int)qual < minBaseQual ) + qual = (byte)0; + + return qual; } // ----------------------------------------------------------------------------------------------------------------- diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index 1c2d82ab73..aa743f86fc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -26,84 +26,43 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.PrintStream; import java.util.*; public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { - // - // code for testing purposes - // + private final static boolean DEBUG = false; + private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 - private final boolean SIMPLE_GREEDY_GENOTYPER = false; - private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will 
force a no-call. + protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { super(UAC, N, logger, verboseWriter); } - public void getLog10PNonRef(Map GLs, List alleles, - double[] log10AlleleFrequencyPriors, - double[] log10AlleleFrequencyPosteriors) { + public void getLog10PNonRef(final GenotypesContext GLs, + final List alleles, + final double[][] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { final int numAlleles = alleles.size(); - final double[][] posteriorCache = numAlleles > 2 ? new double[numAlleles-1][] : null; - final double[] bestAFguess = numAlleles > 2 ? new double[numAlleles-1] : null; - - int idxDiag = numAlleles; - int incr = numAlleles - 1; - for (int k=1; k < numAlleles; k++) { - // multi-allelic approximation, part 1: Ideally - // for each alt allele compute marginal (suboptimal) posteriors - - // compute indices for AA,AB,BB for current allele - genotype likelihoods are a linear vector that can be thought of - // as a row-wise upper triangular matrix of likelihoods. - // So, for example, with 2 alt alleles, likelihoods have AA,AB,AC,BB,BC,CC. - // 3 alt alleles: AA,AB,AC,AD BB BC BD CC CD DD - - final int idxAA = 0; - final int idxAB = k; - // yy is always element on the diagonal. - // 2 alleles: BBelement 2 - // 3 alleles: BB element 3. CC element 5 - // 4 alleles: - final int idxBB = idxDiag; - idxDiag += incr--; - - final int lastK = linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB); - - if (numAlleles > 2) { - posteriorCache[k-1] = log10AlleleFrequencyPosteriors.clone(); - bestAFguess[k-1] = (double)MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors); - } - } - - if (numAlleles > 2) { - // multiallelic approximation, part 2: - // report posteriors for allele that has highest estimated AC - int mostLikelyAlleleIdx = MathUtils.maxElementIndex(bestAFguess); - for (int k=0; k < log10AlleleFrequencyPosteriors.length-1; k++) - log10AlleleFrequencyPosteriors[k] = (posteriorCache[mostLikelyAlleleIdx][k]); - } + //linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); + linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, result, false); } - private static final ArrayList getGLs(Map GLs) { - ArrayList genotypeLikelihoods = new ArrayList(); + private static final ArrayList getGLs(GenotypesContext GLs) { + ArrayList genotypeLikelihoods = new ArrayList(GLs.size()); genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy - for ( Genotype sample : GLs.values() ) { + for ( Genotype sample : GLs.iterateInSampleNameOrder() ) { if ( sample.hasLikelihoods() ) { double[] gls = sample.getLikelihoods().getAsVector(); - if (MathUtils.sum(gls) < SUM_GL_THRESH_NOCALL) + if ( MathUtils.sum(gls) < UnifiedGenotyperEngine.SUM_GL_THRESH_NOCALL ) genotypeLikelihoods.add(gls); } } @@ -112,9 +71,397 @@ private static final ArrayList getGLs(Map GLs) { } + final static double approximateLog10SumLog10(double[] vals) { + if ( vals.length < 2 ) + throw new ReviewedStingException("Passing array with fewer than 2 values when computing approximateLog10SumLog10"); + + double approx = approximateLog10SumLog10(vals[0], vals[1]); + for ( int i = 2; i < vals.length; i++ ) + approx = approximateLog10SumLog10(approx, vals[i]); + return approx; + } + + final static double approximateLog10SumLog10(double small, double big) { + // make sure small is really the smaller value + if ( small > 
big ) { + final double t = big; + big = small; + small = t; + } + + if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY ) + return big; + + if (big >= small + MathUtils.MAX_JACOBIAN_TOLERANCE) + return big; + + // OK, so |y-x| < tol: we use the following identity then: + // we need to compute log10(10^x + 10^y) + // By Jacobian logarithm identity, this is equal to + // max(x,y) + log10(1+10^-abs(x-y)) + // we compute the second term as a table lookup + // with integer quantization + // we have pre-stored correction for 0,0.1,0.2,... 10.0 + //final int ind = (int)(((big-small)/JACOBIAN_LOG_TABLE_STEP)); // hard rounding + int ind = (int)(Math.round((big-small)/MathUtils.JACOBIAN_LOG_TABLE_STEP)); // hard rounding + + //double z =Math.log10(1+Math.pow(10.0,-diff)); + //System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind); + return big + MathUtils.jacobianLogTable[ind]; + } + + // ------------------------------------------------------------------------------------- // - // Linearized, ~O(N), implementation. + // Multi-allelic implementation. + // + // ------------------------------------------------------------------------------------- + + private static final int HOM_REF_INDEX = 0; // AA likelihoods are always first + + // a wrapper around the int array so that we can make it hashable + private static final class ExactACcounts { + + private final int[] counts; + private int hashcode = -1; + + public ExactACcounts(final int[] counts) { + this.counts = counts; + } + + public int[] getCounts() { + return counts; + } + + @Override + public boolean equals(Object obj) { + return (obj instanceof ExactACcounts) ? Arrays.equals(counts, ((ExactACcounts)obj).counts) : false; + } + + @Override + public int hashCode() { + if ( hashcode == -1 ) + hashcode = Arrays.hashCode(counts); + return hashcode; + } + + @Override + public String toString() { + StringBuffer sb = new StringBuffer(); + sb.append(counts[0]); + for ( int i = 1; i < counts.length; i++ ) { + sb.append("/"); + sb.append(counts[i]); + } + return sb.toString(); + } + } + + // This class represents a column in the Exact AC calculation matrix + private static final class ExactACset { + + // the counts of the various alternate alleles which this column represents + final ExactACcounts ACcounts; + + // the column of the matrix + final double[] log10Likelihoods; + + // mapping of column index for those columns upon which this one depends to the index into the PLs which is used as the transition to this column; + // for example, in the biallelic case, the transition from k=0 to k=1 would be AB while the transition to k=2 would be BB. + final HashMap ACsetIndexToPLIndex = new HashMap(); + + // to minimize memory consumption, we know we can delete any sets in this list because no further sets will depend on them + final ArrayList dependentACsetsToDelete = new ArrayList(); + + + public ExactACset(final int size, final ExactACcounts ACcounts) { + this.ACcounts = ACcounts; + log10Likelihoods = new double[size]; + } + + // sum of all the non-reference alleles + public int getACsum() { + int sum = 0; + for ( int count : ACcounts.getCounts() ) + sum += count; + return sum; + } + + public boolean equals(Object obj) { + return (obj instanceof ExactACset) ? 
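approximateLog10SumLog10 never computes log10(10^x + 10^y) directly: by the Jacobian identity it equals max(x,y) + log10(1 + 10^-|x-y|), and the correction term is read from a quantized lookup table. A self-contained sketch that builds its own table; the patch instead uses the precomputed table and tolerance in MathUtils:

    public final class JacobianSketch {
        static final double STEP = 0.1;
        static final double MAX_TOL = 8.0; // stands in for MathUtils.MAX_JACOBIAN_TOLERANCE
        static final double[] TABLE = new double[(int) Math.round(MAX_TOL / STEP) + 1];
        static {
            // precompute log10(1 + 10^-d) for d = 0, 0.1, ..., MAX_TOL
            for (int i = 0; i < TABLE.length; i++)
                TABLE[i] = Math.log10(1.0 + Math.pow(10.0, -i * STEP));
        }
        static double approxLog10Sum(double a, double b) {
            final double big = Math.max(a, b), small = Math.min(a, b);
            if (small == Double.NEGATIVE_INFINITY || big - small >= MAX_TOL)
                return big; // the smaller term is negligible (or absent)
            return big + TABLE[(int) Math.round((big - small) / STEP)];
        }
        public static void main(String[] args) {
            System.out.println(approxLog10Sum(-1.0, -1.0)); // ~ -0.69897 = log10(0.2)
        }
    }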
ACcounts.equals(((ExactACset)obj).ACcounts) : false; + } + } + + public static void linearExactMultiAllelic(final GenotypesContext GLs, + final int numAlternateAlleles, + final double[][] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result, + final boolean preserveData) { + + // make sure the PL cache has been initialized + if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null ) + UnifiedGenotyperEngine.calculatePLcache(5); + + final ArrayList genotypeLikelihoods = getGLs(GLs); + final int numSamples = genotypeLikelihoods.size()-1; + final int numChr = 2*numSamples; + + // queue of AC conformations to process + final Queue ACqueue = new LinkedList(); + + // mapping of ExactACset indexes to the objects + final HashMap indexesToACset = new HashMap(numChr+1); + + // add AC=0 to the queue + int[] zeroCounts = new int[numAlternateAlleles]; + ExactACset zeroSet = new ExactACset(numSamples+1, new ExactACcounts(zeroCounts)); + ACqueue.add(zeroSet); + indexesToACset.put(zeroSet.ACcounts, zeroSet); + + // keep processing while we have AC conformations that need to be calculated + double maxLog10L = Double.NEGATIVE_INFINITY; + while ( !ACqueue.isEmpty() ) { + // compute log10Likelihoods + final ExactACset set = ACqueue.remove(); + final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, preserveData, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result); + + // adjust max likelihood seen if needed + maxLog10L = Math.max(maxLog10L, log10LofKs); + } + } + + private static double calculateAlleleCountConformation(final ExactACset set, + final ArrayList genotypeLikelihoods, + final double maxLog10L, + final int numChr, + final boolean preserveData, + final Queue ACqueue, + final HashMap indexesToACset, + final double[][] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { + + if ( DEBUG ) + System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); + + // compute the log10Likelihoods + computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, result); + + // clean up memory + if ( !preserveData ) { + for ( ExactACcounts index : set.dependentACsetsToDelete ) { + indexesToACset.put(index, null); + if ( DEBUG ) + System.out.printf(" *** removing used set=%s after seeing final dependent set=%s%n", index, set.ACcounts); + } + } + + final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1]; + + // can we abort early because the log10Likelihoods are so small? + if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { + if ( DEBUG ) + System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); + + // no reason to keep this data around because nothing depends on it + if ( !preserveData ) + indexesToACset.put(set.ACcounts, null); + + return log10LofK; + } + + // iterate over higher frequencies if possible + final int ACwiggle = numChr - set.getACsum(); + if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies + return log10LofK; + + ExactACset lastSet = null; // keep track of the last set placed in the queue so that we can tell it to clean us up when done processing + final int numAltAlleles = set.ACcounts.getCounts().length; + + // genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods. + // so e.g. 
with 2 alt alleles the likelihoods are AA,AB,AC,BB,BC,CC and with 3 alt alleles they are AA,AB,AC,AD,BB,BC,BD,CC,CD,DD. + + // add conformations for the k+1 case + int PLindex = 0; + for ( int allele = 0; allele < numAltAlleles; allele++ ) { + final int[] ACcountsClone = set.ACcounts.getCounts().clone(); + ACcountsClone[allele]++; + lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset); + } + + // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different + if ( ACwiggle > 1 ) { + for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) { + for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) { + final int[] ACcountsClone = set.ACcounts.getCounts().clone(); + ACcountsClone[allele_i]++; + ACcountsClone[allele_j]++; + lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex , ACqueue, indexesToACset); + } + } + } + + // if the last dependent set was not at the back of the queue (i.e. not just added), then we need to iterate + // over all the dependent sets to find the last one in the queue (otherwise it will be cleaned up too early) + if ( !preserveData && lastSet == null ) { + if ( DEBUG ) + System.out.printf(" *** iterating over dependent sets for set=%s%n", set.ACcounts); + lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue); + } + if ( lastSet != null ) + lastSet.dependentACsetsToDelete.add(set.ACcounts); + + return log10LofK; + } + + // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and + // also adds it as a dependency to the given callingSetIndex. + // returns the ExactACset if that set was not already in the queue and null otherwise. + private static ExactACset updateACset(final int[] ACcounts, + final int numChr, + final ExactACset callingSet, + final int PLsetIndex, + final Queue ACqueue, + final HashMap indexesToACset) { + final ExactACcounts index = new ExactACcounts(ACcounts); + boolean wasInQueue = true; + if ( !indexesToACset.containsKey(index) ) { + ExactACset set = new ExactACset(numChr/2 +1, index); + indexesToACset.put(index, set); + ACqueue.add(set); + wasInQueue = false; + } + + // add the given dependency to the set + final ExactACset set = indexesToACset.get(index); + set.ACsetIndexToPLIndex.put(callingSet.ACcounts, PLsetIndex); + return wasInQueue ? 
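The memoization map in this calculation is keyed on ExactACcounts rather than the raw int[] it wraps because Java arrays inherit identity-based equals and hashCode, so two distinct arrays with equal contents would never collide as HashMap keys. A quick demonstration:

    import java.util.Arrays;

    public final class ArrayKeySketch {
        public static void main(String[] args) {
            int[] a = {1, 0};
            int[] b = {1, 0};
            System.out.println(a.equals(b));                              // false: identity equality
            System.out.println(Arrays.equals(a, b));                      // true: content equality
            System.out.println(Arrays.hashCode(a) == Arrays.hashCode(b)); // true: what a map key needs
        }
    }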
null : set; + } + + private static ExactACset determineLastDependentSetInQueue(final ExactACcounts callingSetIndex, final Queue ACqueue) { + ExactACset set = null; + for ( ExactACset queued : ACqueue ) { + if ( queued.dependentACsetsToDelete.contains(callingSetIndex) ) + set = queued; + } + return set; + } + + private static void computeLofK(final ExactACset set, + final ArrayList genotypeLikelihoods, + final HashMap indexesToACset, + final double[][] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { + + set.log10Likelihoods[0] = 0.0; // the zero case + final int totalK = set.getACsum(); + + // special case for k = 0 over all k + if ( totalK == 0 ) { + for ( int j = 1; j < set.log10Likelihoods.length; j++ ) + set.log10Likelihoods[j] = set.log10Likelihoods[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX]; + } + // k > 0 for at least one k + else { + // all possible likelihoods for a given cell from which to choose the max + final int numPaths = set.ACsetIndexToPLIndex.size() + 1; + final double[] log10ConformationLikelihoods = new double[numPaths]; // TODO can be created just once, since you initialize it + + for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { + final double[] gl = genotypeLikelihoods.get(j); + final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; + + // initialize + for ( int i = 0; i < numPaths; i++ ) + // TODO -- Arrays.fill? + // todo -- is this even necessary? Why not have as else below? + log10ConformationLikelihoods[i] = Double.NEGATIVE_INFINITY; + + // deal with the AA case first + if ( totalK < 2*j-1 ) + log10ConformationLikelihoods[0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX]; + + // deal with the other possible conformations now + if ( totalK <= 2*j ) { // skip impossible conformations + int conformationIndex = 1; + for ( Map.Entry mapping : set.ACsetIndexToPLIndex.entrySet() ) { + if ( DEBUG ) + System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey()); + log10ConformationLikelihoods[conformationIndex++] = + determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + indexesToACset.get(mapping.getKey()).log10Likelihoods[j-1] + gl[mapping.getValue()]; + } + } + + final double log10Max = approximateLog10SumLog10(log10ConformationLikelihoods); + + // finally, update the L(j,k) value + set.log10Likelihoods[j] = log10Max - logDenominator; + } + } + + final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1]; + + // determine the power of theta to use + int nonRefAlleles = 0; + for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) { + if ( set.ACcounts.getCounts()[i] > 0 ) + nonRefAlleles++; + } + + // for k=0, we don't want to put that value into the likelihoods/posteriors matrix, but instead want to set the value in the results object + if ( nonRefAlleles == 0 ) { + result.log10LikelihoodOfAFzero = log10LofK; + result.log10PosteriorOfAFzero = log10LofK + log10AlleleFrequencyPriors[0][0]; + } else { + // update the likelihoods/posteriors vectors which are collapsed views of each of the various ACs + for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) { + int AC = set.ACcounts.getCounts()[i]; + result.log10AlleleFrequencyLikelihoods[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyLikelihoods[i][AC], log10LofK); + + final double prior = log10AlleleFrequencyPriors[nonRefAlleles-1][AC]; + 
result.log10AlleleFrequencyPosteriors[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior); + } + } + } + + private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) { + + // the closed form representation generalized for multiple alleles is as follows: + // AA: (2j - totalK) * (2j - totalK - 1) + // AB: 2k_b * (2j - totalK) + // AC: 2k_c * (2j - totalK) + // BB: k_b * (k_b - 1) + // BC: 2 * k_b * k_c + // CC: k_c * (k_c - 1) + + final int numAltAlleles = ACcounts.length; + + // the AX het case + if ( PLindex <= numAltAlleles ) + return MathUtils.log10Cache[2*ACcounts[PLindex-1]] + MathUtils.log10Cache[2*j-totalK]; + + // find the 2 alternate alleles that are represented by this PL index + int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[numAltAlleles][PLindex]; + + final int k_i = ACcounts[alleles[0]-1]; // subtract one because ACcounts doesn't consider the reference allele + + // the hom var case (e.g. BB, CC, DD) + final double coeff; + if ( alleles[0] == alleles[1] ) { + coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1]; + } + // the het non-ref case (e.g. BC, BD, CD) + else { + final int k_j = ACcounts[alleles[1]-1]; + coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j]; + } + + return coeff; + } + + + // ------------------------------------------------------------------------------------- + // + // Deprecated bi-allelic ~O(N) implementation. Kept here for posterity. // // ------------------------------------------------------------------------------------- @@ -122,6 +469,7 @@ private static final ArrayList getGLs(Map GLs) { * A simple data structure that holds the current, prev, and prev->prev likelihoods vectors * for the exact model calculation */ +/* private final static class ExactACCache { double[] kMinus2, kMinus1, kMinus0; @@ -155,9 +503,10 @@ final public double[] getkMinus0() { } } - public int linearExact(Map GLs, + public int linearExact(GenotypesContext GLs, double[] log10AlleleFrequencyPriors, - double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) { + double[][] log10AlleleFrequencyLikelihoods, + double[][] log10AlleleFrequencyPosteriors) { final ArrayList genotypeLikelihoods = getGLs(GLs); final int numSamples = genotypeLikelihoods.size()-1; final int numChr = 2*numSamples; @@ -174,7 +523,7 @@ public int linearExact(Map GLs, if ( k == 0 ) { // special case for k = 0 for ( int j=1; j <= numSamples; j++ ) { - kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[idxAA]; + kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[0]; } } else { // k > 0 final double[] kMinus1 = logY.getkMinus1(); @@ -187,14 +536,14 @@ public int linearExact(Map GLs, double aa = Double.NEGATIVE_INFINITY; double ab = Double.NEGATIVE_INFINITY; if (k < 2*j-1) - aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[idxAA]; + aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[0]; if (k < 2*j) - ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[idxAB]; + ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[1]; double log10Max; if (k > 1) { - final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[idxBB]; + final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[2]; log10Max = approximateLog10SumLog10(aa, ab, bb); } else { // we know 
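determineCoefficient encodes the genotype-configuration counts listed in its comment, in log10 space. A direct transcription of those closed forms using Math.log10 in place of the patch's cached log table (hypothetical helper; kb and kc are the counts of alt alleles B and C, j is the number of samples):

    public final class CoefficientSketch {
        static double log10Coefficient(String gt, int j, int kb, int kc) {
            final int totalK = kb + kc;
            switch (gt) {
                case "AA": return Math.log10(2.0 * j - totalK) + Math.log10(2.0 * j - totalK - 1);
                case "AB": return Math.log10(2.0 * kb) + Math.log10(2.0 * j - totalK);
                case "BB": return Math.log10(kb) + Math.log10(kb - 1.0);
                case "BC": return Math.log10(2.0) + Math.log10(kb) + Math.log10(kc);
                case "CC": return Math.log10(kc) + Math.log10(kc - 1.0);
                default:   throw new IllegalArgumentException(gt);
            }
        }
        public static void main(String[] args) {
            // 5 samples (10 chromosomes), kb = 2, kc = 1: AB coefficient is 2*2*(10-3) = 28
            System.out.println(Math.pow(10, log10Coefficient("AB", 5, 2, 1))); // ~28.0
        }
    }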
we aren't considering the BB case, so we can use an optimized log10 function @@ -208,7 +557,8 @@ public int linearExact(Map GLs, // update the posteriors vector final double log10LofK = kMinus0[numSamples]; - log10AlleleFrequencyPosteriors[k] = log10LofK + log10AlleleFrequencyPriors[k]; + log10AlleleFrequencyLikelihoods[0][k] = log10LofK; + log10AlleleFrequencyPosteriors[0][k] = log10LofK + log10AlleleFrequencyPriors[k]; // can we abort early? lastK = k; @@ -225,229 +575,8 @@ public int linearExact(Map GLs, } final static double approximateLog10SumLog10(double a, double b, double c) { - //return softMax(new double[]{a, b, c}); return approximateLog10SumLog10(approximateLog10SumLog10(a, b), c); } +*/ - final static double approximateLog10SumLog10(double small, double big) { - // make sure small is really the smaller value - if ( small > big ) { - final double t = big; - big = small; - small = t; - } - - if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY ) - return big; - - if (big >= small + MathUtils.MAX_JACOBIAN_TOLERANCE) - return big; - - // OK, so |y-x| < tol: we use the following identity then: - // we need to compute log10(10^x + 10^y) - // By Jacobian logarithm identity, this is equal to - // max(x,y) + log10(1+10^-abs(x-y)) - // we compute the second term as a table lookup - // with integer quantization - // we have pre-stored correction for 0,0.1,0.2,... 10.0 - //final int ind = (int)(((big-small)/JACOBIAN_LOG_TABLE_STEP)); // hard rounding - int ind = (int)(Math.round((big-small)/MathUtils.JACOBIAN_LOG_TABLE_STEP)); // hard rounding - - //double z =Math.log10(1+Math.pow(10.0,-diff)); - //System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind); - return big + MathUtils.jacobianLogTable[ind]; - } - - - - /** - * Can be overridden by concrete subclasses - * @param vc variant context with genotype likelihoods - * @param log10AlleleFrequencyPosteriors allele frequency results - * @param AFofMaxLikelihood allele frequency of max likelihood - * - * @return calls - */ - public Map assignGenotypes(VariantContext vc, - double[] log10AlleleFrequencyPosteriors, - int AFofMaxLikelihood) { - if ( !vc.isVariant() ) - throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart()); - - - Map GLs = vc.getGenotypes(); - double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1]; - int[][] tracebackArray = new int[GLs.size()+1][AFofMaxLikelihood+1]; - - ArrayList sampleIndices = new ArrayList(); - int sampleIdx = 0; - - // todo - optimize initialization - for (int k=0; k <= AFofMaxLikelihood; k++) - for (int j=0; j <= GLs.size(); j++) - pathMetricArray[j][k] = -1e30; - - pathMetricArray[0][0] = 0.0; - - // todo = can't deal with optimal dynamic programming solution with multiallelic records - if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) { - sampleIndices.addAll(GLs.keySet()); - sampleIdx = GLs.size(); - } - else { - - for ( Map.Entry sample : GLs.entrySet() ) { - if ( !sample.getValue().hasLikelihoods() ) - continue; - - double[] likelihoods = sample.getValue().getLikelihoods().getAsVector(); - - if (MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL) { - //System.out.print(sample.getKey()+":"); - //for (int k=0; k < likelihoods.length; k++) - // System.out.format("%4.2f ",likelihoods[k]); - //System.out.println(); - // all likelihoods are essentially the same: skip this sample and will later on force no call. 
- //sampleIdx++; - continue; - } - - sampleIndices.add(sample.getKey()); - - for (int k=0; k <= AFofMaxLikelihood; k++) { - - double bestMetric = pathMetricArray[sampleIdx][k] + likelihoods[0]; - int bestIndex = k; - - if (k>0) { - double m2 = pathMetricArray[sampleIdx][k-1] + likelihoods[1]; - if (m2 > bestMetric) { - bestMetric = m2; - bestIndex = k-1; - } - } - - if (k>1) { - double m2 = pathMetricArray[sampleIdx][k-2] + likelihoods[2]; - if (m2 > bestMetric) { - bestMetric = m2; - bestIndex = k-2; - } - } - - pathMetricArray[sampleIdx+1][k] = bestMetric; - tracebackArray[sampleIdx+1][k] = bestIndex; - } - sampleIdx++; - } - } - - HashMap calls = new HashMap(); - - int startIdx = AFofMaxLikelihood; - for (int k = sampleIdx; k > 0; k--) { - int bestGTguess; - String sample = sampleIndices.get(k-1); - Genotype g = GLs.get(sample); - if ( !g.hasLikelihoods() ) - continue; - // if all likelihoods are essentially the same: we want to force no-call. In this case, we skip this sample for now, - // and will add no-call genotype to GL's in a second pass - ArrayList myAlleles = new ArrayList(); - - double qual = Double.NEGATIVE_INFINITY; - double[] likelihoods = g.getLikelihoods().getAsVector(); - - if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) { - bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector()); - } - else { - int newIdx = tracebackArray[k][startIdx];; - bestGTguess = startIdx - newIdx; - startIdx = newIdx; - } - - /* System.out.format("Sample: %s GL:",sample); - for (int i=0; i < likelihoods.length; i++) - System.out.format("%1.4f, ",likelihoods[i]); - */ - - for (int i=0; i < likelihoods.length; i++) { - if (i==bestGTguess) - continue; - if (likelihoods[i] >= qual) - qual = likelihoods[i]; - } - // qual contains now max(likelihoods[k]) for all k != bestGTguess - qual = likelihoods[bestGTguess] - qual; - - // likelihoods are stored row-wise in lower triangular matrix. IE - // for 2 alleles they have ordering AA,AB,BB - // for 3 alleles they are ordered AA,AB,BB,AC,BC,CC - // Get now alleles corresponding to best index - int kk=0; - boolean done = false; - for (int j=0; j < vc.getNAlleles(); j++) { - for (int i=0; i <= j; i++){ - if (kk++ == bestGTguess) { - if (i==0) - myAlleles.add(vc.getReference()); - else - myAlleles.add(vc.getAlternateAllele(i-1)); - - if (j==0) - myAlleles.add(vc.getReference()); - else - myAlleles.add(vc.getAlternateAllele(j-1)); - done = true; - break; - } - - } - if (done) - break; - } - - if (qual < 0) { - // QUAL can be negative if the chosen genotype is not the most likely one individually. 
- // In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen on - double[] normalized = MathUtils.normalizeFromLog10(likelihoods); - double chosenGenotype = normalized[bestGTguess]; - qual = -1.0 * Math.log10(1.0 - chosenGenotype); - } - //System.out.println(myAlleles.toString()); - calls.put(sample, new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false)); - - } - - for ( Map.Entry sample : GLs.entrySet() ) { - - if ( !sample.getValue().hasLikelihoods() ) - continue; - Genotype g = GLs.get(sample.getKey()); - - double[] likelihoods = sample.getValue().getLikelihoods().getAsVector(); - - if (MathUtils.sum(likelihoods) <= SUM_GL_THRESH_NOCALL) - continue; // regular likelihoods - - ArrayList myAlleles = new ArrayList(); - - double qual = Genotype.NO_NEG_LOG_10PERROR; - myAlleles.add(Allele.NO_CALL); - myAlleles.add(Allele.NO_CALL); - //System.out.println(myAlleles.toString()); - calls.put(sample.getKey(), new Genotype(sample.getKey(), myAlleles, qual, null, g.getAttributes(), false)); - } - return calls; - } - - private final static void printLikelihoods(int numChr, double[][] logYMatrix, double[] log10AlleleFrequencyPriors) { - int j = logYMatrix.length - 1; - System.out.printf("-----------------------------------%n"); - for (int k=0; k <= numChr; k++) { - double posterior = logYMatrix[j][k] + log10AlleleFrequencyPriors[k]; - System.out.printf(" %4d\t%8.2f\t%8.2f\t%8.2f%n", k, logYMatrix[j][k], log10AlleleFrequencyPriors[k], posterior); - } - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index 489e963e83..b30a254148 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -81,25 +80,23 @@ protected GenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Log * @param contexts stratified alignment contexts * @param contextType stratified context type * @param priors priors to use for GLs - * @param GLs hash of sample->GL to fill in * @param alternateAlleleToUse the alternate allele to use, null if not set - * - * @param useBAQedPileup - * @return genotype likelihoods per sample for AA, AB, BB + * @param useBAQedPileup should we use the BAQed pileup or the raw one? 
+ * @return variant context where genotypes are no-called but with GLs */ - public abstract Allele getLikelihoods(RefMetaDataTracker tracker, - ReferenceContext ref, - Map contexts, - AlignmentContextUtils.ReadOrientation contextType, - GenotypePriors priors, - Map GLs, - Allele alternateAlleleToUse, boolean useBAQedPileup); + public abstract VariantContext getLikelihoods(RefMetaDataTracker tracker, + ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType, + GenotypePriors priors, + Allele alternateAlleleToUse, + boolean useBAQedPileup); protected int getFilteredDepth(ReadBackedPileup pileup) { int count = 0; for ( PileupElement p : pileup ) { if ( BaseUtils.isRegularBase( p.getBase() ) ) - count++; + count += p.getRepresentativeCount(); } return count; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java deleted file mode 100755 index 27842a8bf7..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.io.PrintStream; -import java.util.*; - -public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel { - - // for use in optimizing the P(D|AF) calculations: - // how much off from the max likelihoods do we need to be before we can quit calculating? 
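getFilteredDepth above now accumulates each pileup element's representative count instead of incrementing by one, so a reduced read standing in for n duplicate observations contributes n to the depth. A sketch, assuming the counts arrive as a plain array:

    public final class FilteredDepthSketch {
        // A normal read contributes representativeCount 1; a reduced read that
        // collapses n identical observations contributes n.
        static int filteredDepth(int[] representativeCounts) {
            int depth = 0;
            for (int rc : representativeCounts)
                depth += rc;
            return depth;
        }
        public static void main(String[] args) {
            System.out.println(filteredDepth(new int[]{1, 1, 4})); // 6, not 3
        }
    }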
- protected static final double LOG10_OPTIMIZATION_EPSILON = 8.0; - - private AlleleFrequencyMatrix AFMatrix; - - protected GridSearchAFEstimation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { - super(UAC, N, logger, verboseWriter); - AFMatrix = new AlleleFrequencyMatrix(N); - } - - protected void getLog10PNonRef(Map GLs, List alleles, - double[] log10AlleleFrequencyPriors, - double[] log10AlleleFrequencyPosteriors) { - initializeAFMatrix(GLs); - - // first, calculate for AF=0 (no change to matrix) - log10AlleleFrequencyPosteriors[0] = AFMatrix.getLikelihoodsOfFrequency() + log10AlleleFrequencyPriors[0]; - double maxLikelihoodSeen = log10AlleleFrequencyPosteriors[0]; - - int maxAlleleFrequencyToTest = AFMatrix.getSamples().size() * 2; - - // for each minor allele frequency, calculate log10PofDgivenAFi - for (int i = 1; i <= maxAlleleFrequencyToTest; i++) { - // add one more alternate allele - AFMatrix.incrementFrequency(); - - // calculate new likelihoods - log10AlleleFrequencyPosteriors[i] = AFMatrix.getLikelihoodsOfFrequency() + log10AlleleFrequencyPriors[i]; - - // an optimization to speed up the calculation: if we are beyond the local maximum such - // that subsequent likelihoods won't factor into the confidence score, just quit - if ( maxLikelihoodSeen - log10AlleleFrequencyPosteriors[i] > LOG10_OPTIMIZATION_EPSILON ) - return; - - if ( log10AlleleFrequencyPosteriors[i] > maxLikelihoodSeen ) - maxLikelihoodSeen = log10AlleleFrequencyPosteriors[i]; - } - } - - /** - * Overrides the super class - * @param vc variant context with genotype likelihoods - * @param log10AlleleFrequencyPosteriors allele frequency results - * @param AFofMaxLikelihood allele frequency of max likelihood - * - * @return calls - */ - protected Map assignGenotypes(VariantContext vc, - double[] log10AlleleFrequencyPosteriors, - int AFofMaxLikelihood) { - if ( !vc.isVariant() ) - throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart()); - - Allele refAllele = vc.getReference(); - Allele altAllele = vc.getAlternateAllele(0); - HashMap calls = new HashMap(); - - // first, the potential alt calls - for ( String sample : AFMatrix.getSamples() ) { - Genotype g = vc.getGenotype(sample); - - // set the genotype and confidence - Pair AFbasedGenotype = AFMatrix.getGenotype(AFofMaxLikelihood, sample); - ArrayList myAlleles = new ArrayList(); - if ( AFbasedGenotype.first == GenotypeType.AA.ordinal() ) { - myAlleles.add(refAllele); - myAlleles.add(refAllele); - } else if ( AFbasedGenotype.first == GenotypeType.AB.ordinal() ) { - myAlleles.add(refAllele); - myAlleles.add(altAllele); - } else { // ( AFbasedGenotype.first == GenotypeType.BB.ordinal() ) - myAlleles.add(altAllele); - myAlleles.add(altAllele); - } - - calls.put(sample, new Genotype(sample, myAlleles, AFbasedGenotype.second, null, g.getAttributes(), false)); - } - - return calls; - } - - private void initializeAFMatrix(Map GLs) { - AFMatrix.clear(); - - for ( Genotype g : GLs.values() ) { - if ( g.hasLikelihoods() ) - AFMatrix.setLikelihoods(g.getLikelihoods().getAsVector(), g.getSampleName()); - } - } - - protected static class AlleleFrequencyMatrix { - - private double[][] matrix; // allele frequency matrix - private int[] indexes; // matrix to maintain which genotype is active - private int maxN; // total possible frequencies in data - private int frequency; // current frequency - - // data structures necessary to maintain a list of the best genotypes and their 
scores - private ArrayList samples = new ArrayList(); - private HashMap>> samplesToGenotypesPerAF = new HashMap>>(); - - public AlleleFrequencyMatrix(int N) { - maxN = N; - matrix = new double[N][3]; - indexes = new int[N]; - clear(); - } - - public List getSamples() { return samples; } - - public void clear() { - frequency = 0; - for (int i = 0; i < maxN; i++) - indexes[i] = 0; - samples.clear(); - samplesToGenotypesPerAF.clear(); - } - - public void setLikelihoods(double[] GLs, String sample) { - int index = samples.size(); - samples.add(sample); - matrix[index][GenotypeType.AA.ordinal()] = GLs[0]; - matrix[index][GenotypeType.AB.ordinal()] = GLs[1]; - matrix[index][GenotypeType.BB.ordinal()] = GLs[2]; - } - - public void incrementFrequency() { - int N = samples.size(); - if ( frequency == 2 * N ) - throw new ReviewedStingException("Frequency was incremented past N; how is this possible?"); - frequency++; - - double greedy = VALUE_NOT_CALCULATED; - int greedyIndex = -1; - for (int i = 0; i < N; i++) { - - if ( indexes[i] == GenotypeType.AB.ordinal() ) { - if ( matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()] > greedy ) { - greedy = matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()]; - greedyIndex = i; - } - } - else if ( indexes[i] == GenotypeType.AA.ordinal() ) { - if ( matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()] > greedy ) { - greedy = matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()]; - greedyIndex = i; - } - // note that we currently don't bother with breaking ties between samples - // (which would be done by looking at the HOM_VAR value) because it's highly - // unlikely that a collision will both occur and that the difference will - // be significant at HOM_VAR... - } - // if this person is already hom var, he can't add another alternate allele - // so we can ignore that case - } - if ( greedyIndex == -1 ) - throw new ReviewedStingException("There is no best choice for a new alternate allele; how is this possible?"); - - if ( indexes[greedyIndex] == GenotypeType.AB.ordinal() ) - indexes[greedyIndex] = GenotypeType.BB.ordinal(); - else - indexes[greedyIndex] = GenotypeType.AB.ordinal(); - } - - public double getLikelihoodsOfFrequency() { - double likelihoods = 0.0; - int N = samples.size(); - for (int i = 0; i < N; i++) - likelihoods += matrix[i][indexes[i]]; - - /* - System.out.println(frequency); - for (int i = 0; i < N; i++) { - System.out.print(samples.get(i)); - for (int j=0; j < 3; j++) { - System.out.print(String.valueOf(matrix[i][j])); - System.out.print(indexes[i] == j ? "* " : " "); - } - System.out.println(); - } - System.out.println(likelihoods); - System.out.println(); - */ - - recordGenotypes(); - - return likelihoods; - } - - public Pair getGenotype(int frequency, String sample) { - return samplesToGenotypesPerAF.get(frequency).get(sample); - } - - private void recordGenotypes() { - HashMap> samplesToGenotypes = new HashMap>(); - - int index = 0; - for ( String sample : samples ) { - int genotype = indexes[index]; - - double score; - - int maxEntry = MathUtils.maxElementIndex(matrix[index]); - // if the max value is for the most likely genotype, we can compute next vs. 
next best - if ( genotype == maxEntry ) { - if ( genotype == GenotypeType.AA.ordinal() ) - score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AB.ordinal()], matrix[index][GenotypeType.BB.ordinal()]); - else if ( genotype == GenotypeType.AB.ordinal() ) - score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.BB.ordinal()]); - else // ( genotype == GenotypeType.HOM.ordinal() ) - score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.AB.ordinal()]); - } - // otherwise, we need to calculate the probability of the genotype - else { - double[] normalized = MathUtils.normalizeFromLog10(matrix[index]); - double chosenGenotype = normalized[genotype]; - score = -1.0 * Math.log10(1.0 - chosenGenotype); - } - - samplesToGenotypes.put(sample, new Pair(genotype, Math.abs(score))); - index++; - } - - samplesToGenotypesPerAF.put(frequency, samplesToGenotypes); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 14d647b6d2..fe2086d474 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -34,6 +34,8 @@ import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -41,8 +43,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @@ -125,7 +126,7 @@ private ArrayList computeConsensusAlleles(ReferenceContext ref, for ( ExtendedEventPileupElement p : indelPileup.toExtendedIterable() ) { //SAMRecord read = p.getRead(); - GATKSAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead()); + GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); if (read == null) continue; if(ReadUtils.is454Read(read)) { @@ -243,7 +244,7 @@ else if (p.isDeletion()) { // get deletion length int dLen = Integer.valueOf(bestAltAllele.substring(1)); // get ref bases of accurate deletion - int startIdxInReference = (int)(1+loc.getStart()-ref.getWindow().getStart()); + int startIdxInReference = 1+loc.getStart()-ref.getWindow().getStart(); //System.out.println(new String(ref.getBases())); byte[] refBases = Arrays.copyOfRange(ref.getBases(),startIdxInReference,startIdxInReference+dLen); @@ -270,19 +271,17 @@ else if (p.isDeletion()) { private final static EnumSet allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED); - public Allele getLikelihoods(RefMetaDataTracker tracker, - ReferenceContext ref, - Map contexts, - AlignmentContextUtils.ReadOrientation 
contextType, - GenotypePriors priors, - Map GLs, - Allele alternateAlleleToUse, - boolean useBAQedPileup) { + public VariantContext getLikelihoods(RefMetaDataTracker tracker, + ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType, + GenotypePriors priors, + Allele alternateAlleleToUse, + boolean useBAQedPileup) { if ( tracker == null ) return null; - GenomeLoc loc = ref.getLocus(); Allele refAllele, altAllele; VariantContext vc = null; @@ -368,10 +367,17 @@ public Allele getLikelihoods(RefMetaDataTracker tracker, haplotypeMap = Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(), ref, hsize, numPrefBases); + // start making the VariantContext + final int endLoc = calculateEndPos(alleleList, refAllele, loc); + final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleleList).referenceBaseForIndel(ref.getBase()); + + // create the genotypes; no-call everyone for now + GenotypesContext genotypes = GenotypesContext.create(); + final List noCall = new ArrayList(); + noCall.add(Allele.NO_CALL); + // For each sample, get genotype likelihoods based on pileup // compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them. - // initialize the GenotypeLikelihoods - GLs.clear(); for ( Map.Entry sample : contexts.entrySet() ) { AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); @@ -384,11 +390,12 @@ else if (context.hasBasePileup()) if (pileup != null ) { final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); + GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(genotypeLikelihoods); - GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(), - alleleList, - genotypeLikelihoods, - getFilteredDepth(pileup))); + HashMap attributes = new HashMap(); + attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup)); + attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods); + genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false)); if (DEBUG) { System.out.format("Sample:%s Alleles:%s GL:",sample.getKey(), alleleList.toString()); @@ -399,9 +406,25 @@ else if (context.hasBasePileup()) } } - return refAllele; + return builder.genotypes(genotypes).make(); } + private int calculateEndPos(Collection alleles, Allele refAllele, GenomeLoc loc) { + // for indels, stop location is one more than ref allele length + boolean hasNullAltAllele = false; + for ( Allele a : alleles ) { + if ( a.isNull() ) { + hasNullAltAllele = true; + break; + } + } + + int endLoc = loc.getStart() + refAllele.length(); + if( !hasNullAltAllele ) + endLoc--; + + return endLoc; + } public static HashMap> getIndelLikelihoodMap() { return indelLikelihoodMap.get(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiallelicGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiallelicGenotypeLikelihoods.java deleted file mode 100755 index 4f378b24a9..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiallelicGenotypeLikelihoods.java +++ /dev/null @@ -1,52 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.variantcontext.Allele; - -import 
java.util.ArrayList; -import java.util.List; - -/** - * Created by IntelliJ IDEA. - * User: delangel - * Date: 6/1/11 - * Time: 10:38 AM - * To change this template use File | Settings | File Templates. - */ -public class MultiallelicGenotypeLikelihoods { - private String sample; - private double[] GLs; - private List alleleList; - private int depth; - - public MultiallelicGenotypeLikelihoods(String sample, - List A, - double[] log10Likelihoods, int depth) { - /* Check for consistency between likelihood vector and number of alleles */ - int numAlleles = A.size(); - if (log10Likelihoods.length != numAlleles*(numAlleles+1)/2) - throw new StingException(("BUG: Incorrect length of GL vector when creating MultiallelicGenotypeLikelihoods object!")); - - this.sample = sample; - this.alleleList = A; - this.GLs = log10Likelihoods; - this.depth = depth; - } - - public String getSample() { - return sample; - } - - public double[] getLikelihoods() { - return GLs; - } - - public List getAlleles() { - return alleleList; - } - - public int getDepth() { - return depth; - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 9bdc754e95..57cc5594a3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -31,107 +31,139 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; +import java.util.*; public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { - // the alternate allele with the largest sum of quality scores - protected Byte bestAlternateAllele = null; + private static final int MIN_QUAL_SUM_FOR_ALT_ALLELE = 50; + + private boolean ALLOW_MULTIPLE_ALLELES; private final boolean useAlleleFromVCF; protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); + ALLOW_MULTIPLE_ALLELES = UAC.MULTI_ALLELIC; useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; } - public Allele getLikelihoods(RefMetaDataTracker tracker, - ReferenceContext ref, - Map contexts, - AlignmentContextUtils.ReadOrientation contextType, - GenotypePriors priors, - Map GLs, - Allele alternateAlleleToUse, - boolean useBAQedPileup) { + public VariantContext getLikelihoods(RefMetaDataTracker tracker, + ReferenceContext ref, + Map contexts, + 
AlignmentContextUtils.ReadOrientation contextType, + GenotypePriors priors, + Allele alternateAlleleToUse, + boolean useBAQedPileup) { if ( !(priors instanceof DiploidSNPGenotypePriors) ) throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model"); - byte refBase = ref.getBase(); - Allele refAllele = Allele.create(refBase, true); + final boolean[] basesToUse = new boolean[4]; + final byte refBase = ref.getBase(); + final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase); + + // start making the VariantContext + final GenomeLoc loc = ref.getLocus(); + final List alleles = new ArrayList(); + alleles.add(Allele.create(refBase, true)); + final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles); - // find the alternate allele with the largest sum of quality scores + // find the alternate allele(s) that we should be using if ( alternateAlleleToUse != null ) { - bestAlternateAllele = alternateAlleleToUse.getBases()[0]; + basesToUse[BaseUtils.simpleBaseToBaseIndex(alternateAlleleToUse.getBases()[0])] = true; } else if ( useAlleleFromVCF ) { - VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles); + final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles); - // ignore places where we don't have a variant - if ( vc == null ) + // ignore places where we don't have a SNP + if ( vc == null || !vc.isSNP() ) return null; - if ( !vc.isBiallelic() ) { - // for multi-allelic sites go back to the reads and find the most likely alternate allele - initializeBestAlternateAllele(refBase, contexts, useBAQedPileup); - } else { - bestAlternateAllele = vc.getAlternateAllele(0).getBases()[0]; - } + for ( Allele allele : vc.getAlternateAlleles() ) + basesToUse[BaseUtils.simpleBaseToBaseIndex(allele.getBases()[0])] = true; } else { - initializeBestAlternateAllele(refBase, contexts, useBAQedPileup); - } - // if there are no non-ref bases... - if ( bestAlternateAllele == null ) { - // if we only want variants, then we don't need to calculate genotype likelihoods - if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY ) - return refAllele; + determineAlternateAlleles(basesToUse, refBase, contexts, useBAQedPileup); + + // how many alternate alleles are we using? + int alleleCounter = Utils.countSetBits(basesToUse); - // otherwise, choose any alternate allele (it doesn't really matter) - bestAlternateAllele = (byte)(refBase != 'A' ? 'A' : 'C'); + // if there are no non-ref alleles... + if ( alleleCounter == 0 ) { + // if we only want variants, then we don't need to calculate genotype likelihoods + if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY ) + return builder.make(); + + // otherwise, choose any alternate allele (it doesn't really matter) + basesToUse[indexOfRefBase == 0 ? 
1 : 0] = true; + } } - Allele altAllele = Allele.create(bestAlternateAllele, false); + // create the alternate alleles and the allele ordering (the ordering is crucial for the GLs) + final int numAltAlleles = Utils.countSetBits(basesToUse); + final int[] alleleOrdering = new int[numAltAlleles + 1]; + alleleOrdering[0] = indexOfRefBase; + int alleleOrderingIndex = 1; + int numLikelihoods = 1; + for ( int i = 0; i < 4; i++ ) { + if ( i != indexOfRefBase && basesToUse[i] ) { + alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(i), false)); + alleleOrdering[alleleOrderingIndex++] = i; + numLikelihoods += alleleOrderingIndex; + } + } + builder.alleles(alleles); + + // create the genotypes; no-call everyone for now + GenotypesContext genotypes = GenotypesContext.create(); + final List noCall = new ArrayList(); + noCall.add(Allele.NO_CALL); for ( Map.Entry sample : contexts.entrySet() ) { ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); - if( useBAQedPileup ) { pileup = createBAQedPileup( pileup ); } + if ( useBAQedPileup ) + pileup = createBAQedPileup( pileup ); // create the GenotypeLikelihoods object - DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors, UAC.PCR_error); - int nGoodBases = GL.add(pileup, true, true, UAC.MIN_BASE_QUALTY_SCORE); + final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors, UAC.PCR_error); + final int nGoodBases = GL.add(pileup, true, true, UAC.MIN_BASE_QUALTY_SCORE); if ( nGoodBases == 0 ) continue; - double[] likelihoods = GL.getLikelihoods(); + final double[] allLikelihoods = GL.getLikelihoods(); + final double[] myLikelihoods = new double[numLikelihoods]; - DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(refBase); - DiploidGenotype hetGenotype = DiploidGenotype.createDiploidGenotype(refBase, bestAlternateAllele); - DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(bestAlternateAllele); - ArrayList aList = new ArrayList(); - aList.add(refAllele); - aList.add(altAllele); - double[] dlike = new double[]{likelihoods[refGenotype.ordinal()],likelihoods[hetGenotype.ordinal()],likelihoods[homGenotype.ordinal()]} ; + int myLikelihoodsIndex = 0; + for ( int i = 0; i <= numAltAlleles; i++ ) { + for ( int j = i; j <= numAltAlleles; j++ ) { + myLikelihoods[myLikelihoodsIndex++] = allLikelihoods[DiploidGenotype.createDiploidGenotype(alleleOrdering[i], alleleOrdering[j]).ordinal()]; + } + } // normalize in log space so that max element is zero. 
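+            // worked example for the subsetting loop above (hypothetical inputs:
+            // ref=A with discovered alts {C,G}, so alleleOrdering = [A,C,G] and
+            // numLikelihoods = 6): the copy pulls the diploid GLs for
+            //   AA, AC, AG, CC, CG, GG
+            // out of the full 10-genotype array, i.e. the same pair enumeration used
+            // when the alternate alleles were added to the builder, so the normalized
+            // vector handed to GenotypeLikelihoods below lines up with the allele
+            // ordering of the resulting VariantContext.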
- GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(), - aList, MathUtils.normalizeFromLog10(dlike, false, true), getFilteredDepth(pileup))); + GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true)); + + HashMap attributes = new HashMap(); + attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup)); + attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods); + genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false)); } - return refAllele; + return builder.genotypes(genotypes).make(); } - protected void initializeBestAlternateAllele(byte ref, Map contexts, boolean useBAQedPileup) { + // fills in the allelesToUse array + protected void determineAlternateAlleles(boolean[] allelesToUse, byte ref, Map contexts, boolean useBAQedPileup) { int[] qualCounts = new int[4]; for ( Map.Entry sample : contexts.entrySet() ) { @@ -139,7 +171,7 @@ protected void initializeBestAlternateAllele(byte ref, Map maxCount ) { - maxCount = qualCounts[index]; - bestAlternateAllele = altAllele; + if ( ALLOW_MULTIPLE_ALLELES ) { + for ( byte altAllele : BaseUtils.BASES ) { + if ( altAllele == ref ) + continue; + int index = BaseUtils.simpleBaseToBaseIndex(altAllele); + if ( qualCounts[index] >= MIN_QUAL_SUM_FOR_ALT_ALLELE ) { + allelesToUse[index] = true; + } } + } else { + // set the non-ref base which has the maximum quality score sum + int maxCount = 0; + int indexOfMax = 0; + for ( byte altAllele : BaseUtils.BASES ) { + if ( altAllele == ref ) + continue; + int index = BaseUtils.simpleBaseToBaseIndex(altAllele); + if ( qualCounts[index] > maxCount ) { + maxCount = qualCounts[index]; + indexOfMax = index; + } + } + + if ( maxCount > 0 ) + allelesToUse[indexOfMax] = true; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGBoundAF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGBoundAF.java new file mode 100755 index 0000000000..e40054c9f7 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGBoundAF.java @@ -0,0 +1,209 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.apache.commons.lang.NotImplementedException; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.security.cert.CertificateNotYetValidException; +import java.util.*; + +import org.broadinstitute.sting.utils.codecs.vcf.*; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 8/30/11 + * Time: 10:08 AM + * To change this template use File | Settings | File Templates. 
+ */
+public class UGBoundAF extends RodWalker<VariantContext, Integer> {
+
+    @Output(shortName="vcf",fullName="VCF",doc="file to write to",required=true)
+    VCFWriter writer;
+
+    @Input(shortName="V",fullName="Variants",doc="variant tracks to use in calculation",required=true)
+    List<RodBinding<VariantContext>> variants;
+
+    private static double EPS_LOWER_LIMIT = Math.pow(10,-6.0);
+
+    private HashMap<Integer, HashMap<Double, Double>> epsilonPosteriorCache = new HashMap<Integer, HashMap<Double, Double>>(8192);
+    private HashMap<Integer, Double> logAC0Cache = new HashMap<Integer, Double>(8192);
+    private int QUANTIZATION_FACTOR = 1000;
+
+
+    public void initialize() {
+        Set<VCFHeaderLine> allHeaderLines = new HashSet<VCFHeaderLine>(1024);
+        for ( RodBinding<VariantContext> v : variants ) {
+            String trackName = v.getName();
+            Map<String, VCFHeader> vcfHeaders = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName));
+            Set<VCFHeaderLine> headerLines = new HashSet<VCFHeaderLine>(vcfHeaders.get(trackName).getMetaData());
+            allHeaderLines.addAll(headerLines);
+        }
+        allHeaderLines.add(new VCFInfoHeaderLine("AFB",2,VCFHeaderLineType.Float,"The 95% bounds on the allele "+
+                "frequency. First value=95% probability AF>x. Second value=95% probability AF<y"));
+        writer.writeHeader(new VCFHeader(allHeaderLines));
+    }
+
+    public VariantContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext alignmentContext) {
+        if ( tracker == null )
+            return null;
+
+        List<VariantContext> allVariants = tracker.getValues(variants);
+        if ( allVariants.size() == 0 ) {
+            return null;
+        }
+
+        List<Allele> alternateAlleles = getAllAlternateAlleles(allVariants);
+        VariantContextBuilder builder = new VariantContextBuilder(allVariants.get(0).subContextFromSamples(new TreeSet<String>()));
+        if ( alternateAlleles.size() > 1 ) {
+            logger.warn("Multiple Segregating Variants at position "+ref.getLocus().toString());
+            alternateAlleles.add(allVariants.get(0).getReference());
+            builder.alleles(alternateAlleles);
+            builder.filters(String.format("MULTIPLE_SEGREGATING[%s]", Utils.join(",",alternateAlleles)));
+        } else {
+            // get all the genotype likelihoods
+            GenotypesContext context = GenotypesContext.create();
+            int numNoCall = 0;
+            for ( VariantContext v : allVariants ) {
+                numNoCall += v.getNoCallCount();
+                context.addAll(v.getGenotypes());
+            }
+            builder.attribute("AFB",boundAlleleFrequency(getACPosteriors(context)));
+        }
+
+        return builder.make();
+    }
+
+    private List<Allele> getAllAlternateAlleles(List<VariantContext> variants) {
+        List<Allele> alleles = new ArrayList<Allele>(3); // some overhead
+        for ( VariantContext v : variants ) {
+            alleles.addAll(v.getAlternateAlleles());
+        }
+        return alleles;
+    }
+
+    @Override
+    public Integer reduce(VariantContext value, Integer sum) {
+        if ( value == null )
+            return sum;
+        writer.add(value);
+        return ++sum;
+    }
+
+    private int N_ITERATIONS = 1;
+    private double[] getACPosteriors(GenotypesContext gc) {
+        // note this uses uniform priors (!)
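+        // (a uniform prior is a vector of zeros in log10 space -- the zeroPriors
+        // array below -- so the "posteriors" returned here are just the allele
+        // count likelihoods themselves; entry c of the returned array corresponds
+        // to an alternate allele count of c among the 2*gc.size() chromosomes)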
+ + double[][] zeroPriors = new double[1][1+2*gc.size()]; + AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2,2*gc.size()); + // todo -- allow multiple alleles here + for ( int i = 0; i < N_ITERATIONS; i ++ ) { + ExactAFCalculationModel.linearExactMultiAllelic(gc, 2, zeroPriors, result, false); + } + + return result.log10AlleleFrequencyPosteriors[0]; + } + + private String boundAlleleFrequency(double[] ACLikelihoods) { + // note that no-calls are unnecessary: the ML likelihoods take nocalls into account as 0,0,0 GLs + // thus, for sites with K 100,40,0 likelihoods and M no-calls, the likelihoods will be + // agnostic between 2*K alleles through 2*(K+M) alleles - exactly what we want to marginalize over + + // want to pick a lower limit x and upper limit y such that + // int_{f = x to y} sum_{c = 0 to 2*AN} P(AF=f | c, AN) df = 0.95 + // int_{f=x to y} calculateAFPosterior(f) df = 0.95 + // and that (y-x) is minimized + + // this is done by quantizing [0,1] into small bins and, since the distribution is + // unimodal, greedily adding them until the probability is >= 0.95 + + throw new ReviewedStingException("This walker is unsupported, and is not fully implemented", new NotImplementedException("bound allele frequency not implemented")); + } + + private double calculateAFPosterior(double[] likelihoods, double af) { + double[] probLiks = new double[likelihoods.length]; + for ( int c = 0; c < likelihoods.length; c++) { + probLiks[c] = calculateAFPosterior(c,likelihoods.length,af); + } + + return MathUtils.log10sumLog10(probLiks); + } + + private double calculateAFPosterior(int ac, int n, double af) { + // evaluate the allele frequency posterior distribution at AF given AC observations of N chromosomes + switch ( ac ) { + case 0: + return logAC0Coef(n) + n*Math.log10(1 - af) - Math.log10(af); + case 1: + return Math.log10(n) + (n-1)*Math.log10(1-af) - n*Math.log10(1-EPS_LOWER_LIMIT); + case 2: + return Math.log10(n) + Math.log10(n-1) + Math.log10(af) + (n-2)*Math.log10(1-af) - Math.log10(1-(n-1)*EPS_LOWER_LIMIT) - (n-1)*Math.log10(EPS_LOWER_LIMIT); + default: + return (ac-1)*Math.log10(af)+ac*Math.log10( (double) n-ac)-(n-ac)*af*Math.log10(Math.E) - MathUtils.log10Gamma(ac); + } + } + + private double logAC0Coef(int an) { + if ( ! 
logAC0Cache.containsKey(an) ) { + double coef = -Math.log10(EPS_LOWER_LIMIT); + for ( int k = 1; k <= an; k++ ) { + // note this should typically just be + // term = ( 1 - Math.pow(EPS_LOWER_LIMIT,k) ) * MathUtils.binomialCoefficient(an,k) / k + // but the 1-E term will just be 1, so we do the following to mitigate this problem + double binom = MathUtils.binomialCoefficient(an,k); + double eps_correction = EPS_LOWER_LIMIT*Math.pow(binom,1/k); + double term = binom/k - Math.pow(eps_correction,k); + if ( k % 2 == 0 ) { + coef += term; + } else { + coef -= term; + } + } + + logAC0Cache.put(an,coef); + } + + return logAC0Cache.get(an); + } + + private double adaptiveSimpson(double[] likelihoods, double start, double stop, double err, int cap) { + double mid = (start + stop)/2; + double size = stop-start; + double fa = calculateAFPosterior(likelihoods,start); + double fb = calculateAFPosterior(likelihoods,mid); + double fc = calculateAFPosterior(likelihoods,stop); + double s = (size/6)*(fa + 4*fc + fb); + double h = simpAux(likelihoods,start,stop,err,s,fa,fb,fc,cap); + return h; + } + + private double simpAux(double[] likelihoods, double a,double b,double eps,double s,double fa,double fb,double fc,double cap){ + if ( s == 0 ) + return -300.0; + double c = ( a + b )/2; + double h = b-a; + double d = (a + c)/2; + double e = (c + b)/2; + double fd = calculateAFPosterior(likelihoods, d); + double fe = calculateAFPosterior(likelihoods, e); + double s_l = (h/12)*(fa + 4*fd + fc); + double s_r = (h/12)*(fc + 4*fe + fb); + double s_2 = s_l + s_r; + if ( cap <= 0 || Math.abs(s_2 - s) <= 15*eps ){ + return Math.log10(s_2 + (s_2 - s)/15.0); + } + + return ExactAFCalculationModel.approximateLog10SumLog10(simpAux(likelihoods,a,c,eps/2,s_l,fa,fc,fd,cap-1),simpAux(likelihoods, c, b, eps / 2, s_r, fc, fb, fe, cap - 1)); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java index 503d87cbea..c7e5773937 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java @@ -39,7 +39,6 @@ import java.util.HashSet; import java.util.Set; -import java.util.TreeSet; /** @@ -71,12 +70,7 @@ public class UGCalcLikelihoods extends LocusWalker public void initialize() { // get all of the unique sample names - // if we're supposed to assume a single sample, do so - Set samples = new TreeSet(); - if ( UAC.ASSUME_SINGLE_SAMPLE != null ) - samples.add(UAC.ASSUME_SINGLE_SAMPLE); - else - samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java index d88e556878..97f7b21eb7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java @@ -35,9 +35,7 @@ import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Genotype; 
-import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @@ -108,9 +106,9 @@ public Integer reduce(VariantCallContext value, Integer sum) { return sum; try { - Map attrs = new HashMap(value.getAttributes()); - VariantContextUtils.calculateChromosomeCounts(value, attrs, true); - writer.add(VariantContext.modifyAttributes(value, attrs)); + VariantContextBuilder builder = new VariantContextBuilder(value); + VariantContextUtils.calculateChromosomeCounts(builder, true); + writer.add(builder.make()); } catch (IllegalArgumentException e) { throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name"); } @@ -128,27 +126,27 @@ private static VariantContext mergeVCsWithGLs(List VCs) { return null; VariantContext variantVC = null; - Map genotypes = new HashMap(); + GenotypesContext genotypes = GenotypesContext.create(); for ( VariantContext vc : VCs ) { if ( variantVC == null && vc.isVariant() ) variantVC = vc; - genotypes.putAll(getGenotypesWithGLs(vc.getGenotypes())); + genotypes.addAll(getGenotypesWithGLs(vc.getGenotypes())); } if ( variantVC == null ) { VariantContext vc = VCs.get(0); throw new UserException("There is no ALT allele in any of the VCF records passed in at " + vc.getChr() + ":" + vc.getStart()); } - return new VariantContext("VCwithGLs", variantVC.getChr(), variantVC.getStart(), variantVC.getEnd(), variantVC.getAlleles(), genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, null); + + return new VariantContextBuilder(variantVC).source("VCwithGLs").genotypes(genotypes).make(); } - private static Map getGenotypesWithGLs(Map genotypes) { - Map genotypesWithGLs = new HashMap(); - for ( Map.Entry g : genotypes.entrySet() ) { - if ( g.getValue().hasLikelihoods() && g.getValue().getLikelihoods().getAsVector() != null ) - genotypesWithGLs.put(g.getKey(), g.getValue()); + private static GenotypesContext getGenotypesWithGLs(GenotypesContext genotypes) { + GenotypesContext genotypesWithGLs = GenotypesContext.create(genotypes.size()); + for ( final Genotype g : genotypes ) { + if ( g.hasLikelihoods() && g.getLikelihoods().getAsVector() != null ) + genotypesWithGLs.add(g); } - return genotypesWithGLs; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 07d9892a10..5713432b42 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -25,10 +25,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -96,11 +93,6 @@ public class UnifiedArgumentCollection { @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when in GENOTYPE_MODE = GENOTYPE_GIVEN_ALLELES", required=false) public RodBinding alleles; - // control the error modes - 
@Hidden
-    @Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
-    public String ASSUME_SINGLE_SAMPLE = null;
-
     /**
      * The minimum confidence needed in a given base for it to be used in variant calling. Note that the base quality of a base
      * is capped by the mapping quality so that bases on reads with low mapping quality may get filtered out depending on this value.
      */
@@ -111,6 +103,22 @@ public class UnifiedArgumentCollection {
     @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
     public Double MAX_DELETION_FRACTION = 0.05;
 
+    /**
+     * The default behavior of the Unified Genotyper is to allow the genotyping of just one alternate allele in discovery mode; using this flag
+     * will enable the discovery of multiple alternate alleles. Please note that this works for SNPs only and that it is still highly experimental.
+     * For advanced users only.
+     */
+    @Advanced
+    @Argument(fullName = "multiallelic", shortName = "multiallelic", doc = "Allow the discovery of multiple alleles (SNPs only)", required = false)
+    public boolean MULTI_ALLELIC = false;
+
+    /**
+     * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN_ALLELES),
+     * then this site will be skipped and a warning printed. Note that genotyping sites with many alternate alleles is both CPU and memory intensive.
+     */
+    @Argument(fullName = "max_alternate_alleles", shortName = "maxAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
+    public int MAX_ALTERNATE_ALLELES = 5;
+
     // indel-related arguments
     /**
      * A candidate indel is genotyped (and potentially called) if there are this number of reads with a consensus indel at a site.
@@ -137,15 +145,6 @@ public class UnifiedArgumentCollection {
     @Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false)
     public int INDEL_HAPLOTYPE_SIZE = 80;
 
-    //gdebug+
-    // experimental arguments, NOT TO BE USED BY ANYONE WHOSE INITIALS AREN'T GDA!!!
-//    @Hidden
-//    @Argument(fullName = "getGapPenaltiesFromData", shortName = "dataGP", doc = "Vary gap penalties by context - EXPERIMENTAL, DO NOT USE", required = false)
-//    public boolean GET_GAP_PENALTIES_FROM_DATA = false;
-//
-//    @Hidden
-//    @Argument(fullName="indel_recal_file", shortName="recalFile", required=false, doc="Filename for the input covariates table recalibration .csv file - EXPERIMENTAL, DO NOT USE")
-//    public File INDEL_RECAL_FILE = new File("indel.recal_data.csv");
     @Hidden
     @Argument(fullName = "bandedIndel", shortName = "bandedIndel", doc = "Banded Indel likelihood computation", required = false)
     public boolean BANDED_INDEL_COMPUTATION = false;
@@ -170,7 +169,6 @@ public UnifiedArgumentCollection clone() {
         uac.GenotypingMode = GenotypingMode;
         uac.OutputMode = OutputMode;
         uac.COMPUTE_SLOD = COMPUTE_SLOD;
-        uac.ASSUME_SINGLE_SAMPLE = ASSUME_SINGLE_SAMPLE;
         uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING;
         uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING;
         uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE;
@@ -182,10 +180,12 @@ public UnifiedArgumentCollection clone() {
         uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO;
         uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE;
         uac.alleles = alleles;
+        uac.MAX_ALTERNATE_ALLELES = MAX_ALTERNATE_ALLELES;
 
         // todo- arguments to remove
         uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;
         uac.BANDED_INDEL_COMPUTATION = BANDED_INDEL_COMPUTATION;
+        uac.MULTI_ALLELIC = MULTI_ALLELIC;
 
         return uac;
     }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
index 72dc217e14..369c2d0c68 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
@@ -206,12 +206,7 @@ public static class UGStatistics {
      **/
     public void initialize() {
         // get all of the unique sample names
-        // if we're supposed to assume a single sample, do so
-        Set<String> samples = new TreeSet<String>();
-        if ( UAC.ASSUME_SINGLE_SAMPLE != null )
-            samples.add(UAC.ASSUME_SINGLE_SAMPLE);
-        else
-            samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
+        Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
 
         // initialize the verbose writer
         if ( verboseWriter != null )
@@ -263,7 +258,7 @@ private static Set<VCFFormatHeaderLine> getSupportedHeaderStrings() {
         Set<VCFFormatHeaderLine> result = new HashSet<VCFFormatHeaderLine>();
         result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
         result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality"));
-        result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
+        result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)"));
         result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"));
 
         return result;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
index 993a434ac0..aa33d39e3b 100755
---
a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -50,14 +50,18 @@ public class UnifiedGenotyperEngine { public static final String LOW_QUAL_FILTER_NAME = "LowQual"; public enum OUTPUT_MODE { - /** the default */ + /** produces calls only at variant sites */ EMIT_VARIANTS_ONLY, - /** include confident reference sites */ + /** produces calls at variant sites and confident reference sites */ EMIT_ALL_CONFIDENT_SITES, - /** any callable site regardless of confidence */ + /** produces calls at any callable site regardless of confidence; this argument is intended for point + * mutations (SNPs) only and while some indel calls may be produced they are by no means comprehensive */ EMIT_ALL_SITES } + protected static final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + protected static final double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. + // the unified argument collection private final UnifiedArgumentCollection UAC; public UnifiedArgumentCollection getUAC() { return UAC; } @@ -71,12 +75,13 @@ public enum OUTPUT_MODE { // the model used for calculating p(non-ref) private ThreadLocal afcm = new ThreadLocal(); - // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything - private final double[] log10AlleleFrequencyPriorsSNPs; - private final double[] log10AlleleFrequencyPriorsIndels; + // the allele frequency likelihoods and posteriors (allocated once as an optimization) + private ThreadLocal alleleFrequencyCalculationResult = new ThreadLocal(); + private ThreadLocal posteriorsArray = new ThreadLocal(); - // the allele frequency likelihoods (allocated once as an optimization) - private ThreadLocal log10AlleleFrequencyPosteriors = new ThreadLocal(); + // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything + private final double[][] log10AlleleFrequencyPriorsSNPs; + private final double[][] log10AlleleFrequencyPriorsIndels; // the priors object private final GenotypePriors genotypePriorsSNPs; @@ -95,8 +100,12 @@ public enum OUTPUT_MODE { // the standard filter to use for calls below the confidence threshold but above the emit threshold private static final Set filter = new HashSet(1); + private final GenomeLocParser genomeLocParser; private final boolean BAQEnabledOnCMDLine; + // a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles + // the representation is int[number of alternate alleles][PL index][pair of allele indexes (where reference = 0)] + protected static int[][][] PLIndexToAlleleIndex; // --------------------------------------------------------------------------------------------------------- @@ -106,17 +115,13 @@ public enum OUTPUT_MODE { // --------------------------------------------------------------------------------------------------------- @Requires({"toolkit != null", "UAC != null"}) public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) { - this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, - // get the number of samples - // if we're supposed to assume a single sample, do so - UAC.ASSUME_SINGLE_SAMPLE != null ? 
- new TreeSet(Arrays.asList(UAC.ASSUME_SINGLE_SAMPLE)) : - SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader())); + this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader())); } @Requires({"toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0"}) public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set samples) { this.BAQEnabledOnCMDLine = toolkit.getArguments().BAQMode != BAQ.CalculationMode.OFF; + genomeLocParser = toolkit.getGenomeLocParser(); this.samples = new TreeSet(samples); // note that, because we cap the base quality by the mapping quality, minMQ cannot be less than minBQ this.UAC = UAC.clone(); @@ -126,14 +131,35 @@ public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentColle this.annotationEngine = engine; N = 2 * this.samples.size(); - log10AlleleFrequencyPriorsSNPs = new double[N+1]; - log10AlleleFrequencyPriorsIndels = new double[N+1]; - computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, GenotypeLikelihoodsCalculationModel.Model.SNP); - computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, GenotypeLikelihoodsCalculationModel.Model.INDEL); + log10AlleleFrequencyPriorsSNPs = new double[UAC.MAX_ALTERNATE_ALLELES][N+1]; + log10AlleleFrequencyPriorsIndels = new double[UAC.MAX_ALTERNATE_ALLELES][N+1]; + computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity); + computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY); genotypePriorsSNPs = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.SNP); genotypePriorsIndels = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.INDEL); filter.add(LOW_QUAL_FILTER_NAME); + calculatePLcache(UAC.MAX_ALTERNATE_ALLELES); + } + + protected static void calculatePLcache(int maxAltAlleles) { + PLIndexToAlleleIndex = new int[maxAltAlleles+1][][]; + PLIndexToAlleleIndex[0] = new int[][]{ new int[]{0, 0} }; + int numLikelihoods = 1; + + // for each count of alternate alleles + for ( int altAlleles = 1; altAlleles <= maxAltAlleles; altAlleles++ ) { + numLikelihoods += altAlleles + 1; + PLIndexToAlleleIndex[altAlleles] = new int[numLikelihoods][]; + int PLindex = 0; + + // for all possible combinations of the 2 alt alleles + for ( int allele1 = 0; allele1 <= altAlleles; allele1++ ) { + for ( int allele2 = allele1; allele2 <= altAlleles; allele2++ ) { + PLIndexToAlleleIndex[altAlleles][PLindex++] = new int[]{ allele1, allele2 }; + } + } + } } /** @@ -156,7 +182,6 @@ public VariantCallContext calculateLikelihoodsAndGenotypes(RefMetaDataTracker tr } VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model); - if ( vc == null ) return null; @@ -218,14 +243,7 @@ private VariantContext calculateLikelihoods(RefMetaDataTracker tracker, Referenc glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC)); } - Map GLs = new HashMap(); - - Allele refAllele = glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), GLs, alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine); - - if ( refAllele != null ) - return createVariantContextFromLikelihoods(refContext, refAllele, GLs); - else - return null; + return glcm.get().get(model).getLikelihoods(tracker, 
refContext, stratifiedContexts, type, getGenotypePriors(model), alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine); } private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, AlignmentContext rawContext) { @@ -234,8 +252,7 @@ private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, Refe VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles); if ( vcInput == null ) return null; - vc = new VariantContext("UG_call", vcInput.getChr(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles(), InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, ref.getBase()); - + vc = new VariantContextBuilder(vcInput).source("UG_call").noID().referenceBaseForIndel(ref.getBase()).make(); } else { // deal with bad/non-standard reference bases if ( !Allele.acceptableAlleleBases(new byte[]{ref.getBase()}) ) @@ -243,7 +260,7 @@ private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, Refe Set alleles = new HashSet(); alleles.add(Allele.create(ref.getBase(), true)); - vc = new VariantContext("UG_call", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStart(), alleles); + vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStart(), alleles).make(); } if ( annotationEngine != null ) { @@ -253,7 +270,7 @@ private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, Refe pileup = rawContext.getExtendedEventPileup(); else if (rawContext.hasBasePileup()) pileup = rawContext.getBasePileup(); - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE); + stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); vc = annotationEngine.annotateContext(tracker, ref, stratifiedContexts, vc); } @@ -261,145 +278,164 @@ else if (rawContext.hasBasePileup()) return new VariantCallContext(vc, false); } - private VariantContext createVariantContextFromLikelihoods(ReferenceContext refContext, Allele refAllele, Map GLs) { - // no-call everyone for now - List noCall = new ArrayList(); - noCall.add(Allele.NO_CALL); - - Set alleles = new LinkedHashSet(); - alleles.add(refAllele); - boolean addedAltAlleles = false; - - HashMap genotypes = new HashMap(); - for ( MultiallelicGenotypeLikelihoods GL : GLs.values() ) { - if ( !addedAltAlleles ) { - addedAltAlleles = true; - // ordering important to maintain consistency - for (Allele a: GL.getAlleles()) { - alleles.add(a); - } - } - - HashMap attributes = new HashMap(); - //GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods()); - GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(GL.getLikelihoods()); - attributes.put(VCFConstants.DEPTH_KEY, GL.getDepth()); - attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods); - - genotypes.put(GL.getSample(), new Genotype(GL.getSample(), noCall, Genotype.NO_NEG_LOG_10PERROR, null, attributes, false)); - } - - GenomeLoc loc = refContext.getLocus(); - int endLoc = calculateEndPos(alleles, refAllele, loc); - - return new VariantContext("UG_call", - loc.getContig(), - loc.getStart(), - endLoc, - alleles, - genotypes, - VariantContext.NO_NEG_LOG_10PERROR, - null, - null, - refContext.getBase()); + public VariantCallContext calculateGenotypes(VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { + return 
calculateGenotypes(null, null, null, null, vc, model); } - // private method called by both UnifiedGenotyper and UGCallVariants entry points into the engine - private VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map stratifiedContexts, VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { + public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map stratifiedContexts, VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { + + boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; // initialize the data for this thread if that hasn't been done yet if ( afcm.get() == null ) { - log10AlleleFrequencyPosteriors.set(new double[N+1]); afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); + alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES, N)); + posteriorsArray.set(new double[N + 2]); + } + AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get(); + + // don't try to genotype too many alternate alleles + if ( vc.getAlternateAlleles().size() > UAC.MAX_ALTERNATE_ALLELES ) { + logger.warn("the Unified Genotyper is currently set to genotype at most " + UAC.MAX_ALTERNATE_ALLELES + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + vc.getAlternateAlleles().size() + " alternate alleles; see the --max_alternate_alleles argument"); + return null; } // estimate our confidence in a reference call and return - if ( vc.getNSamples() == 0 ) + if ( vc.getNSamples() == 0 ) { + if ( limitedContext ) + return null; return (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ? estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), false, 1.0) : generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext)); + } // 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position) - clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); - - // find the most likely frequency - int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()); + clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); + clearAFarray(AFresult.log10AlleleFrequencyPosteriors); + afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult); + + // is the most likely frequency conformation AC=0 for all alternate alleles? 
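+        // (the exact model fills in one marginal posterior array per alternate
+        // allele, indexed by that allele's count AC, with the AC=0 mass tracked
+        // separately in log10PosteriorOfAFzero; the loop below therefore keeps an
+        // allele only when some AC > 0 outscores that reference-only mass, except
+        // in GENOTYPE_GIVEN_ALLELES mode where user-supplied alleles are always kept)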
+ boolean bestGuessIsRef = true; + + // determine which alternate alleles have AF>0 + boolean[] altAllelesToUse = new boolean[vc.getAlternateAlleles().size()]; + for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { + int indexOfBestAC = MathUtils.maxElementIndex(AFresult.log10AlleleFrequencyPosteriors[i]); + + // if the most likely AC is not 0, then this is a good alternate allele to use; + // make sure to test against log10PosteriorOfAFzero since that no longer is an entry in the array + if ( indexOfBestAC != 0 && AFresult.log10AlleleFrequencyPosteriors[i][indexOfBestAC] > AFresult.log10PosteriorOfAFzero ) { + altAllelesToUse[i] = true; + bestGuessIsRef = false; + } + // if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele + else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { + altAllelesToUse[i] = true; + } + } - // calculate p(f>0) - double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get()); - double sum = 0.0; - for (int i = 1; i <= N; i++) - sum += normalizedPosteriors[i]; - double PofF = Math.min(sum, 1.0); // deal with precision errors + // calculate p(f>0): + // because the likelihoods are marginalized for each alternate allele, we only need to compare log10PosteriorOfAFzero against any one of them + final double[] normalizedPosteriors = generateNormalizedPosteriors(AFresult, posteriorsArray.get()); + final double PofF = 1.0 - normalizedPosteriors[0]; double phredScaledConfidence; - if ( bestAFguess != 0 || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { + if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); if ( Double.isInfinite(phredScaledConfidence) ) - phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0]; + phredScaledConfidence = -10.0 * AFresult.log10PosteriorOfAFzero; } else { phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); if ( Double.isInfinite(phredScaledConfidence) ) { - sum = 0.0; + double sum = AFresult.log10AlleleFrequencyPosteriors[0][0]; + if ( sum == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) + sum = 0.0; for (int i = 1; i <= N; i++) { - if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) + if ( AFresult.log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) break; - sum += log10AlleleFrequencyPosteriors.get()[i]; + sum += AFresult.log10AlleleFrequencyPosteriors[0][i]; } phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); } } // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero - if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestAFguess) ) { + if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) { // technically, at this point our confidence in a reference call isn't accurately estimated // because it didn't take into account samples with no data, so let's get a better estimate - return estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), true, 1.0 - PofF); + return limitedContext ? 
null : estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), true, 1.0 - PofF); + } + + // strip out any alleles that aren't going to be used in the VariantContext + final List myAlleles; + if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) { + myAlleles = new ArrayList(vc.getAlleles().size()); + myAlleles.add(vc.getReference()); + for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { + if ( altAllelesToUse[i] ) + myAlleles.add(vc.getAlternateAllele(i)); + } + } else { + // use all of the alleles if we are given them by the user + myAlleles = vc.getAlleles(); + } + + // start constructing the resulting VC + final GenomeLoc loc = genomeLocParser.createGenomeLoc(vc); + final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles); + builder.log10PError(phredScaledConfidence/-10.0); + if ( ! passesCallThreshold(phredScaledConfidence) ) + builder.filters(filter); + if ( limitedContext ) { + builder.referenceBaseForIndel(vc.getReferenceBaseForIndel()); + } else { + builder.referenceBaseForIndel(refContext.getBase()); } // create the genotypes - Map genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess); + final GenotypesContext genotypes = assignGenotypes(vc, altAllelesToUse); // print out stats if we have a writer - if ( verboseWriter != null ) + if ( verboseWriter != null && !limitedContext ) printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, normalizedPosteriors, model); // *** note that calculating strand bias involves overwriting data structures, so we do that last - HashMap attributes = new HashMap(); + final HashMap attributes = new HashMap(); // if the site was downsampled, record that fact - if ( rawContext.hasPileupBeenDownsampled() ) + if ( !limitedContext && rawContext.hasPileupBeenDownsampled() ) attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); - - if ( UAC.COMPUTE_SLOD && bestAFguess != 0 ) { + if ( UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef ) { //final boolean DEBUG_SLOD = false; // the overall lod VariantContext vcOverall = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, vc.getAlternateAllele(0), false, model); - clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); - //double overallLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; - double overallLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); + clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); + clearAFarray(AFresult.log10AlleleFrequencyPosteriors); + afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult); + //double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; + double overallLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); // the forward lod VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, vc.getAlternateAllele(0), false, model); - clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), 
getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); - //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); - double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; - double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); + clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); + clearAFarray(AFresult.log10AlleleFrequencyPosteriors); + afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult); + //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); + double forwardLog10PofNull = AFresult.log10PosteriorOfAFzero; + double forwardLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); // the reverse lod VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, vc.getAlternateAllele(0), false, model); - clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); - //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); - double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; - double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); + clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); + clearAFarray(AFresult.log10AlleleFrequencyPosteriors); + afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), AFresult); + //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); + double reverseLog10PofNull = AFresult.log10PosteriorOfAFzero; + double reverseLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; @@ -412,30 +448,23 @@ private VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, Refere strandScore *= 10.0; //logger.debug(String.format("SLOD=%f", strandScore)); - attributes.put("SB", strandScore); + if ( !Double.isNaN(strandScore) ) + attributes.put("SB", strandScore); } - GenomeLoc loc = refContext.getLocus(); - - int endLoc = calculateEndPos(vc.getAlleles(), vc.getReference(), loc); - - Set myAlleles = new HashSet(vc.getAlleles()); - // strip out the alternate allele if it's a ref call - if ( bestAFguess == 0 && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) { - myAlleles = new HashSet(1); - myAlleles.add(vc.getReference()); - } - VariantContext vcCall = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc, - myAlleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence) ? 
null : filter, attributes, refContext.getBase()); + // finish constructing the resulting VC + builder.genotypes(genotypes); + builder.attributes(attributes); + VariantContext vcCall = builder.make(); - if ( annotationEngine != null ) { + if ( annotationEngine != null && !limitedContext ) { // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations ReadBackedPileup pileup = null; if (rawContext.hasExtendedEventPileup()) pileup = rawContext.getExtendedEventPileup(); else if (rawContext.hasBasePileup()) pileup = rawContext.getBasePileup(); - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE); + stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall); } @@ -443,112 +472,10 @@ else if (rawContext.hasBasePileup()) return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); } - // A barebones entry point to the exact model when there is no tracker or stratified contexts available -- only GLs - public VariantCallContext calculateGenotypes(final VariantContext vc, final GenomeLoc loc, final GenotypeLikelihoodsCalculationModel.Model model) { - - // initialize the data for this thread if that hasn't been done yet - if ( afcm.get() == null ) { - log10AlleleFrequencyPosteriors.set(new double[N+1]); - afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); - } - - // estimate our confidence in a reference call and return - if ( vc.getNSamples() == 0 ) - return null; - - // 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position) - clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); - - // find the most likely frequency - int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()); - - // calculate p(f>0) - double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get()); - double sum = 0.0; - for (int i = 1; i <= N; i++) - sum += normalizedPosteriors[i]; - double PofF = Math.min(sum, 1.0); // deal with precision errors - - double phredScaledConfidence; - if ( bestAFguess != 0 || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); - if ( Double.isInfinite(phredScaledConfidence) ) - phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0]; - } else { - phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); - if ( Double.isInfinite(phredScaledConfidence) ) { - sum = 0.0; - for (int i = 1; i <= N; i++) { - if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) - break; - sum += log10AlleleFrequencyPosteriors.get()[i]; - } - phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 
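These hunks are part of the commit-wide move from the wide VariantContext constructor to VariantContextBuilder. Gathered in one place, the staged construction reads as follows (a fragment assembled from the hunks above, reusing the walker's own names rather than defining new ones):

    // One builder replaces the old many-argument VariantContext constructor.
    final VariantContextBuilder builder =
            new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles);
    builder.log10PError(phredScaledConfidence / -10.0);   // QUAL stored as log10 P(error)
    if ( !passesCallThreshold(phredScaledConfidence) )
        builder.filters(filter);                          // mark low-confidence calls
    builder.genotypes(genotypes);                         // attached once computed
    builder.attributes(attributes);                       // INFO-field annotations, set last
    final VariantContext vcCall = builder.make();         // immutable result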
0 : -10.0 * sum); - } - } - - // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero - if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestAFguess) ) { - // technically, at this point our confidence in a reference call isn't accurately estimated - // because it didn't take into account samples with no data, so let's get a better estimate - return null; - } - - // create the genotypes - Map genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess); - - // *** note that calculating strand bias involves overwriting data structures, so we do that last - HashMap attributes = new HashMap(); - - int endLoc = calculateEndPos(vc.getAlleles(), vc.getReference(), loc); - - Set myAlleles = new HashSet(vc.getAlleles()); - // strip out the alternate allele if it's a ref call - if ( bestAFguess == 0 && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) { - myAlleles = new HashSet(1); - myAlleles.add(vc.getReference()); - } - VariantContext vcCall = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc, - myAlleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence) ? null : filter, attributes, vc.getReferenceBaseForIndel()); - - return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); - } - - private int calculateEndPos(Collection alleles, Allele refAllele, GenomeLoc loc) { - // TODO - temp fix until we can deal with extended events properly - // for indels, stop location is one more than ref allele length - boolean isSNP = true, hasNullAltAllele = false; - for (Allele a : alleles){ - if (a.length() != 1) { - isSNP = false; - break; - } - } - for (Allele a : alleles){ - if (a.isNull()) { - hasNullAltAllele = true; - break; - } - } - // standard deletion: ref allele length = del length. 
endLoc = startLoc + refAllele.length(), alt allele = null - // standard insertion: ref allele length = 0, endLos = startLoc - // mixed: want end loc = start Loc for case {A*,AT,T} but say {ATG*,A,T} : want then end loc = start loc + refAllele.length - // So, in general, end loc = startLoc + refAllele.length, except in complex substitutions where it's one less - // - // todo - this is unnecessarily complicated and is so just because of Tribble's arbitrary vc conventions, should be cleaner/simpler, - // the whole vc processing infrastructure seems too brittle and riddled with special case handling - - - int endLoc = loc.getStart(); - if ( !isSNP) { - endLoc += refAllele.length(); - if(!hasNullAltAllele) - endLoc--; - - } - - return endLoc; + private double[] generateNormalizedPosteriors(AlleleFrequencyCalculationResult AFresult, double[] normalizedPosteriors) { + normalizedPosteriors[0] = AFresult.log10PosteriorOfAFzero; + System.arraycopy(AFresult.log10AlleleFrequencyPosteriors[0], 0, normalizedPosteriors, 1, normalizedPosteriors.length-1); + return MathUtils.normalizeFromLog10(normalizedPosteriors); } private Map getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) { @@ -569,7 +496,7 @@ private Map getFilteredAndStratifiedContexts(UnifiedAr return null; // stratify the AlignmentContext and cut by sample - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE); + stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); } else { @@ -586,12 +513,12 @@ private Map getFilteredAndStratifiedContexts(UnifiedAr return null; // stratify the AlignmentContext and cut by sample - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE); + stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); } } else if ( model == GenotypeLikelihoodsCalculationModel.Model.SNP ) { // stratify the AlignmentContext and cut by sample - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(rawContext.getBasePileup(), UAC.ASSUME_SINGLE_SAMPLE); + stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(rawContext.getBasePileup()); if( !(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) ) { int numDeletions = 0; @@ -607,9 +534,12 @@ private Map getFilteredAndStratifiedContexts(UnifiedAr return stratifiedContexts; } - protected static void clearAFarray(double[] AFs) { - for ( int i = 0; i < AFs.length; i++ ) - AFs[i] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + protected static void clearAFarray(double[][] AFs) { + for ( int i = 0; i < AFs.length; i++ ) { + for ( int j = 0; j < AFs[i].length; j++ ) { + AFs[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + } + } } private final static double[] binomialProbabilityDepthCache = new double[10000]; @@ -679,10 +609,10 @@ protected void printVerboseData(String pos, VariantContext vc, double PofF, doub AFline.append(i + "/" + N + "\t"); AFline.append(String.format("%.2f\t", ((float)i)/N)); AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i])); - if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED) + if ( alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i] == 
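The removed calculateEndPos encoded the old Tribble stop-position convention that VariantContextBuilder now handles internally. Restated from the deleted code as a worked example (positions illustrative):

    // Per the removed code: end = start for SNPs; otherwise
    // end = start + refAllele.length(), minus 1 unless some allele is null.
    int start = 100;
    // SNP       {A*, T}       -> all alleles length 1    -> end = 100
    // deletion  {AT*, -}      -> null alt allele present -> end = 100 + 2     = 102
    // mixed     {ATG*, A, T}  -> no null allele          -> end = 100 + 3 - 1 = 102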
AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED) AFline.append("0.00000000\t"); else - AFline.append(String.format("%.8f\t", log10AlleleFrequencyPosteriors.get()[i])); + AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[i])); AFline.append(String.format("%.8f\t", normalizedPosteriors[i])); verboseWriter.println(AFline.toString()); } @@ -692,8 +622,8 @@ protected void printVerboseData(String pos, VariantContext vc, double PofF, doub verboseWriter.println(); } - protected boolean passesEmitThreshold(double conf, int bestAFguess) { - return (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_CONFIDENT_SITES || bestAFguess != 0) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); + protected boolean passesEmitThreshold(double conf, boolean bestGuessIsRef) { + return (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_CONFIDENT_SITES || !bestGuessIsRef) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); } protected boolean passesCallThreshold(double conf) { @@ -747,27 +677,25 @@ else if (UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.INDEL) return null; } - protected void computeAlleleFrequencyPriors(int N, final double[] priors, final GenotypeLikelihoodsCalculationModel.Model model) { - // calculate the allele frequency priors for 1-N - double sum = 0.0; - double heterozygosity; + protected static void computeAlleleFrequencyPriors(final int N, final double[][] priors, final double theta) { - if (model == GenotypeLikelihoodsCalculationModel.Model.INDEL) - heterozygosity = UAC.INDEL_HETEROZYGOSITY; - else - heterozygosity = UAC.heterozygosity; - - for (int i = 1; i <= N; i++) { - double value = heterozygosity / (double)i; - priors[i] = Math.log10(value); - sum += value; - } + // the dimension here is the number of alternate alleles; with e.g. 
2 alternate alleles the prior will be theta^2 / i + for (int alleles = 1; alleles <= priors.length; alleles++) { + double sum = 0.0; + + // for each i + for (int i = 1; i <= N; i++) { + double value = Math.pow(theta, alleles) / (double)i; + priors[alleles-1][i] = Math.log10(value); + sum += value; + } - // null frequency for AF=0 is (1 - sum(all other frequencies)) - priors[0] = Math.log10(1.0 - sum); + // null frequency for AF=0 is (1 - sum(all other frequencies)) + priors[alleles-1][0] = Math.log10(1.0 - sum); + } } - protected double[] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) { + protected double[][] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) { switch( model ) { case SNP: return log10AlleleFrequencyPriorsSNPs; @@ -816,9 +744,6 @@ private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObje case EXACT: afcm = new ExactAFCalculationModel(UAC, N, logger, verboseWriter); break; - case GRID_SEARCH: - afcm = new GridSearchAFEstimation(UAC, N, logger, verboseWriter); - break; default: throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel); } @@ -843,4 +768,94 @@ public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, Ref return vc; } + + /** + * @param vc variant context with genotype likelihoods + * @param allelesToUse bit vector describing which alternate alleles from the vc are okay to use + * @return genotypes + */ + public static GenotypesContext assignGenotypes(final VariantContext vc, + final boolean[] allelesToUse) { + + // the no-called genotypes + final GenotypesContext GLs = vc.getGenotypes(); + + // samples + final List sampleIndices = GLs.getSampleNamesOrderedByName(); + + // the new called genotypes to create + final GenotypesContext calls = GenotypesContext.create(); + + // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward + final int numOriginalAltAlleles = allelesToUse.length; + final List newAlleles = new ArrayList(numOriginalAltAlleles+1); + newAlleles.add(vc.getReference()); + for ( int i = 0; i < numOriginalAltAlleles; i++ ) { + if ( allelesToUse[i] ) + newAlleles.add(vc.getAlternateAllele(i)); + } + final int numNewAltAlleles = newAlleles.size() - 1; + ArrayList likelihoodIndexesToUse = null; + + // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, + // then we can keep the PLs as is; otherwise, we determine which ones to keep + if ( numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0 ) { + likelihoodIndexesToUse = new ArrayList(30); + final int[][] PLcache = PLIndexToAlleleIndex[numOriginalAltAlleles]; + + for ( int PLindex = 0; PLindex < PLcache.length; PLindex++ ) { + int[] alleles = PLcache[PLindex]; + // consider this entry only if both of the alleles are good + if ( (alleles[0] == 0 || allelesToUse[alleles[0] - 1]) && (alleles[1] == 0 || allelesToUse[alleles[1] - 1]) ) + likelihoodIndexesToUse.add(PLindex); + } + } + + // create the new genotypes + for ( int k = GLs.size() - 1; k >= 0; k-- ) { + final String sample = sampleIndices.get(k); + final Genotype g = GLs.get(sample); + if ( !g.hasLikelihoods() ) + continue; + + // create the new likelihoods array from the alleles we are allowed to use + final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); + double[] newLikelihoods; + if ( likelihoodIndexesToUse == null ) { + newLikelihoods = originalLikelihoods; + } else { + 
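computeAlleleFrequencyPriors above generalizes the old single-allele heterozygosity prior to one row per alternate-allele count k: the prior on AF = i is theta^k / i, with the leftover mass assigned to AF = 0. A self-contained sketch mirroring the patch's loop (theta and N chosen arbitrarily):

    final double theta = 1e-3;                       // heterozygosity (illustrative)
    final int N = 4;                                 // number of chromosomes (illustrative)
    final double[][] priors = new double[2][N + 1];  // rows: k = 1 and k = 2 alt alleles

    for (int k = 1; k <= priors.length; k++) {
        double sum = 0.0;
        for (int i = 1; i <= N; i++) {               // AF = i out of N chromosomes
            double value = Math.pow(theta, k) / i;
            priors[k - 1][i] = Math.log10(value);
            sum += value;
        }
        priors[k - 1][0] = Math.log10(1.0 - sum);    // remaining mass: AF == 0
    }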
newLikelihoods = new double[likelihoodIndexesToUse.size()]; + int newIndex = 0; + for ( int oldIndex : likelihoodIndexesToUse ) + newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; + + // might need to re-normalize + newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); + } + + // if there is no mass on the (new) likelihoods and we actually have alternate alleles, then just no-call the sample + if ( MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) { + calls.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false)); + continue; + } + + // find the genotype with maximum likelihoods + int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods); + int[] alleles = PLIndexToAlleleIndex[numNewAltAlleles][PLindex]; + + ArrayList myAlleles = new ArrayList(); + myAlleles.add(newAlleles.get(alleles[0])); + myAlleles.add(newAlleles.get(alleles[1])); + + final double qual = numNewAltAlleles == 0 ? Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods); + Map attrs = new HashMap(g.getAttributes()); + if ( numNewAltAlleles == 0 ) + attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY); + else + attrs.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(newLikelihoods)); + calls.add(new Genotype(sample, myAlleles, qual, null, attrs, false)); + } + + return calls; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java index 3b3f54b05f..200a250f24 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java @@ -205,7 +205,7 @@ public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord byte haplotypeBase; if (haplotypeIndex < RIGHT_ALIGN_INDEX) - haplotypeBase = haplotype.getBasesAsBytes()[haplotypeIndex]; + haplotypeBase = haplotype.getBases()[haplotypeIndex]; else haplotypeBase = (byte)0; // dummy @@ -217,7 +217,7 @@ public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord if (readQual > 3) pRead += pBaseRead; haplotypeIndex++; - if (haplotypeIndex >= haplotype.getBasesAsBytes().length) + if (haplotypeIndex >= haplotype.getBases().length) haplotypeIndex = RIGHT_ALIGN_INDEX; //System.out.format("H:%c R:%c RQ:%d HI:%d %4.5f %4.5f\n", haplotypeBase, readBase, (int)readQual, haplotypeIndex, pBaseRead, pRead); } @@ -227,8 +227,8 @@ public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord System.out.println(read.getReadName()); System.out.print("Haplotype:"); - for (int k=0; k LEFT_ALIGN_INDEX && indX < RIGHT_ALIGN_INDEX) - haplotypeBase = haplotype.getBasesAsBytes()[indX-1]; + haplotypeBase = haplotype.getBases()[indX-1]; else haplotypeBase = readBase; @@ -296,8 +296,8 @@ public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord System.out.println(read.getReadName()); System.out.print("Haplotype:"); - for (int k=0; k = haplotype.getBasesAsBytes().length || indStart > indStop) { + if (indStart < 0 || indStop >= haplotype.getBases().length || indStart > indStop) { // read spanned more than allowed reference context: we currently can't deal with this readLikelihood =0; } else { - final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBasesAsBytes(), + 
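Back in assignGenotypes above, the PL-index filter keeps exactly the genotype entries whose two alleles both survive allele subsetting. A concrete fragment for two alternate alleles (assumes import java.util.ArrayList and java.util.List; the pair table below plays the role of PLIndexToAlleleIndex[2]):

    final boolean[] allelesToUse = { true, false };  // keep A1, drop A2
    // Diploid PL ordering for alleles {ref, A1, A2}, per the VCF spec:
    // (0,0) (0,1) (1,1) (0,2) (1,2) (2,2)
    final int[][] plCache = { {0,0}, {0,1}, {1,1}, {0,2}, {1,2}, {2,2} };

    final List<Integer> keep = new ArrayList<Integer>();
    for (int plIndex = 0; plIndex < plCache.length; plIndex++) {
        final int[] a = plCache[plIndex];
        // keep an entry only if both of its alleles are ref (0) or a retained alt
        if ((a[0] == 0 || allelesToUse[a[0] - 1]) && (a[1] == 0 || allelesToUse[a[1] - 1]))
            keep.add(plIndex);
    }
    // keep == [0, 1, 2]: exactly the PLs for genotypes over {ref, A1}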
final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); if (matchMetricArray == null) { @@ -629,7 +631,7 @@ private int computeFirstDifferingPosition(byte[] b1, byte[] b2) { return 0; // sanity check for (int i=0; i < b1.length; i++ ){ - if ( b1[i]!= b2[i]) + if ( b1[i]!= b2[i] ) return i; } return b1.length; @@ -640,7 +642,7 @@ private int computeFirstDifferingPosition(double[] b1, double[] b2) { return 0; // sanity check for (int i=0; i < b1.length; i++ ){ - if ( b1[i]!= b2[i]) + if ( MathUtils.compareDoubles(b1[i], b2[i]) != 0 ) return i; } return b1.length; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java index 414ffa09cc..aa9ae1517e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java @@ -58,9 +58,7 @@ import org.broadinstitute.sting.utils.interval.OverlappingIntervalIterator; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.*; import java.util.*; @@ -1057,16 +1055,15 @@ public void printVCFLine(VCFWriter vcf, IndelPrecall call) { stop += event_length; } - Map genotypes = new HashMap(); - + GenotypesContext genotypes = GenotypesContext.create(); for ( String sample : normalSamples ) { - Map attrs = call.makeStatsAttributes(null); + Map attrs = call.makeStatsAttributes(null); if ( call.isCall() ) // we made a call - put actual het genotype here: - genotypes.put(sample,new Genotype(sample,alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrs,false)); + genotypes.add(new Genotype(sample,alleles,Genotype.NO_LOG10_PERROR,null,attrs,false)); else // no call: genotype is ref/ref (but alleles still contain the alt if we observed anything at all) - genotypes.put(sample,new Genotype(sample, homref_alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrs,false)); + genotypes.add(new Genotype(sample, homref_alleles,Genotype.NO_LOG10_PERROR,null,attrs,false)); } Set filters = null; @@ -1074,8 +1071,8 @@ public void printVCFLine(VCFWriter vcf, IndelPrecall call) { filters = new HashSet(); filters.add("NoCall"); } - VariantContext vc = new VariantContext("IGv2_Indel_call", refName, start, stop, alleles, genotypes, - -1.0 /* log error */, filters, null, refBases[(int)start-1]); + VariantContext vc = new VariantContextBuilder("IGv2_Indel_call", refName, start, stop, alleles) + .genotypes(genotypes).filters(filters).referenceBaseForIndel(refBases[(int)start-1]).make(); vcf.add(vc); } @@ -1147,14 +1144,14 @@ public void printVCFLine(VCFWriter vcf, IndelPrecall nCall, IndelPrecall tCall) homRefAlleles.add( alleles.get(0)); homRefAlleles.add( alleles.get(0)); - Map genotypes = new HashMap(); + GenotypesContext genotypes = GenotypesContext.create(); for ( String sample : normalSamples ) { - genotypes.put(sample,new Genotype(sample, homRefN ? homRefAlleles : alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrsNormal,false)); + genotypes.add(new Genotype(sample, homRefN ? 
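In computeFirstDifferingPosition above, the patch replaces an exact != between doubles with MathUtils.compareDoubles, avoiding spurious mismatches from floating-point rounding. A stand-in showing what such a tolerance compare does (the real MathUtils signature and its default epsilon may differ):

    // Tolerance-based double comparison: values within epsilon compare as equal.
    static int compareDoubles(final double a, final double b, final double epsilon) {
        if (Math.abs(a - b) < epsilon)
            return 0;
        return a < b ? -1 : 1;
    }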
homRefAlleles : alleles,Genotype.NO_LOG10_PERROR,null,attrsNormal,false)); } for ( String sample : tumorSamples ) { - genotypes.put(sample,new Genotype(sample, homRefT ? homRefAlleles : alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrsTumor,false) ); + genotypes.add(new Genotype(sample, homRefT ? homRefAlleles : alleles,Genotype.NO_LOG10_PERROR,null,attrsTumor,false) ); } Set filters = null; @@ -1171,8 +1168,8 @@ public void printVCFLine(VCFWriter vcf, IndelPrecall nCall, IndelPrecall tCall) filters.add("TCov"); } - VariantContext vc = new VariantContext("IGv2_Indel_call", refName, start, stop, alleles, genotypes, - -1.0 /* log error */, filters, attrs, refBases[(int)start-1]); + VariantContext vc = new VariantContextBuilder("IGv2_Indel_call", refName, start, stop, alleles) + .genotypes(genotypes).filters(filters).attributes(attrs).referenceBaseForIndel(refBases[(int)start-1]).make(); vcf.add(vc); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java index 5a32479aba..54838b55ec 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java @@ -29,7 +29,7 @@ import java.util.LinkedList; import java.util.List; -public abstract class BaseArray implements Comparable { +abstract class BaseArray implements Comparable { protected Byte[] bases; public BaseArray(byte[] bases) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CardinalityCounter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CardinalityCounter.java index 06f4d3ab23..45a1ab04c8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CardinalityCounter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CardinalityCounter.java @@ -30,7 +30,7 @@ /* * CardinalityCounter object allows user to iterate over all assignment of arbitrary-cardinality variables. */ -public class CardinalityCounter implements Iterator, Iterable { +class CardinalityCounter implements Iterator, Iterable { private int[] cards; private int[] valList; private boolean hasNext; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CloneableIteratorLinkedList.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CloneableIteratorLinkedList.java index 4ec940f4f7..e88a7104dc 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CloneableIteratorLinkedList.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/CloneableIteratorLinkedList.java @@ -30,7 +30,7 @@ It is UNIQUE in the fact that its iterator (BidirectionalIterator) can be cloned to save the current pointer for a later time (while the original iterator can continue to iterate). 
*/ -public class CloneableIteratorLinkedList { +class CloneableIteratorLinkedList { private CloneableIteratorDoublyLinkedNode first; private CloneableIteratorDoublyLinkedNode last; private int size; diff --git a/public/java/src/org/broadinstitute/sting/utils/DisjointSet.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/DisjointSet.java similarity index 97% rename from public/java/src/org/broadinstitute/sting/utils/DisjointSet.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/DisjointSet.java index 52c18e6d65..c054af5d6a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/DisjointSet.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/DisjointSet.java @@ -21,13 +21,13 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.utils; +package org.broadinstitute.sting.gatk.walkers.phasing; import java.util.Collection; import java.util.Set; import java.util.TreeSet; -public class DisjointSet { +class DisjointSet { private ItemNode[] nodes; public DisjointSet(int numItems) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java index 3c20a311e6..61d5a725e5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java @@ -27,7 +27,7 @@ import java.util.Arrays; -public class Haplotype extends BaseArray implements Cloneable { +class Haplotype extends BaseArray implements Cloneable { public Haplotype(byte[] bases) { super(bases); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeAndMatchHaplotypes.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeAndMatchHaplotypes.java deleted file mode 100644 index 306509d0cc..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeAndMatchHaplotypes.java +++ /dev/null @@ -1,118 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; -import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; - -import java.util.*; - -/** - * Merges read-back-phased and phase-by-transmission files. 
- */ -public class MergeAndMatchHaplotypes extends RodWalker { - @Output - protected VCFWriter vcfWriter = null; - - @Input(fullName="pbt", shortName = "pbt", doc="Input VCF truth file", required=true) - public RodBinding pbtTrack; - - @Input(fullName="rbp", shortName = "rbp", doc="Input VCF truth file", required=true) - public RodBinding rbpTrack; - - private Map pbtCache = new HashMap(); - private Map rbpCache = new HashMap(); - - private final String SOURCE_NAME = "MergeReadBackedAndTransmissionPhasedVariants"; - - public void initialize() { - ArrayList rodNames = new ArrayList(); - rodNames.add(pbtTrack.getName()); - - Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); - Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); - Set headerLines = new HashSet(); - headerLines.addAll(VCFUtils.getHeaderFields(this.getToolkit())); - - vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples)); - } - - @Override - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker != null) { - Collection pbts = tracker.getValues(pbtTrack, ref.getLocus()); - Collection rbps = tracker.getValues(rbpTrack, ref.getLocus()); - - VariantContext pbt = pbts.iterator().hasNext() ? pbts.iterator().next() : null; - VariantContext rbp = rbps.iterator().hasNext() ? rbps.iterator().next() : null; - - if (pbt != null && rbp != null) { - Map genotypes = pbt.getGenotypes(); - - if (!rbp.isFiltered()) { - for (String sample : rbp.getSampleNames()) { - Genotype rbpg = rbp.getGenotype(sample); - Genotype pbtg = pbt.getGenotype(sample); - - // Propagate read-backed phasing information to genotypes unphased by transmission - //if (!pbtg.isPhased() && rbpCache.containsKey(sample)) { - if (!pbtg.isPhased() && rbpg.isPhased() && rbpCache.containsKey(sample)) { - boolean orientationMatches = rbpCache.get(sample).sameGenotype(pbtCache.get(sample), false); - - if (orientationMatches) { - pbtg = rbpg; - } else { - List fwdAlleles = rbpg.getAlleles(); - List revAlleles = new ArrayList(); - - for (int i = fwdAlleles.size() - 1; i >= 0; i--) { - revAlleles.add(fwdAlleles.get(i)); - } - - pbtg = new Genotype(sample, revAlleles, rbpg.getNegLog10PError(), rbpg.getFilters(), rbpg.getAttributes(), rbpg.isPhased()); - } - } - - genotypes.put(sample, pbtg); - - // Update the cache - if (/*rbpg.isPhased() &&*/ rbpg.isHet()) { - rbpCache.put(sample, rbpg); - pbtCache.put(sample, pbtg); - } else if (!rbpg.isPhased()) { - rbpCache.remove(sample); - pbtCache.remove(sample); - } - } - } - - VariantContext newvc = new VariantContext(SOURCE_NAME, pbt.getChr(), pbt.getStart(), pbt.getStart(), pbt.getAlleles(), genotypes, pbt.getNegLog10PError(), pbt.getFilters(), pbt.getAttributes()); - vcfWriter.add(newvc); - } - } - - return null; - } - - @Override - public Integer reduceInit() { - return null; - } - - @Override - public Integer reduce(Integer value, Integer sum) { - return null; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsWalker.java deleted file mode 100644 index 809772c05b..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsWalker.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * 
files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; -import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.*; - -import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods; - - -/** - * Walks along all variant ROD loci, and merges consecutive sites if they segregate in all samples in the ROD. 
- */ -@Allows(value = {DataSource.REFERENCE}) -@Requires(value = {DataSource.REFERENCE}) -@By(DataSource.REFERENCE_ORDERED_DATA) - -public class MergeMNPsWalker extends RodWalker { - - @Output(doc = "File to which variants should be written", required = true) - protected VCFWriter writer = null; - private MergeSegregatingAlternateAllelesVCFWriter vcMergerWriter = null; - - @Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record; [default:1]", required = false) - protected int maxGenomicDistanceForMNP = 1; - - @Input(fullName="variant", shortName = "V", doc="Select variants from this VCF file", required=true) - public RodBinding variants; - - public void initialize() { - initializeVcfWriter(); - } - - private void initializeVcfWriter() { - // false <-> don't take control of writer, since didn't create it: - vcMergerWriter = new MergeSegregatingAlternateAllelesVCFWriter(writer, getToolkit().getGenomeLocParser(), getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, logger, false); - writer = null; // so it can't be accessed directly [i.e., not through vcMergerWriter] - - // setup the header fields: - Set hInfo = new HashSet(); - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); - - Map rodNameToHeader = getVCFHeadersFromRods(getToolkit(), Arrays.asList(variants.getName())); - vcMergerWriter.writeHeader(new VCFHeader(hInfo, new TreeSet(rodNameToHeader.get(variants.getName()).getGenotypeSamples()))); - } - - public boolean generateExtendedEvents() { - return false; - } - - public Integer reduceInit() { - return 0; - } - - /** - * For each site, send it to be (possibly) merged with previously observed sites. - * - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return dummy Integer - */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker == null) - return null; - - for (VariantContext vc : tracker.getValues(variants, context.getLocation())) - writeVCF(vc); - - return 0; - } - - private void writeVCF(VariantContext vc) { - WriteVCF.writeVCF(vc, vcMergerWriter, logger); - } - - public Integer reduce(Integer result, Integer total) { - if (result == null) - return total; - - return total + result; - } - - /** - * Release any VariantContexts not yet processed. - * - * @param result Empty for now... 
- */ - public void onTraversalDone(Integer result) { - vcMergerWriter.close(); - - System.out.println("Number of successive pairs of records: " + vcMergerWriter.getNumRecordsAttemptToMerge()); - System.out.println("Number of potentially merged records (" + vcMergerWriter.getVcMergeRule() + "): " + vcMergerWriter.getNumRecordsSatisfyingMergeRule()); - System.out.println("Number of records merged ("+ vcMergerWriter.getAlleleMergeRule() + "): " + vcMergerWriter.getNumMergedRecords()); - System.out.println(vcMergerWriter.getAltAlleleStats()); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java index 53cfaa3a95..2f15c165f3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010, The Broad Institute + * Copyright (c) 2011, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -33,10 +33,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.File; import java.io.FileNotFoundException; @@ -44,7 +41,7 @@ // Streams in VariantContext objects and streams out VariantContexts produced by merging phased segregating polymorphisms into MNP VariantContexts -public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { +class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { private VCFWriter innerWriter; private GenomeLocParser genomeLocParser; @@ -52,7 +49,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { private ReferenceSequenceFile referenceFileForMNPmerging; private VariantContextMergeRule vcMergeRule; - private VariantContextUtils.AlleleMergeRule alleleMergeRule; + private PhasingUtils.AlleleMergeRule alleleMergeRule; private String useSingleSample = null; @@ -71,7 +68,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { // Should we call innerWriter.close() in close() private boolean takeOwnershipOfInner; - public MergeSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, VariantContextMergeRule vcMergeRule, VariantContextUtils.AlleleMergeRule alleleMergeRule, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) { + public MergeSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, VariantContextMergeRule vcMergeRule, PhasingUtils.AlleleMergeRule alleleMergeRule, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) { 
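The takeOwnershipOfInner flag threaded through the constructor above backs the "// Should we call innerWriter.close() in close()" field: a wrapping writer should close its inner writer only if it created it. A minimal sketch of that convention (interface trimmed to one method; names hypothetical, not the GATK API):

    interface Writer { void close(); }

    final class OwningWrapper implements Writer {
        private final Writer inner;
        private final boolean takeOwnershipOfInner;

        OwningWrapper(final Writer inner, final boolean takeOwnershipOfInner) {
            this.inner = inner;
            this.takeOwnershipOfInner = takeOwnershipOfInner;
        }

        // Forward close() only when this wrapper owns the inner writer,
        // so a caller-supplied writer is left open for the caller to manage.
        public void close() {
            if (takeOwnershipOfInner)
                inner.close();
        }
    }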
this.innerWriter = innerWriter; this.genomeLocParser = genomeLocParser; try { @@ -122,7 +119,7 @@ public void add(VariantContext vc) { if (useSingleSample != null) { // only want to output context for one sample Genotype sampGt = vc.getGenotype(useSingleSample); if (sampGt != null) // TODO: subContextFromGenotypes() does not handle any INFO fields [AB, HaplotypeScore, MQ, etc.]. Note that even SelectVariants.subsetRecord() only handles AC,AN,AF, and DP! - vc = vc.subContextFromGenotypes(sampGt); + vc = vc.subContextFromSample(sampGt.getSampleName()); else // asked for a sample that this vc does not contain, so ignore this vc: return; } @@ -179,14 +176,14 @@ else if (!emitOnlyMergedRecords) { // filtered records are never merged boolean mergedRecords = false; if (shouldAttemptToMerge) { numRecordsSatisfyingMergeRule++; - VariantContext mergedVc = VariantContextUtils.mergeIntoMNP(genomeLocParser, vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging, alleleMergeRule); + VariantContext mergedVc = PhasingUtils.mergeIntoMNP(genomeLocParser, vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging, alleleMergeRule); if (mergedVc != null) { mergedRecords = true; Map addedAttribs = vcMergeRule.addToMergedAttributes(vcfrWaitingToMerge.vc, vc); addedAttribs.putAll(mergedVc.getAttributes()); - mergedVc = VariantContext.modifyAttributes(mergedVc, addedAttribs); + mergedVc = new VariantContextBuilder(mergedVc).attributes(addedAttribs).make(); vcfrWaitingToMerge = new VCFRecord(mergedVc, true); numMergedRecords++; @@ -218,26 +215,6 @@ private void stopWaitingToMerge() { filteredVcfrList.clear(); } - public int getNumRecordsAttemptToMerge() { - return numRecordsAttemptToMerge; - } - - public int getNumRecordsSatisfyingMergeRule() { - return numRecordsSatisfyingMergeRule; - } - - public int getNumMergedRecords() { - return numMergedRecords; - } - - public VariantContextMergeRule getVcMergeRule() { - return vcMergeRule; - } - - public VariantContextUtils.AlleleMergeRule getAlleleMergeRule() { - return alleleMergeRule; - } - /** * Gets a string representation of this object. 
* @@ -248,13 +225,6 @@ public String toString() { return getClass().getName(); } - public String getAltAlleleStats() { - if (altAlleleStats == null) - return ""; - - return "\n" + altAlleleStats.toString(); - } - private static class VCFRecord { public VariantContext vc; public boolean resultedFromMerge; @@ -373,7 +343,7 @@ else if (gt1.isNoCall() || gt1.isFiltered() || gt2.isNoCall() || gt2.isFiltered( if (shouldAttemptToMerge) { aas.numSuccessiveGenotypesAttemptedToBeMerged++; - if (!VariantContextUtils.alleleSegregationIsKnown(gt1, gt2)) { + if (!PhasingUtils.alleleSegregationIsKnown(gt1, gt2)) { aas.segregationUnknown++; logger.debug("Unknown segregation of alleles [not phased] for " + samp + " at " + VariantContextUtils.getLocation(genomeLocParser, vc1) + ", " + VariantContextUtils.getLocation(genomeLocParser, vc2)); } @@ -498,9 +468,9 @@ public int minDistance(VariantContext vc1, VariantContext vc2) { } -class ExistsDoubleAltAlleleMergeRule extends VariantContextUtils.AlleleMergeRule { +class ExistsDoubleAltAlleleMergeRule extends PhasingUtils.AlleleMergeRule { public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) { - return VariantContextUtils.someSampleHasDoubleNonReferenceAllele(vc1, vc2); + return PhasingUtils.someSampleHasDoubleNonReferenceAllele(vc1, vc2); } public String toString() { @@ -515,7 +485,7 @@ public SegregatingMNPmergeAllelesRule() { public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) { // Must be interesting AND consistent: - return super.allelesShouldBeMerged(vc1, vc2) && VariantContextUtils.doubleAllelesSegregatePerfectlyAmongSamples(vc1, vc2); + return super.allelesShouldBeMerged(vc1, vc2) && PhasingUtils.doubleAllelesSegregatePerfectlyAmongSamples(vc1, vc2); } public String toString() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java deleted file mode 100644 index 96d5c471ff..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; -import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; - -import java.util.*; - -import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods; - -/** - * Walks along all variant ROD loci, and merges consecutive sites if some sample has segregating alt alleles in the ROD. - */ -@Allows(value = {DataSource.REFERENCE}) -@Requires(value = {DataSource.REFERENCE}) -@By(DataSource.REFERENCE_ORDERED_DATA) - -public class MergeSegregatingAlternateAllelesWalker extends RodWalker { - - @Output(doc = "File to which variants should be written", required = true) - protected VCFWriter writer = null; - private MergeSegregatingAlternateAllelesVCFWriter vcMergerWriter = null; - - @Argument(fullName = "maxGenomicDistance", shortName = "maxDist", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records; [default:1]", required = false) - protected int maxGenomicDistance = 1; - - @Argument(fullName = "useSingleSample", shortName = "useSample", doc = "Only output genotypes for the single sample given; [default:use all samples]", required = false) - protected String useSingleSample = null; - - @Hidden - @Argument(fullName = "emitOnlyMergedRecords", shortName = "emitOnlyMerged", doc = "Only output records that resulted from merging [For DEBUGGING purposes only - DO NOT USE, since it disregards the semantics of '|' as 'phased relative to previous non-filtered VC']; [default:false]", required = false) - protected boolean emitOnlyMergedRecords = false; - - @Argument(fullName = "disablePrintAltAlleleStats", shortName = "noAlleleStats", doc = "Should the print-out of alternate allele statistics be disabled?; [default:false]", required = false) - protected boolean disablePrintAlternateAlleleStatistics = false; - - public final static String IGNORE_REFSEQ = "IGNORE"; - public final static String UNION_REFSEQ = "UNION"; - public final static String INTERSECT_REFSEQ = "INTERSECT"; - - @Argument(fullName = "mergeBasedOnRefSeqAnnotation", shortName = "mergeBasedOnRefSeqAnnotation", doc = "'Should merging be performed if two sites lie on the same RefSeq sequence in the INFO field {" + IGNORE_REFSEQ + ", " + UNION_REFSEQ + ", " + INTERSECT_REFSEQ + "}; [default:" + IGNORE_REFSEQ + "]", required = false) - protected String mergeBasedOnRefSeqAnnotation = IGNORE_REFSEQ; - - @Argument(fullName = "dontRequireSomeSampleHasDoubleAltAllele", shortName = "dontRequireSomeSampleHasDoubleAltAllele", doc = "Should the requirement, that SUCCESSIVE records to be merged have at least one sample with a double alternate allele, be relaxed?; [default:false]", required = false) - protected boolean dontRequireSomeSampleHasDoubleAltAllele = false; - - 
@Input(fullName="variant", shortName = "V", doc="Select variants from this VCF file", required=true) - public RodBinding variants; - - public void initialize() { - initializeVcfWriter(); - } - - private void initializeVcfWriter() { - GenomeLocParser genomeLocParser = getToolkit().getGenomeLocParser(); - - VariantContextMergeRule vcMergeRule; - if (mergeBasedOnRefSeqAnnotation.equals(IGNORE_REFSEQ)) - vcMergeRule = new DistanceMergeRule(maxGenomicDistance, genomeLocParser); - else - vcMergeRule = new SameGenePlusWithinDistanceMergeRule(maxGenomicDistance, genomeLocParser, mergeBasedOnRefSeqAnnotation); - - VariantContextUtils.AlleleMergeRule alleleMergeRule; - if (dontRequireSomeSampleHasDoubleAltAllele) // if a pair of VariantContext passes the vcMergeRule, then always merge them if there is a trailing prefix of polymorphisms (i.e., upstream polymorphic site): - alleleMergeRule = new PrefixPolymorphismMergeAllelesRule(); - else - alleleMergeRule = new ExistsDoubleAltAlleleMergeRule(); - - // false <-> don't take control of writer, since didn't create it: - vcMergerWriter = new MergeSegregatingAlternateAllelesVCFWriter(writer, genomeLocParser, getToolkit().getArguments().referenceFile, vcMergeRule, alleleMergeRule, useSingleSample, emitOnlyMergedRecords, logger, false, !disablePrintAlternateAlleleStatistics); - writer = null; // so it can't be accessed directly [i.e., not through vcMergerWriter] - - // setup the header fields: - Set hInfo = new HashSet(); - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); - - Map rodNameToHeader = getVCFHeadersFromRods(getToolkit(), Arrays.asList(variants.getName())); - vcMergerWriter.writeHeader(new VCFHeader(hInfo, new TreeSet(rodNameToHeader.get(variants.getName()).getGenotypeSamples()))); - } - - public boolean generateExtendedEvents() { - return false; - } - - public Integer reduceInit() { - return 0; - } - - /** - * For each site, send it to be (possibly) merged with previously observed sites. - * - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return dummy Integer - */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker == null) - return null; - - for (VariantContext vc : tracker.getValues(variants, context.getLocation())) - writeVCF(vc); - - return 0; - } - - private void writeVCF(VariantContext vc) { - WriteVCF.writeVCF(vc, vcMergerWriter, logger); - } - - public Integer reduce(Integer result, Integer total) { - if (result == null) - return total; - - return total + result; - } - - /** - * Release any VariantContexts not yet processed. - * - * @param result Empty for now... 
- */ - public void onTraversalDone(Integer result) { - vcMergerWriter.close(); - - if (useSingleSample != null) - System.out.println("Only considered single sample: " + useSingleSample); - - System.out.println("Number of successive pairs of records: " + vcMergerWriter.getNumRecordsAttemptToMerge()); - System.out.println("Number of potentially merged records (" + vcMergerWriter.getVcMergeRule() + "): " + vcMergerWriter.getNumRecordsSatisfyingMergeRule()); - System.out.println("Number of records merged ("+ vcMergerWriter.getAlleleMergeRule() + "): " + vcMergerWriter.getNumMergedRecords()); - System.out.println(vcMergerWriter.getAltAlleleStats()); - } -} - - -enum MergeBasedOnRefSeqAnnotation { - UNION_WITH_DIST, INTERSECT_WITH_DIST -} - -class SameGenePlusWithinDistanceMergeRule extends DistanceMergeRule { - private MergeBasedOnRefSeqAnnotation mergeBasedOnRefSeqAnnotation; - - public SameGenePlusWithinDistanceMergeRule(int maxGenomicDistanceForMNP, GenomeLocParser genomeLocParser, String mergeBasedOnRefSeqAnnotation) { - super(maxGenomicDistanceForMNP, genomeLocParser); - - if (mergeBasedOnRefSeqAnnotation.equals(MergeSegregatingAlternateAllelesWalker.UNION_REFSEQ)) - this.mergeBasedOnRefSeqAnnotation = MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST; - else if (mergeBasedOnRefSeqAnnotation.equals(MergeSegregatingAlternateAllelesWalker.INTERSECT_REFSEQ)) - this.mergeBasedOnRefSeqAnnotation = MergeBasedOnRefSeqAnnotation.INTERSECT_WITH_DIST; - else - throw new UserException("Must provide " + MergeSegregatingAlternateAllelesWalker.IGNORE_REFSEQ + ", " + MergeSegregatingAlternateAllelesWalker.UNION_REFSEQ + ", or " + MergeSegregatingAlternateAllelesWalker.INTERSECT_REFSEQ + " as argument to mergeBasedOnRefSeqAnnotation!"); - } - - public boolean shouldAttemptToMerge(VariantContext vc1, VariantContext vc2) { - boolean withinDistance = super.shouldAttemptToMerge(vc1, vc2); - - if (mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST) - return withinDistance || sameGene(vc1, vc2); - else // mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.INTERSECT_WITH_DIST - return withinDistance && sameGene(vc1, vc2); - } - - private boolean sameGene(VariantContext vc1, VariantContext vc2) { - Set names_vc1 = RefSeqDataParser.getRefSeqNames(vc1); - Set names_vc2 = RefSeqDataParser.getRefSeqNames(vc2); - names_vc1.retainAll(names_vc2); - - if (!names_vc1.isEmpty()) - return true; - - // Check refseq.name2: - Set names2_vc1 = RefSeqDataParser.getRefSeqNames(vc1, true); - Set names2_vc2 = RefSeqDataParser.getRefSeqNames(vc2, true); - names2_vc1.retainAll(names2_vc2); - - return !names2_vc1.isEmpty(); - } - - public String toString() { - return super.toString() + " " + (mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST ? 
"OR" : "AND") + " on the same gene"; - } - - public Map addToMergedAttributes(VariantContext vc1, VariantContext vc2) { - Map addedAttribs = super.addToMergedAttributes(vc1, vc2); - addedAttribs.putAll(RefSeqDataParser.getMergedRefSeqNameAttributes(vc1, vc2)); - return addedAttribs; - } -} - - - -class PrefixPolymorphismMergeAllelesRule extends VariantContextUtils.AlleleMergeRule { - public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) { - return vc1.isPolymorphic(); - } - - public String toString() { - return super.toString() + ", there exists a polymorphism at the start of the merged allele"; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 3eedc2a280..cea7dd0070 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -7,38 +7,80 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.text.XReadLines; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; - -import java.io.File; -import java.io.FileNotFoundException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.io.PrintStream; import java.util.*; /** - * Phases a trio VCF (child phased by transmission, implied phase carried over to parents). Given genotypes for a trio, - * this walker modifies the genotypes (if necessary) to reflect the most likely configuration given the genotype - * likelihoods and inheritance constraints, phases child by transmission and carries over implied phase to the parents - * (their alleles in their genotypes are ordered as transmitted|untransmitted). Computes probability that the - * determined phase is correct given that the genotype configuration is correct (useful if you want to use this to - * compare phasing accuracy, but want to break that comparison down by phasing confidence in the truth set). Optionally - * filters out sites where the phasing is indeterminate (site has no-calls), ambiguous (everyone is heterozygous), or - * the genotypes exhibit a Mendelian violation. This walker assumes there are only three samples in the VCF file to - * begin. + * Computes the most likely genotype combination and phases trios and parent/child pairs + * + *

+ * <p/>
+ * PhaseByTransmission is a GATK tool that 1) computes the most likely genotype combination and phases trios and parent/child pairs given their genotype likelihoods and a mutation prior and 2) phases
+ * all sites where parent/child transmission can be inferred unambiguously. It reports the genotype combination (and hence phasing) probability.
+ * Ambiguous sites are:
+ * <ul>
+ *     <li>Sites where all individuals are heterozygous</li>
+ *     <li>Sites where there is a Mendelian violation</li>
+ * </ul>
+ * Missing genotypes are handled as follows:
+ * <ul>
+ *     <li>In parent/child pairs: If an individual genotype is missing at one site, the other one is phased if it is homozygous. No phasing probability is emitted.</li>
+ *     <li>In trios: If the child is missing, parents are treated as separate individuals and phased if homozygous. No phasing probability is emitted.</li>
+ *     <li>In trios: If one of the parents is missing, it is handled like a parent/child pair. Phasing is done unless both the parent and the child are heterozygous, and a phasing probability is emitted.</li>
+ *     <li>In trios: If two individuals are missing, the remaining individual is phased if it is homozygous. No phasing probability is emitted.</li>
+ * </ul>
+ *
+ * <h2>Input</h2>
+ * <p>
+ * <ul>
+ *     <li>A VCF variant set containing trio(s) and/or parent/child pair(s).</li>
+ *     <li>A PED pedigree file describing the relationships between the individuals.</li>
+ * </ul>
+ * </p>
+ *
+ * <h2>Options</h2>
+ * <p>
+ * <ul>
+ *     <li>MendelianViolationsFile: An optional argument for reporting. If a file is specified, all sites that remain in Mendelian violation after being assigned the most likely genotype
+ *     combination will be reported there. Information reported: chromosome, position, filter, allele count in VCF, family, transmission probability,
+ *     and each individual's genotype, depth, allelic depth and likelihoods.</li>
+ *     <li>DeNovoPrior: Mutation prior; default is 1e-8.</li>
+ * </ul>
+ * </p>
+ *
+ * <h2>Output</h2>
+ * <p>
+ * A VCF with genotypes recalibrated as most likely under the familial constraint and phased by descent where unambiguous.
+ * </p>
+ *
+ * <h2>Examples</h2>
+ * <pre>
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T PhaseByTransmission \
+ *   -V input.vcf \
+ *   -ped input.ped \
+ *   -o output.vcf
+ * </pre>
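The TP format tag this walker writes is the Phred-scaled posterior of the selected genotype configuration. A minimal standalone sketch of that computation, under the assumption that each candidate configuration is summarized by its prior term and the three normalized genotype likelihoods (this is a hypothetical helper, not the walker's code, which goes through MathUtils.probabilityToPhredScale):

import java.util.List;

class TransmissionPosteriorSketch {
    // Each configuration is {prior, pMother, pFather, pChild}.
    static int phredScaledTP(List<double[]> configurations) {
        double best = 0.0, norm = 0.0;
        for (double[] c : configurations) {
            double joint = c[0] * c[1] * c[2] * c[3]; // prior times the three likelihoods
            norm += joint;
            best = Math.max(best, joint);
        }
        double tp = best / norm; // posterior of the best configuration
        // e.g. tp = 0.999 gives TP = 30; tp exactly 1.0 would need capping in real code
        return (int) Math.round(-10.0 * Math.log10(1.0 - tp));
    }
}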
+ * */ -public class PhaseByTransmission extends RodWalker { +public class PhaseByTransmission extends RodWalker, HashMap> { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - @Argument(shortName="f", fullName="familySpec", required=true, doc="Patterns for the family structure (usage: mom+dad=child). Specify several trios by supplying this argument many times and/or a file containing many patterns.") - public ArrayList familySpecs = null; + @Argument(shortName = "mvf",required = false,fullName = "MendelianViolationsFile", doc="File to output the mendelian violation details.") + private PrintStream mvFile = null; + + @Argument(shortName = "prior",required = false,fullName = "DeNovoPrior", doc="Prior for de novo mutations. Default: 1e-8") + private double deNovoPrior=1e-8; @Output protected VCFWriter vcfWriter = null; @@ -46,241 +88,633 @@ public class PhaseByTransmission extends RodWalker { private final String TRANSMISSION_PROBABILITY_TAG_NAME = "TP"; private final String SOURCE_NAME = "PhaseByTransmission"; - private final Double MENDELIAN_VIOLATION_PRIOR = 1e-8; + public final double NO_TRANSMISSION_PROB = -1.0; + + private ArrayList trios = new ArrayList(); + + //Matrix of priors for all genotype combinations + private EnumMap>> mvCountMatrix; + + //Matrix of allele transmission + private EnumMap>> transmissionMatrix; + + //Metrics counters hash keys + private final Byte NUM_TRIO_GENOTYPES_CALLED = 0; + private final Byte NUM_TRIO_GENOTYPES_NOCALL = 1; + private final Byte NUM_TRIO_GENOTYPES_PHASED = 2; + private final Byte NUM_TRIO_HET_HET_HET = 3; + private final Byte NUM_TRIO_VIOLATIONS = 4; + private final Byte NUM_TRIO_DOUBLE_VIOLATIONS = 10; + private final Byte NUM_PAIR_GENOTYPES_CALLED = 5; + private final Byte NUM_PAIR_GENOTYPES_NOCALL = 6; + private final Byte NUM_PAIR_GENOTYPES_PHASED = 7; + private final Byte NUM_PAIR_HET_HET = 8; + private final Byte NUM_PAIR_VIOLATIONS = 9; + private final Byte NUM_GENOTYPES_MODIFIED = 11; + + //Random number generator + private Random rand = new Random(); + + private enum FamilyMember { + MOTHER, + FATHER, + CHILD + } + + //Stores a conceptual trio or parent/child pair genotype combination along with its phasing. + //This combination can then be "applied" to a given trio or pair using the getPhasedGenotypes method. + private class TrioPhase { - private class Trio { - private String mother; - private String father; - private String child; + //Create 2 fake alleles + //The actual bases will never be used but the Genotypes created using the alleles will be. 
+ private final Allele REF = Allele.create("A",true); + private final Allele VAR = Allele.create("A",false); + private final Allele NO_CALL = Allele.create(".",false); + private final String DUMMY_NAME = "DummySample"; - public Trio(String mother, String father, String child) { - this.mother = mother; - this.father = father; - this.child = child; + private EnumMap trioPhasedGenotypes = new EnumMap(FamilyMember.class); + + private ArrayList getAlleles(Genotype.Type genotype){ + ArrayList alleles = new ArrayList(2); + if(genotype == Genotype.Type.HOM_REF){ + alleles.add(REF); + alleles.add(REF); + } + else if(genotype == Genotype.Type.HET){ + alleles.add(REF); + alleles.add(VAR); + } + else if(genotype == Genotype.Type.HOM_VAR){ + alleles.add(VAR); + alleles.add(VAR); + } + else{ + return null; + } + return alleles; } - public Trio(String familySpec) { - String[] pieces = familySpec.split("[\\+\\=]"); + private boolean isPhasable(Genotype.Type genotype){ + return genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HET || genotype == Genotype.Type.HOM_VAR; + } - this.mother = pieces[0]; - this.father = pieces[1]; - this.child = pieces[2]; + //Create a new Genotype based on information from a single individual + //Homozygous genotypes will be set as phased, heterozygous won't be + private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){ + if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){ + trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_LOG10_PERROR, null, null, true)); + } + else + trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_LOG10_PERROR,null,null,false)); } - public String getMother() { return mother; } - public String getFather() { return father; } - public String getChild() { return child; } - } + //Find the phase for a parent/child pair + private void phasePairAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){ - private ArrayList trios = new ArrayList(); + //Special case for Het/Het as it is ambiguous + if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){ + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_LOG10_PERROR, null, null, false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_LOG10_PERROR,null,null,false)); + return; + } - public ArrayList getFamilySpecsFromCommandLineInput(ArrayList familySpecs) { - if (familySpecs != null) { - // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our - // spec list set, and treat the entries as if they had been specified on the command line. 
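The pair-phasing rule implemented by phasePairAlleles nearby reduces to a small search: place the transmitted allele first in both the parent and the child, leave Het/Het untouched because both orderings fit, and leave Mendelian violations unphased. A hypothetical sketch with plain char alleles instead of the Allele class:

class PairPhaseSketch {
    static boolean isHet(char[] gt) { return gt[0] != gt[1]; }

    // Returns {parentPhased, childPhased} in transmitted|untransmitted order,
    // or null when no phase can be established.
    static String[] phasePair(char[] parent, char[] child) {
        if (isHet(parent) && isHet(child))
            return null; // ambiguous: both orderings are compatible
        for (int p = 0; p < 2; p++)
            for (int c = 0; c < 2; c++)
                if (parent[p] == child[c])
                    return new String[] {
                            "" + parent[p] + '|' + parent[1 - p],
                            "" + child[c] + '|' + child[1 - c]
                    };
        return null; // no shared allele: a Mendelian violation, left unphased
    }
}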
- ArrayList specs = new ArrayList(); - for (String familySpec : familySpecs) { - File specFile = new File(familySpec); + ArrayList parentAlleles = getAlleles(parentGenotype); + ArrayList childAlleles = getAlleles(childGenotype); + ArrayList parentPhasedAlleles = new ArrayList(2); + ArrayList childPhasedAlleles = new ArrayList(2); + + //If there is a possible phasing between the mother and child => phase + int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0)); + if(childTransmittedAlleleIndex > -1){ + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_LOG10_PERROR, null, null, true)); + childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + childPhasedAlleles.add(childAlleles.get(0)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true)); + } + else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){ + parentPhasedAlleles.add(parentAlleles.get(1)); + parentPhasedAlleles.add(parentAlleles.get(0)); + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true)); + childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + childPhasedAlleles.add(childAlleles.get(0)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true)); + } + //This is a Mendelian Violation => Do not phase + else{ + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_LOG10_PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_LOG10_PERROR,null,null,false)); + } + } - try { - XReadLines reader = new XReadLines(specFile); + //Phases a family by transmission + private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ + + Set> possiblePhasedChildGenotypes = new HashSet>(); + ArrayList motherAlleles = getAlleles(mother); + ArrayList fatherAlleles = getAlleles(father); + ArrayList childAlleles = getAlleles(child); + + //Build all possible child genotypes for the given parent's genotypes + for (Allele momAllele : motherAlleles) { + for (Allele fatherAllele : fatherAlleles) { + ArrayList possiblePhasedChildAlleles = new ArrayList(2); + possiblePhasedChildAlleles.add(momAllele); + possiblePhasedChildAlleles.add(fatherAllele); + possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles); + } + } - List lines = reader.readLines(); - for (String line : lines) { - specs.add(new Trio(line)); - } - } catch (FileNotFoundException e) { - specs.add(new Trio(familySpec)); // not a file, so must be a family spec + for (ArrayList childPhasedAllelesAlleles : possiblePhasedChildGenotypes) { + int firstAlleleIndex = childPhasedAllelesAlleles.indexOf(childAlleles.get(0)); + int secondAlleleIndex = childPhasedAllelesAlleles.lastIndexOf(childAlleles.get(1)); + //If a possible combination has been found, create the genotypes + if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) { + //Create mother's genotype + ArrayList motherPhasedAlleles = new ArrayList(2); + motherPhasedAlleles.add(childPhasedAllelesAlleles.get(0)); + if(motherAlleles.get(0) != motherPhasedAlleles.get(0)) + motherPhasedAlleles.add(motherAlleles.get(0)); + else + motherPhasedAlleles.add(motherAlleles.get(1)); + 
trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_LOG10_PERROR,null,null,true)); + + //Create father's genotype + ArrayList fatherPhasedAlleles = new ArrayList(2); + fatherPhasedAlleles.add(childPhasedAllelesAlleles.get(1)); + if(fatherAlleles.get(0) != fatherPhasedAlleles.get(0)) + fatherPhasedAlleles.add(fatherAlleles.get(0)); + else + fatherPhasedAlleles.add(fatherAlleles.get(1)); + trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_LOG10_PERROR,null,null,true)); + + //Create child's genotype + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_LOG10_PERROR,null,null,true)); + + //Once a phased combination is found; exit + return; } } - return specs; + //If this is reached then no phasing could be found + trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_LOG10_PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_LOG10_PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_LOG10_PERROR,null,null,false)); } - return new ArrayList(); + /* Constructor: Creates a conceptual trio genotype combination from the given genotypes. + If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair + or single individual. + */ + public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ + + //Take care of cases where one or more family members are no call + if(!isPhasable(child)){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); + } + else if(!isPhasable(mother)){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + if(!isPhasable(father)){ + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); + } + else + phasePairAlleles(father, child, FamilyMember.FATHER); + } + else if(!isPhasable(father)){ + phasePairAlleles(mother, child, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + } + //Special case for Het/Het/Het as it is ambiguous + else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); + } + //All family members have genotypes and at least one of them is not Het + else{ + phaseFamilyAlleles(mother, father, child); + } + } + + /** + * Applies the trio genotype combination to the given trio. 
+ * @param ref: Reference allele + * @param alt: Alternate allele + * @param motherGenotype: Genotype of the mother to phase using this trio genotype combination + * @param fatherGenotype: Genotype of the father to phase using this trio genotype combination + * @param childGenotype: Genotype of the child to phase using this trio genotype combination + * @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable) + * @param phasedGenotypes: An ArrayList to which the newly phased genotypes are added in the following order: Mother, Father, Child + */ + public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){ + phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD))); + } + + private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){ + + int phredScoreTransmission = -1; + if(transmissionProb != NO_TRANSMISSION_PROB) + phredScoreTransmission = MathUtils.probabilityToPhredScale(1-(transmissionProb)); + + //Handle null, missing and unavailable genotypes + //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable + //genotype so it is safe to return the original genotype in this case. + //In addition, if the phasing confidence is 0, then return the unphased, original genotypes. + if(phredScoreTransmission ==0 || genotype == null || !isPhasable(genotype.getType())) + return genotype; + + //Add the transmission probability + Map genotypeAttributes = new HashMap(); + genotypeAttributes.putAll(genotype.getAttributes()); + if(transmissionProb>NO_TRANSMISSION_PROB) + genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission); + + ArrayList phasedAlleles = new ArrayList(2); + for(Allele allele : phasedGenotype.getAlleles()){ + if(allele.isReference()) + phasedAlleles.add(refAllele); + else if(allele.isNonReference()) + phasedAlleles.add(altAllele); + //At this point there should not be any other alleles left + else + throw new UserException(String.format("BUG: Unexpected allele: %s. 
Please report.",allele.toString())); + + } + + //Compute the new Log10Error if the genotype is different from the original genotype + double log10Error; + if(genotype.getType() == phasedGenotype.getType()) + log10Error = genotype.getLog10PError(); + else + log10Error = genotype.getLikelihoods().getLog10GQ(phasedGenotype.getType()); + + return new Genotype(genotype.getSampleName(), phasedAlleles, log10Error, null, genotypeAttributes, phasedGenotype.isPhased()); + } + + } /** - * Parse the familial relationship specification, and initialize VCF writer + * Parse the familial relationship specification, build the transmission matrices and initialize VCF writer */ public void initialize() { - trios = getFamilySpecsFromCommandLineInput(familySpecs); - ArrayList rodNames = new ArrayList(); rodNames.add(variantCollection.variants.getName()); - Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + //Get the trios from the families passed as ped + setTrios(); + if(trios.size()<1) + throw new UserException.BadInput("No PED file passed or no trios found in PED file. Aborted."); + + Set headerLines = new HashSet(); headerLines.addAll(VCFUtils.getHeaderFields(this.getToolkit())); - headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Float, "Probability that the phase is correct given that the genotypes are correct")); + headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred score of the genotype combination and phase given that the genotypes are correct")); headerLines.add(new VCFHeaderLine("source", SOURCE_NAME)); vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples)); - } - private double computeTransmissionLikelihoodOfGenotypeConfiguration(Genotype mom, Genotype dad, Genotype child) { - double[] momLikelihoods = MathUtils.normalizeFromLog10(mom.getLikelihoods().getAsVector()); - double[] dadLikelihoods = MathUtils.normalizeFromLog10(dad.getLikelihoods().getAsVector()); - double[] childLikelihoods = MathUtils.normalizeFromLog10(child.getLikelihoods().getAsVector()); + buildMatrices(); - int momIndex = mom.getType().ordinal() - 1; - int dadIndex = dad.getType().ordinal() - 1; - int childIndex = child.getType().ordinal() - 1; + if(mvFile != null) + mvFile.println("#CHROM\tPOS\tFILTER\tAC\tFAMILY\tTP\tMOTHER_GT\tMOTHER_DP\tMOTHER_RAD\tMOTHER_AAD\tMOTHER_HRPL\tMOTHER_HETPL\tMOTHER_HAPL\tFATHER_GT\tFATHER_DP\tFATHER_RAD\tFATHER_AAD\tFATHER_HRPL\tFATHER_HETPL\tFATHER_HAPL\tCHILD_GT\tCHILD_DP\tCHILD_RAD\tCHILD_AAD\tCHILD_HRPL\tCHILD_HETPL\tCHILD_HAPL"); - return momLikelihoods[momIndex]*dadLikelihoods[dadIndex]*childLikelihoods[childIndex]; } - private ArrayList createAllThreeGenotypes(Allele refAllele, Allele altAllele, Genotype g) { - List homRefAlleles = new ArrayList(); - homRefAlleles.add(refAllele); - homRefAlleles.add(refAllele); - Genotype homRef = new Genotype(g.getSampleName(), homRefAlleles, g.getNegLog10PError(), null, g.getAttributes(), false); + /** + * Select trios and parent/child pairs only + */ + private void setTrios(){ + + Map> families = this.getSampleDB().getFamilies(); + Set family; + ArrayList parents; + for(String familyID : families.keySet()){ + family = families.get(familyID); + if(family.size()<2 || family.size()>3){ + logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios and parent/child 
pairs. Family skipped.",familyID,family.size())); + } + else{ + for(Sample familyMember : family){ + parents = familyMember.getParents(); + if(parents.size()>0){ + if(family.containsAll(parents)) + this.trios.add(familyMember); + else + logger.info(String.format("Caution: Family %s skipped as it is not a trio nor a parent/child pair; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyID)); + break; + } + } + } - List hetAlleles = new ArrayList(); - hetAlleles.add(refAllele); - hetAlleles.add(altAllele); - Genotype het = new Genotype(g.getSampleName(), hetAlleles, g.getNegLog10PError(), null, g.getAttributes(), false); + } - List homVarAlleles = new ArrayList(); - homVarAlleles.add(altAllele); - homVarAlleles.add(altAllele); - Genotype homVar = new Genotype(g.getSampleName(), homVarAlleles, g.getNegLog10PError(), null, g.getAttributes(), false); - ArrayList genotypes = new ArrayList(); - genotypes.add(homRef); - genotypes.add(het); - genotypes.add(homVar); - return genotypes; } - private int getNumberOfMatchingAlleles(Allele alleleToMatch, Genotype g) { - List alleles = g.getAlleles(); - int matchingAlleles = 0; + //Create the transmission matrices + private void buildMatrices(){ + mvCountMatrix = new EnumMap>>(Genotype.Type.class); + transmissionMatrix = new EnumMap>>(Genotype.Type.class); + for(Genotype.Type mother : Genotype.Type.values()){ + mvCountMatrix.put(mother,new EnumMap>(Genotype.Type.class)); + transmissionMatrix.put(mother,new EnumMap>(Genotype.Type.class)); + for(Genotype.Type father : Genotype.Type.values()){ + mvCountMatrix.get(mother).put(father,new EnumMap(Genotype.Type.class)); + transmissionMatrix.get(mother).put(father,new EnumMap(Genotype.Type.class)); + for(Genotype.Type child : Genotype.Type.values()){ + mvCountMatrix.get(mother).get(father).put(child, getCombinationMVCount(mother, father, child)); + transmissionMatrix.get(mother).get(father).put(child,new TrioPhase(mother,father,child)); + } + } + } + } - for (Allele a : alleles) { - if (!alleleToMatch.equals(a)) { - matchingAlleles++; + //Returns the number of Mendelian Violations for a given genotype combination. + //If one of the parents genotype is missing, it will consider it as a parent/child pair + //If the child genotype or both parents genotypes are missing, 0 is returned. 
+ private int getCombinationMVCount(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ + + //Child is no call => No MV + if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE) + return 0; + //Add parents with genotypes for the evaluation + ArrayList parents = new ArrayList(); + if (!(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE)) + parents.add(mother); + if (!(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE)) + parents.add(father); + + //Both parents no calls => No MV + if (parents.isEmpty()) + return 0; + + //If at least one parent had a genotype, then count the number of ref and alt alleles that can be passed + int parentsNumRefAlleles = 0; + int parentsNumAltAlleles = 0; + + for(Genotype.Type parent : parents){ + if(parent == Genotype.Type.HOM_REF){ + parentsNumRefAlleles++; + } + else if(parent == Genotype.Type.HET){ + parentsNumRefAlleles++; + parentsNumAltAlleles++; + } + else if(parent == Genotype.Type.HOM_VAR){ + parentsNumAltAlleles++; } } - return matchingAlleles; - } + //Case Child is HomRef + if(child == Genotype.Type.HOM_REF){ + if(parentsNumRefAlleles == parents.size()) + return 0; + else return (parents.size()-parentsNumRefAlleles); + } - private boolean isMendelianViolation(Allele refAllele, Allele altAllele, Genotype mom, Genotype dad, Genotype child) { - int numMomRefAlleles = getNumberOfMatchingAlleles(refAllele, mom) > 0 ? 1 : 0; - int numMomAltAlleles = getNumberOfMatchingAlleles(altAllele, mom) > 0 ? 1 : 0; + //Case child is HomVar + if(child == Genotype.Type.HOM_VAR){ + if(parentsNumAltAlleles == parents.size()) + return 0; + else return parents.size()-parentsNumAltAlleles; + } - int numDadRefAlleles = getNumberOfMatchingAlleles(refAllele, dad) > 0 ? 1 : 0; - int numDadAltAlleles = getNumberOfMatchingAlleles(altAllele, dad) > 0 ? 1 : 0; + //Case child is Het + if(child == Genotype.Type.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2)) + return 0; - int numChildRefAlleles = getNumberOfMatchingAlleles(refAllele, child); - int numChildAltAlleles = getNumberOfMatchingAlleles(altAllele, child); + //MV + return 1; + } - return (numMomRefAlleles + numDadRefAlleles < numChildRefAlleles || numMomAltAlleles + numDadAltAlleles < numChildAltAlleles); + //Given two trio genotypes combinations, returns the number of different genotypes between the two combinations. + private int countFamilyGenotypeDiff(Genotype.Type motherOriginal,Genotype.Type fatherOriginal,Genotype.Type childOriginal,Genotype.Type motherNew,Genotype.Type fatherNew,Genotype.Type childNew){ + int count = 0; + if(motherOriginal!=motherNew) + count++; + if(fatherOriginal!=fatherNew) + count++; + if(childOriginal!=childNew) + count++; + return count; } - private ArrayList getPhasedGenotypes(Genotype mom, Genotype dad, Genotype child) { - Set possiblePhasedChildGenotypes = new HashSet(); + //Get a Map of genotype likelihoods. + //In case of null, unavailable or no call, all likelihoods are 1/3. 
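The violation counting in getCombinationMVCount above follows an allele-supply argument: count the parents that cannot contribute the allele(s) the child carries. A hypothetical sketch for the two-called-parents case (the real method also handles missing parents):

class MVCountSketch {
    enum GT { HOM_REF, HET, HOM_VAR }

    static int mvCount(GT mother, GT father, GT child) {
        int refParents = 0, altParents = 0;
        for (GT parent : new GT[]{ mother, father }) {
            if (parent != GT.HOM_VAR) refParents++; // can transmit a ref allele
            if (parent != GT.HOM_REF) altParents++; // can transmit an alt allele
        }
        switch (child) {
            case HOM_REF: return 2 - refParents; // parents unable to supply a ref allele
            case HOM_VAR: return 2 - altParents; // parents unable to supply an alt allele
            default:      return (refParents > 0 && altParents > 0) ? 0 : 1; // HET child
        }
    }
    // mvCount(GT.HOM_REF, GT.HOM_REF, GT.HOM_VAR) == 2 : a double violation
    // mvCount(GT.HET,     GT.HET,     GT.HOM_VAR) == 0 : consistent transmission
}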
+ private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){ + if(genotype == null || !genotype.isCalled()){ + EnumMap likelihoods = new EnumMap(Genotype.Type.class); + likelihoods.put(Genotype.Type.HOM_REF,1.0/3.0); + likelihoods.put(Genotype.Type.HET,1.0/3.0); + likelihoods.put(Genotype.Type.HOM_VAR,1.0/3.0); + return likelihoods; + } + return genotype.getLikelihoods().getAsMap(true); + } - for (Allele momAllele : mom.getAlleles()) { - for (Allele dadAllele : dad.getAlleles()) { - ArrayList possiblePhasedChildAlleles = new ArrayList(); - possiblePhasedChildAlleles.add(momAllele); - possiblePhasedChildAlleles.add(dadAllele); + //Returns the Genotype.Type; returns UNVAILABLE if given null + private Genotype.Type getTypeSafeNull(Genotype genotype){ + if(genotype == null) + return Genotype.Type.UNAVAILABLE; + return genotype.getType(); + } - Genotype possiblePhasedChildGenotype = new Genotype(child.getSampleName(), possiblePhasedChildAlleles, child.getNegLog10PError(), child.getFilters(), child.getAttributes(), true); - possiblePhasedChildGenotypes.add(possiblePhasedChildGenotype); + /** + * Phases the genotypes of the given trio. If one of the parents is null, it is considered a parent/child pair. + * @param ref: Reference allele + * @param alt: Alternative allele + * @param mother: Mother's genotype + * @param father: Father's genotype + * @param child: Child's genotype + * @param finalGenotypes: An ArrayList that will be added the genotypes phased by transmission in the following order: Mother, Father, Child + * @return + */ + private int phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) { + + //Check whether it is a pair or trio + //Always assign the first parent as the parent having genotype information in pairs + //Always assign the mother as the first parent in trios + int parentsCalled = 0; + Map firstParentLikelihoods; + Map secondParentLikelihoods; + ArrayList bestFirstParentGenotype = new ArrayList(); + ArrayList bestSecondParentGenotype = new ArrayList(); + ArrayList bestChildGenotype = new ArrayList(); + Genotype.Type pairSecondParentGenotype = null; + if(mother == null || !mother.isCalled()){ + firstParentLikelihoods = getLikelihoodsAsMapSafeNull(father); + secondParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); + bestFirstParentGenotype.add(getTypeSafeNull(father)); + bestSecondParentGenotype.add(getTypeSafeNull(mother)); + pairSecondParentGenotype = mother == null ? Genotype.Type.UNAVAILABLE : mother.getType(); + if(father != null && father.isCalled()) + parentsCalled = 1; + } + else{ + firstParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); + secondParentLikelihoods = getLikelihoodsAsMapSafeNull(father); + bestFirstParentGenotype.add(getTypeSafeNull(mother)); + bestSecondParentGenotype.add(getTypeSafeNull(father)); + if(father == null || !father.isCalled()){ + parentsCalled = 1; + pairSecondParentGenotype = father == null ? 
Genotype.Type.UNAVAILABLE : father.getType(); + }else{ + parentsCalled = 2; } } + Map childLikelihoods = getLikelihoodsAsMapSafeNull(child); + bestChildGenotype.add(getTypeSafeNull(child)); + + //Prior vars + double bestConfigurationLikelihood = 0.0; + double norm = 0.0; + int configuration_index =0; + ArrayList bestMVCount = new ArrayList(); + bestMVCount.add(0); + + //Get the most likely combination + //Only check for most likely combination if at least a parent and the child have genotypes + if(child.isCalled() && parentsCalled > 0){ + int mvCount; + int cumulativeMVCount = 0; + double configurationLikelihood = 0; + for(Map.Entry childGenotype : childLikelihoods.entrySet()){ + for(Map.Entry firstParentGenotype : firstParentLikelihoods.entrySet()){ + for(Map.Entry secondParentGenotype : secondParentLikelihoods.entrySet()){ + mvCount = mvCountMatrix.get(firstParentGenotype.getKey()).get(secondParentGenotype.getKey()).get(childGenotype.getKey()); + //For parent/child pairs, sum over the possible genotype configurations of the missing parent + if(parentsCalled<2){ + cumulativeMVCount += mvCount; + configurationLikelihood += mvCount>0 ? Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue(); + } + //Evaluate configurations of trios + else{ + configurationLikelihood = mvCount>0 ? Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue(); + norm += configurationLikelihood; + //Keep this combination if + //It has a better likelihood + //Or it has the same likelihood but requires less changes from original genotypes + if (configurationLikelihood > bestConfigurationLikelihood){ + bestConfigurationLikelihood = configurationLikelihood; + bestMVCount.clear(); + bestMVCount.add(mvCount); + bestFirstParentGenotype.clear(); + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.clear(); + bestSecondParentGenotype.add(secondParentGenotype.getKey()); + bestChildGenotype.clear(); + bestChildGenotype.add(childGenotype.getKey()); + } + else if(configurationLikelihood == bestConfigurationLikelihood) { + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.add(secondParentGenotype.getKey()); + bestChildGenotype.add(childGenotype.getKey()); + bestMVCount.add(mvCount); + } + } + } + //Evaluate configurations of parent/child pairs + if(parentsCalled<2){ + norm += configurationLikelihood; + //Keep this combination if + //It has a better likelihood + //Or it has the same likelihood but requires less changes from original genotypes + if (configurationLikelihood > bestConfigurationLikelihood){ + bestConfigurationLikelihood = configurationLikelihood; + bestMVCount.clear(); + bestMVCount.add(cumulativeMVCount/3); + bestChildGenotype.clear(); + bestFirstParentGenotype.clear(); + bestSecondParentGenotype.clear(); + bestChildGenotype.add(childGenotype.getKey()); + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.add(pairSecondParentGenotype); + } + else if(configurationLikelihood == bestConfigurationLikelihood) { + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.add(pairSecondParentGenotype); + bestChildGenotype.add(childGenotype.getKey()); + 
bestMVCount.add(cumulativeMVCount/3); + } + configurationLikelihood = 0; + } + } + } - ArrayList finalGenotypes = new ArrayList(); + //normalize the best configuration probability + bestConfigurationLikelihood = bestConfigurationLikelihood / norm; - for (Genotype phasedChildGenotype : possiblePhasedChildGenotypes) { - if (child.sameGenotype(phasedChildGenotype, true)) { - Allele momTransmittedAllele = phasedChildGenotype.getAllele(0); - Allele momUntransmittedAllele = mom.getAllele(0) != momTransmittedAllele ? mom.getAllele(0) : mom.getAllele(1); + //In case of multiple equally likely combinations, take a random one + if(bestFirstParentGenotype.size()>1){ + configuration_index = rand.nextInt(bestFirstParentGenotype.size()-1); + } - ArrayList phasedMomAlleles = new ArrayList(); - phasedMomAlleles.add(momTransmittedAllele); - phasedMomAlleles.add(momUntransmittedAllele); + } + else{ + bestConfigurationLikelihood = NO_TRANSMISSION_PROB; + } - Genotype phasedMomGenotype = new Genotype(mom.getSampleName(), phasedMomAlleles, mom.getNegLog10PError(), mom.getFilters(), mom.getAttributes(), true); + TrioPhase phasedTrioGenotypes; + if(parentsCalled < 2 && mother == null || !mother.isCalled()) + phasedTrioGenotypes = transmissionMatrix.get(bestSecondParentGenotype.get(configuration_index)).get(bestFirstParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); + else + phasedTrioGenotypes = transmissionMatrix.get(bestFirstParentGenotype.get(configuration_index)).get(bestSecondParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); - Allele dadTransmittedAllele = phasedChildGenotype.getAllele(1); - Allele dadUntransmittedAllele = dad.getAllele(0) != dadTransmittedAllele ? dad.getAllele(0) : dad.getAllele(1); + //Return the phased genotypes + phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes); + return bestMVCount.get(configuration_index); - ArrayList phasedDadAlleles = new ArrayList(); - phasedDadAlleles.add(dadTransmittedAllele); - phasedDadAlleles.add(dadUntransmittedAllele); + } - Genotype phasedDadGenotype = new Genotype(dad.getSampleName(), phasedDadAlleles, dad.getNegLog10PError(), dad.getFilters(), dad.getAttributes(), true); - finalGenotypes.add(phasedMomGenotype); - finalGenotypes.add(phasedDadGenotype); - finalGenotypes.add(phasedChildGenotype); + private void updatePairMetricsCounters(Genotype parent, Genotype child, int mvCount, HashMap counters){ - return finalGenotypes; + //Increment metrics counters + if(parent.isCalled() && child.isCalled()){ + counters.put(NUM_PAIR_GENOTYPES_CALLED,counters.get(NUM_PAIR_GENOTYPES_CALLED)+1); + if(parent.isPhased()) + counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1); + else{ + counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+mvCount); + if(parent.isHet() && child.isHet()) + counters.put(NUM_PAIR_HET_HET,counters.get(NUM_PAIR_HET_HET)+1); } + }else{ + counters.put(NUM_PAIR_GENOTYPES_NOCALL,counters.get(NUM_PAIR_GENOTYPES_NOCALL)+1); } - finalGenotypes.add(mom); - finalGenotypes.add(dad); - finalGenotypes.add(child); - - return finalGenotypes; } - private ArrayList phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child) { - ArrayList finalGenotypes = new ArrayList(); - finalGenotypes.add(mother); - finalGenotypes.add(father); - finalGenotypes.add(child); - - if (mother.isCalled() && father.isCalled() && child.isCalled()) { - ArrayList 
possibleMotherGenotypes = createAllThreeGenotypes(ref, alt, mother); - ArrayList possibleFatherGenotypes = createAllThreeGenotypes(ref, alt, father); - ArrayList possibleChildGenotypes = createAllThreeGenotypes(ref, alt, child); - - double bestConfigurationLikelihood = 0.0; - double bestPrior = 0.0; - Genotype bestMotherGenotype = mother; - Genotype bestFatherGenotype = father; - Genotype bestChildGenotype = child; - - double norm = 0.0; - - for (Genotype motherGenotype : possibleMotherGenotypes) { - for (Genotype fatherGenotype : possibleFatherGenotypes) { - for (Genotype childGenotype : possibleChildGenotypes) { - double prior = isMendelianViolation(ref, alt, motherGenotype, fatherGenotype, childGenotype) ? MENDELIAN_VIOLATION_PRIOR : 1.0 - 12*MENDELIAN_VIOLATION_PRIOR; - double configurationLikelihood = computeTransmissionLikelihoodOfGenotypeConfiguration(motherGenotype, fatherGenotype, childGenotype); - norm += prior*configurationLikelihood; - - if (prior*configurationLikelihood > bestPrior*bestConfigurationLikelihood) { - bestConfigurationLikelihood = configurationLikelihood; - bestPrior = prior; - bestMotherGenotype = motherGenotype; - bestFatherGenotype = fatherGenotype; - bestChildGenotype = childGenotype; - } - } - } - } + private void updateTrioMetricsCounters(Genotype mother, Genotype father, Genotype child, int mvCount, HashMap counters){ - if (!(bestMotherGenotype.isHet() && bestFatherGenotype.isHet() && bestChildGenotype.isHet())) { - Map attributes = new HashMap(); - attributes.putAll(bestChildGenotype.getAttributes()); - attributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, bestPrior*bestConfigurationLikelihood / norm); - bestChildGenotype = Genotype.modifyAttributes(bestChildGenotype, attributes); + //Increment metrics counters + if(mother.isCalled() && father.isCalled() && child.isCalled()){ + counters.put(NUM_TRIO_GENOTYPES_CALLED,counters.get(NUM_TRIO_GENOTYPES_CALLED)+1); + if(mother.isPhased()) + counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1); + + else{ + if(mvCount > 0){ + if(mvCount >1) + counters.put(NUM_TRIO_DOUBLE_VIOLATIONS,counters.get(NUM_TRIO_DOUBLE_VIOLATIONS)+1); + else + counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1); + } + else if(mother.isHet() && father.isHet() && child.isHet()) + counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1); - finalGenotypes = getPhasedGenotypes(bestMotherGenotype, bestFatherGenotype, bestChildGenotype); } + }else{ + counters.put(NUM_TRIO_GENOTYPES_NOCALL,counters.get(NUM_TRIO_GENOTYPES_NOCALL)+1); } - - return finalGenotypes; } /** @@ -292,55 +726,151 @@ private ArrayList phaseTrioGenotypes(Allele ref, Allele alt, Genotype * @return null */ @Override - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public HashMap map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + HashMap metricsCounters = new HashMap(10); + metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_TRIO_HET_HET_HET,0); + metricsCounters.put(NUM_TRIO_VIOLATIONS,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_PAIR_HET_HET,0); + metricsCounters.put(NUM_PAIR_VIOLATIONS,0); + metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); + 
metricsCounters.put(NUM_GENOTYPES_MODIFIED,0); + + String mvfLine; + if (tracker != null) { VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation()); + VariantContextBuilder builder = new VariantContextBuilder(vc); - Map genotypeMap = vc.getGenotypes(); + GenotypesContext genotypesContext = GenotypesContext.copy(vc.getGenotypes()); + for (Sample sample : trios) { + Genotype mother = vc.getGenotype(sample.getMaternalID()); + Genotype father = vc.getGenotype(sample.getPaternalID()); + Genotype child = vc.getGenotype(sample.getID()); - for (Trio trio : trios) { - Genotype mother = vc.getGenotype(trio.getMother()); - Genotype father = vc.getGenotype(trio.getFather()); - Genotype child = vc.getGenotype(trio.getChild()); + //Keep only trios and parent/child pairs + if(mother == null && father == null || child == null) + continue; - ArrayList trioGenotypes = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child); + ArrayList trioGenotypes = new ArrayList(3); + final int mvCount = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes); Genotype phasedMother = trioGenotypes.get(0); Genotype phasedFather = trioGenotypes.get(1); Genotype phasedChild = trioGenotypes.get(2); - genotypeMap.put(phasedMother.getSampleName(), phasedMother); - genotypeMap.put(phasedFather.getSampleName(), phasedFather); - genotypeMap.put(phasedChild.getSampleName(), phasedChild); - } + //Fill the genotype map with the new genotypes and increment metrics counters + genotypesContext.replace(phasedChild); + if(mother != null){ + genotypesContext.replace(phasedMother); + if(father != null){ + genotypesContext.replace(phasedFather); + updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters); + mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); + if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType())) + metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); + } + else{ + updatePairMetricsCounters(phasedMother,phasedChild,mvCount,metricsCounters); + if(!(phasedMother.getType()==mother.getType() && phasedChild.getType()==child.getType())) + metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); + mvfLine = 
String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t.:.:.:.\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); + } + } + else{ + genotypesContext.replace(phasedFather); + updatePairMetricsCounters(phasedFather,phasedChild,mvCount,metricsCounters); + if(!(phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType())) + metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); + mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); + } - VariantContext newvc = VariantContext.modifyGenotypes(vc, genotypeMap); + //Report violation if set so + //TODO: ADAPT FOR PAIRS TOO!! + if(mvCount>0 && mvFile != null) + mvFile.println(mvfLine); + } - vcfWriter.add(newvc); + builder.genotypes(genotypesContext); + vcfWriter.add(builder.make()); } - - return null; + return metricsCounters; } /** - * Provide an initial value for reduce computations. + * Initializes the reporting counters. * - * @return Initial value of reduce. + * @return All counters initialized to 0 */ @Override - public Integer reduceInit() { - return null; + public HashMap reduceInit() { + HashMap metricsCounters = new HashMap(10); + metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_TRIO_HET_HET_HET,0); + metricsCounters.put(NUM_TRIO_VIOLATIONS,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_PAIR_HET_HET,0); + metricsCounters.put(NUM_PAIR_VIOLATIONS,0); + metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); + metricsCounters.put(NUM_GENOTYPES_MODIFIED,0); + + return metricsCounters; } /** - * Reduces a single map with the accumulator provided as the ReduceType. + * Adds the value of the site phased to the reporting counters. * - * @param value result of the map. - * @param sum accumulator for the reduce. + * @param value Site values + * @param sum accumulator for the reporting counters * @return accumulator with result of the map taken into account. 
*/ @Override - public Integer reduce(Integer value, Integer sum) { - return null; + public HashMap reduce(HashMap value, HashMap sum) { + sum.put(NUM_TRIO_GENOTYPES_CALLED,value.get(NUM_TRIO_GENOTYPES_CALLED)+sum.get(NUM_TRIO_GENOTYPES_CALLED)); + sum.put(NUM_TRIO_GENOTYPES_NOCALL,value.get(NUM_TRIO_GENOTYPES_NOCALL)+sum.get(NUM_TRIO_GENOTYPES_NOCALL)); + sum.put(NUM_TRIO_GENOTYPES_PHASED,value.get(NUM_TRIO_GENOTYPES_PHASED)+sum.get(NUM_TRIO_GENOTYPES_PHASED)); + sum.put(NUM_TRIO_HET_HET_HET,value.get(NUM_TRIO_HET_HET_HET)+sum.get(NUM_TRIO_HET_HET_HET)); + sum.put(NUM_TRIO_VIOLATIONS,value.get(NUM_TRIO_VIOLATIONS)+sum.get(NUM_TRIO_VIOLATIONS)); + sum.put(NUM_PAIR_GENOTYPES_CALLED,value.get(NUM_PAIR_GENOTYPES_CALLED)+sum.get(NUM_PAIR_GENOTYPES_CALLED)); + sum.put(NUM_PAIR_GENOTYPES_NOCALL,value.get(NUM_PAIR_GENOTYPES_NOCALL)+sum.get(NUM_PAIR_GENOTYPES_NOCALL)); + sum.put(NUM_PAIR_GENOTYPES_PHASED,value.get(NUM_PAIR_GENOTYPES_PHASED)+sum.get(NUM_PAIR_GENOTYPES_PHASED)); + sum.put(NUM_PAIR_HET_HET,value.get(NUM_PAIR_HET_HET)+sum.get(NUM_PAIR_HET_HET)); + sum.put(NUM_PAIR_VIOLATIONS,value.get(NUM_PAIR_VIOLATIONS)+sum.get(NUM_PAIR_VIOLATIONS)); + sum.put(NUM_TRIO_DOUBLE_VIOLATIONS,value.get(NUM_TRIO_DOUBLE_VIOLATIONS)+sum.get(NUM_TRIO_DOUBLE_VIOLATIONS)); + sum.put(NUM_GENOTYPES_MODIFIED,value.get(NUM_GENOTYPES_MODIFIED)+sum.get(NUM_GENOTYPES_MODIFIED)); + + return sum; + } + + + /** + * Reports statistics on the phasing by transmission process. + * @param result Accumulator with all counters. + */ + @Override + public void onTraversalDone(HashMap result) { + logger.info("Number of complete trio-genotypes: " + result.get(NUM_TRIO_GENOTYPES_CALLED)); + logger.info("Number of trio-genotypes containing no call(s): " + result.get(NUM_TRIO_GENOTYPES_NOCALL)); + logger.info("Number of trio-genotypes phased: " + result.get(NUM_TRIO_GENOTYPES_PHASED)); + logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_TRIO_HET_HET_HET)); + logger.info("Number of remaining single mendelian violations in trios: " + result.get(NUM_TRIO_VIOLATIONS)); + logger.info("Number of remaining double mendelian violations in trios: " + result.get(NUM_TRIO_DOUBLE_VIOLATIONS)); + logger.info("Number of complete pair-genotypes: " + result.get(NUM_PAIR_GENOTYPES_CALLED)); + logger.info("Number of pair-genotypes containing no call(s): " + result.get(NUM_PAIR_GENOTYPES_NOCALL)); + logger.info("Number of pair-genotypes phased: " + result.get(NUM_PAIR_GENOTYPES_PHASED)); + logger.info("Number of resulting Het/Het pairs: " + result.get(NUM_PAIR_HET_HET)); + logger.info("Number of remaining mendelian violations in pairs: " + result.get(NUM_PAIR_VIOLATIONS)); + logger.info("Number of genotypes updated: " + result.get(NUM_GENOTYPES_MODIFIED)); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraph.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraph.java index fe27924755..8f980ad721 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraph.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraph.java @@ -23,12 +23,10 @@ */ package org.broadinstitute.sting.gatk.walkers.phasing; -import org.broadinstitute.sting.utils.DisjointSet; - import java.util.*; // Represents an undirected graph with no self-edges: -public class PhasingGraph implements Iterable { +class PhasingGraph implements Iterable { private Neighbors[] adj; public PhasingGraph(int numVertices) { diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraphEdge.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraphEdge.java index 56197a85f3..053b09439f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraphEdge.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingGraphEdge.java @@ -26,7 +26,7 @@ /* Edge class for PhasingGraph */ -public class PhasingGraphEdge implements Comparable { +class PhasingGraphEdge implements Comparable { protected int v1; protected int v2; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java index 63fb332950..a95b13d682 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java @@ -29,7 +29,7 @@ import java.util.Arrays; -public class PhasingRead extends BaseArray { +class PhasingRead extends BaseArray { private PreciseNonNegativeDouble mappingProb; // the probability that this read is mapped correctly private PreciseNonNegativeDouble[] baseProbs; // the probabilities that the base identities are CORRECT private PreciseNonNegativeDouble[] baseErrorProbs; // the probabilities that the base identities are INCORRECT diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingUtils.java new file mode 100644 index 0000000000..75d0773f1b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingUtils.java @@ -0,0 +1,382 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.phasing; + +import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.samtools.util.StringUtil; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.util.*; + +/** + * [Short one sentence description of this walker] + *
+ * <p/>
+ * <p/>
+ * [Functionality of this walker]
+ * <p/>
+ * <p/>
+ * <h2>Input</h2>
+ * <p/>
+ * [Input description]
+ * <p/>
+ * <p/>
+ * <h2>Output</h2>
+ * <p/>
+ * [Output description]
+ * <p/>
+ * <p/>
+ * <h2>Examples</h2>
+ * <pre>
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T $WalkerName
+ *  </pre>
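PhasingUtils.mergeIntoMNP below joins two phased records and the reference bases between them into a single MNP record. Conceptually, the merge concatenates allele-by-allele per haplotype; a hypothetical string-based illustration (not the GATK Allele/VariantContext data model):

class MnpMergeSketch {
    // site 1: A|G, intervening reference "T", site 2: C|C  ->  ATC|GTC
    static String[] mergeMnp(String[] site1, String intermediateRef, String[] site2) {
        return new String[] {
                site1[0] + intermediateRef + site2[0], // first haplotype
                site1[1] + intermediateRef + site2[1]  // second haplotype
        };
    }
}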
+ * + * @author Your Name + * @since Date created + */ +class PhasingUtils { + static VariantContext mergeIntoMNP(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile, AlleleMergeRule alleleMergeRule) { + if (!mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2)) + return null; + + // Check that it's logically possible to merge the VCs: + if (!allSamplesAreMergeable(vc1, vc2)) + return null; + + // Check if there's a "point" in merging the VCs (e.g., annotations could be changed) + if (!alleleMergeRule.allelesShouldBeMerged(vc1, vc2)) + return null; + + return reallyMergeIntoMNP(vc1, vc2, referenceFile); + } + + static VariantContext reallyMergeIntoMNP(VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) { + int startInter = vc1.getEnd() + 1; + int endInter = vc2.getStart() - 1; + byte[] intermediateBases = null; + if (startInter <= endInter) { + intermediateBases = referenceFile.getSubsequenceAt(vc1.getChr(), startInter, endInter).getBases(); + StringUtil.toUpperCase(intermediateBases); + } + MergedAllelesData mergeData = new MergedAllelesData(intermediateBases, vc1, vc2); // ensures that the reference allele is added + + GenotypesContext mergedGenotypes = GenotypesContext.create(); + for (final Genotype gt1 : vc1.getGenotypes()) { + Genotype gt2 = vc2.getGenotype(gt1.getSampleName()); + + List site1Alleles = gt1.getAlleles(); + List site2Alleles = gt2.getAlleles(); + + List mergedAllelesForSample = new LinkedList(); + + /* NOTE: Since merged alleles are added to mergedAllelesForSample in the SAME order as in the input VC records, + we preserve phase information (if any) relative to whatever precedes vc1: + */ + Iterator all2It = site2Alleles.iterator(); + for (Allele all1 : site1Alleles) { + Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable() + + Allele mergedAllele = mergeData.ensureMergedAllele(all1, all2); + mergedAllelesForSample.add(mergedAllele); + } + + double mergedGQ = Math.max(gt1.getLog10PError(), gt2.getLog10PError()); + Set mergedGtFilters = new HashSet(); // Since gt1 and gt2 were unfiltered, the Genotype remains unfiltered + + Map mergedGtAttribs = new HashMap(); + PhaseAndQuality phaseQual = calcPhaseForMergedGenotypes(gt1, gt2); + if (phaseQual.PQ != null) + mergedGtAttribs.put(ReadBackedPhasingWalker.PQ_KEY, phaseQual.PQ); + + Genotype mergedGt = new Genotype(gt1.getSampleName(), mergedAllelesForSample, mergedGQ, mergedGtFilters, mergedGtAttribs, phaseQual.isPhased); + mergedGenotypes.add(mergedGt); + } + + String mergedName = mergeVariantContextNames(vc1.getSource(), vc2.getSource()); + double mergedLog10PError = Math.min(vc1.getLog10PError(), vc2.getLog10PError()); + Set mergedFilters = new HashSet(); // Since vc1 and vc2 were unfiltered, the merged record remains unfiltered + Map mergedAttribs = mergeVariantContextAttributes(vc1, vc2); + + // ids + List mergedIDs = new ArrayList(); + if ( vc1.hasID() ) mergedIDs.add(vc1.getID()); + if ( vc2.hasID() ) mergedIDs.add(vc2.getID()); + String mergedID = mergedIDs.isEmpty() ? 
VCFConstants.EMPTY_ID_FIELD : Utils.join(VCFConstants.ID_FIELD_SEPARATOR, mergedIDs); + + VariantContextBuilder mergedBuilder = new VariantContextBuilder(mergedName, vc1.getChr(), vc1.getStart(), vc2.getEnd(), mergeData.getAllMergedAlleles()).id(mergedID).genotypes(mergedGenotypes).log10PError(mergedLog10PError).filters(mergedFilters).attributes(mergedAttribs); + VariantContextUtils.calculateChromosomeCounts(mergedBuilder, true); + return mergedBuilder.make(); + } + + static String mergeVariantContextNames(String name1, String name2) { + return name1 + "_" + name2; + } + + static Map mergeVariantContextAttributes(VariantContext vc1, VariantContext vc2) { + Map mergedAttribs = new HashMap(); + + List vcList = new LinkedList(); + vcList.add(vc1); + vcList.add(vc2); + + String[] MERGE_OR_ATTRIBS = {VCFConstants.DBSNP_KEY}; + for (String orAttrib : MERGE_OR_ATTRIBS) { + boolean attribVal = false; + for (VariantContext vc : vcList) { + attribVal = vc.getAttributeAsBoolean(orAttrib, false); + if (attribVal) // already true, so no reason to continue: + break; + } + mergedAttribs.put(orAttrib, attribVal); + } + + return mergedAttribs; + } + + static boolean mergeIntoMNPvalidationCheck(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2) { + GenomeLoc loc1 = VariantContextUtils.getLocation(genomeLocParser, vc1); + GenomeLoc loc2 = VariantContextUtils.getLocation(genomeLocParser, vc2); + + if (!loc1.onSameContig(loc2)) + throw new ReviewedStingException("Can only merge vc1, vc2 if on the same chromosome"); + + if (!loc1.isBefore(loc2)) + throw new ReviewedStingException("Can only merge if vc1 is BEFORE vc2"); + + if (vc1.isFiltered() || vc2.isFiltered()) + return false; + + if (!vc1.getSampleNames().equals(vc2.getSampleNames())) // vc1, vc2 refer to different sample sets + return false; + + if (!allGenotypesAreUnfilteredAndCalled(vc1) || !allGenotypesAreUnfilteredAndCalled(vc2)) + return false; + + return true; + } + + static boolean allGenotypesAreUnfilteredAndCalled(VariantContext vc) { + for (final Genotype gt : vc.getGenotypes()) { + if (gt.isNoCall() || gt.isFiltered()) + return false; + } + + return true; + } + + static boolean allSamplesAreMergeable(VariantContext vc1, VariantContext vc2) { + // Check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1: + for (final Genotype gt1 : vc1.getGenotypes()) { + Genotype gt2 = vc2.getGenotype(gt1.getSampleName()); + + if (!alleleSegregationIsKnown(gt1, gt2)) // can merge if: phased, or if either is a hom + return false; + } + + return true; + } + + static boolean alleleSegregationIsKnown(Genotype gt1, Genotype gt2) { + if (gt1.getPloidy() != gt2.getPloidy()) + return false; + + /* If gt2 is phased or hom, then could even be MERGED with gt1 [This is standard]. + + HOWEVER, EVEN if this is not the case, but gt1.isHom(), + it is trivially known that each of gt2's alleles segregate with the single allele type present in gt1. 
+ */ + return (gt2.isPhased() || gt2.isHom() || gt1.isHom()); + } + + static PhaseAndQuality calcPhaseForMergedGenotypes(Genotype gt1, Genotype gt2) { + if (gt2.isPhased() || gt2.isHom()) + return new PhaseAndQuality(gt1); // maintain the phase of gt1 + + if (!gt1.isHom()) + throw new ReviewedStingException("alleleSegregationIsKnown(gt1, gt2) implies: gt2.genotypesArePhased() || gt2.isHom() || gt1.isHom()"); + + /* We're dealing with: gt1.isHom(), gt2.isHet(), !gt2.genotypesArePhased(); so, the merged (het) Genotype is not phased relative to the previous Genotype + + For example, if we're merging the third Genotype with the second one: + 0/1 + 1|1 + 0/1 + + Then, we want to output: + 0/1 + 1/2 + */ + return new PhaseAndQuality(gt2); // maintain the phase of gt2 [since !gt2.genotypesArePhased()] + } + + static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) { + for (final Genotype gt1 : vc1.getGenotypes()) { + Genotype gt2 = vc2.getGenotype(gt1.getSampleName()); + + List site1Alleles = gt1.getAlleles(); + List site2Alleles = gt2.getAlleles(); + + Iterator all2It = site2Alleles.iterator(); + for (Allele all1 : site1Alleles) { + Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable() + + if (all1.isNonReference() && all2.isNonReference()) // corresponding alleles are alternate + return true; + } + } + + return false; + } + + static boolean doubleAllelesSegregatePerfectlyAmongSamples(VariantContext vc1, VariantContext vc2) { + // Check that Alleles at vc1 and at vc2 always segregate together in all samples (including reference): + Map allele1ToAllele2 = new HashMap(); + Map allele2ToAllele1 = new HashMap(); + + // Note the segregation of the alleles for the reference genome: + allele1ToAllele2.put(vc1.getReference(), vc2.getReference()); + allele2ToAllele1.put(vc2.getReference(), vc1.getReference()); + + // Note the segregation of the alleles for each sample (and check that it is consistent with the reference and all previous samples). 
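The loop below enforces that this pairing is a consistent one-to-one map in both directions. A minimal standalone sketch of the same invariant, using plain strings instead of GATK Allele objects (all names here are hypothetical, not part of this changeset):

import java.util.HashMap;
import java.util.Map;

class SegregationSketch {
    // Each row is one chromosome's (site-1 allele, site-2 allele) pair, pooled over samples.
    static boolean segregatesPerfectly(String[][] pairs) {
        Map<String, String> oneToTwo = new HashMap<String, String>();
        Map<String, String> twoToOne = new HashMap<String, String>();
        for (String[] p : pairs) {
            String prev12 = oneToTwo.put(p[0], p[1]);
            if (prev12 != null && !prev12.equals(p[1]))
                return false; // a site-1 allele paired with two different site-2 alleles
            String prev21 = twoToOne.put(p[1], p[0]);
            if (prev21 != null && !prev21.equals(p[0]))
                return false; // a site-2 allele paired with two different site-1 alleles
        }
        return true;
    }

    public static void main(String[] args) {
        // A always travels with C, and G with T: perfect segregation
        System.out.println(segregatesPerfectly(new String[][]{{"A", "C"}, {"G", "T"}, {"A", "C"}})); // true
        // A pairs with both C and T: not mergeable into a single MNP allele pair
        System.out.println(segregatesPerfectly(new String[][]{{"A", "C"}, {"A", "T"}}));             // false
    }
}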
+ for (final Genotype gt1 : vc1.getGenotypes()) { + Genotype gt2 = vc2.getGenotype(gt1.getSampleName()); + + List site1Alleles = gt1.getAlleles(); + List site2Alleles = gt2.getAlleles(); + + Iterator all2It = site2Alleles.iterator(); + for (Allele all1 : site1Alleles) { + Allele all2 = all2It.next(); + + Allele all1To2 = allele1ToAllele2.get(all1); + if (all1To2 == null) + allele1ToAllele2.put(all1, all2); + else if (!all1To2.equals(all2)) // all1 segregates with two different alleles at site 2 + return false; + + Allele all2To1 = allele2ToAllele1.get(all2); + if (all2To1 == null) + allele2ToAllele1.put(all2, all1); + else if (!all2To1.equals(all1)) // all2 segregates with two different alleles at site 1 + return false; + } + } + + return true; + } + + abstract static class AlleleMergeRule { + // vc1, vc2 are ONLY passed to allelesShouldBeMerged() if mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2) AND allSamplesAreMergeable(vc1, vc2): + abstract public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2); + + public String toString() { + return "all samples are mergeable"; + } + } + + static class AlleleOneAndTwo { + private Allele all1; + private Allele all2; + + public AlleleOneAndTwo(Allele all1, Allele all2) { + this.all1 = all1; + this.all2 = all2; + } + + public int hashCode() { + return all1.hashCode() + all2.hashCode(); + } + + public boolean equals(Object other) { + if (!(other instanceof AlleleOneAndTwo)) + return false; + + AlleleOneAndTwo otherAot = (AlleleOneAndTwo) other; + return (this.all1.equals(otherAot.all1) && this.all2.equals(otherAot.all2)); + } + } + + static class MergedAllelesData { + private Map mergedAlleles; + private byte[] intermediateBases; + private int intermediateLength; + + public MergedAllelesData(byte[] intermediateBases, VariantContext vc1, VariantContext vc2) { + this.mergedAlleles = new HashMap(); // implemented equals() and hashCode() for AlleleOneAndTwo + this.intermediateBases = intermediateBases; + this.intermediateLength = this.intermediateBases != null ? 
this.intermediateBases.length : 0; + + this.ensureMergedAllele(vc1.getReference(), vc2.getReference(), true); + } + + public Allele ensureMergedAllele(Allele all1, Allele all2) { + return ensureMergedAllele(all1, all2, false); // false <-> since even if all1+all2 = reference, it was already created in the constructor + } + + private Allele ensureMergedAllele(Allele all1, Allele all2, boolean creatingReferenceForFirstTime) { + AlleleOneAndTwo all12 = new AlleleOneAndTwo(all1, all2); + Allele mergedAllele = mergedAlleles.get(all12); + + if (mergedAllele == null) { + byte[] bases1 = all1.getBases(); + byte[] bases2 = all2.getBases(); + + byte[] mergedBases = new byte[bases1.length + intermediateLength + bases2.length]; + System.arraycopy(bases1, 0, mergedBases, 0, bases1.length); + if (intermediateBases != null) + System.arraycopy(intermediateBases, 0, mergedBases, bases1.length, intermediateLength); + System.arraycopy(bases2, 0, mergedBases, bases1.length + intermediateLength, bases2.length); + + mergedAllele = Allele.create(mergedBases, creatingReferenceForFirstTime); + mergedAlleles.put(all12, mergedAllele); + } + + return mergedAllele; + } + + public Set getAllMergedAlleles() { + return new HashSet(mergedAlleles.values()); + } + } + + static class PhaseAndQuality { + public boolean isPhased; + public Double PQ = null; + + public PhaseAndQuality(Genotype gt) { + this.isPhased = gt.isPhased(); + if (this.isPhased) { + this.PQ = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1); + if ( this.PQ == -1 ) this.PQ = null; + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PreciseNonNegativeDouble.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PreciseNonNegativeDouble.java index 99446705e5..b68739b48e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PreciseNonNegativeDouble.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PreciseNonNegativeDouble.java @@ -26,7 +26,7 @@ /* PreciseNonNegativeDouble permits arithmetic operations on NON-NEGATIVE double values with precision (prevents underflow by representing in log10 space). 
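A side note on the log10-space representation described here: products become sums of logs, and sums need the standard rescaling trick so the exponentiation cannot overflow. A minimal sketch of that arithmetic under the same convention (an illustration only, not the class's actual API):

final class Log10Math {
    // log10(a * b) from log10 inputs: just add.
    static double log10Product(double log10a, double log10b) {
        return log10a + log10b;
    }

    // log10(a + b) from log10 inputs, without leaving log space:
    // factor out the larger operand so the remaining exponent is <= 0.
    static double log10Sum(double log10a, double log10b) {
        double max = Math.max(log10a, log10b);
        double min = Math.min(log10a, log10b);
        if (max == Double.NEGATIVE_INFINITY)
            return Double.NEGATIVE_INFINITY; // both operands represent zero
        return max + Math.log10(1.0 + Math.pow(10.0, min - max));
    }
}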
*/ -public class PreciseNonNegativeDouble implements Comparable { +class PreciseNonNegativeDouble implements Comparable { private static final double EQUALS_THRESH = 1e-6; private static final double INFINITY = Double.POSITIVE_INFINITY; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java index 68fbe8ce21..9470ce2f48 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java @@ -34,17 +34,13 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.DisjointSet; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.HasGenomeLocation; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.*; import java.util.*; @@ -125,7 +121,8 @@ public class ReadBackedPhasingWalker extends RodWalker samplesToPhase = null; + protected Set + samplesToPhase = null; private GenomeLoc mostDownstreamLocusReached = null; @@ -275,10 +272,10 @@ else if (context.hasExtendedEventPileup()) { private static final Set KEYS_TO_KEEP_IN_REDUCED_VCF = new HashSet(Arrays.asList(PQ_KEY)); - private VariantContext reduceVCToSamples(VariantContext vc, List samplesToPhase) { + private VariantContext reduceVCToSamples(VariantContext vc, Set samplesToPhase) { // for ( String sample : samplesToPhase ) // logger.debug(String.format(" Sample %s has genotype %s, het = %s", sample, vc.getGenotype(sample), vc.getGenotype(sample).isHet() )); - VariantContext subvc = vc.subContextFromGenotypes(vc.getGenotypes(samplesToPhase).values()); + VariantContext subvc = vc.subContextFromSamples(samplesToPhase); // logger.debug("original VC = " + vc); // logger.debug("sub VC = " + subvc); return VariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF); @@ -355,17 +352,16 @@ private void phaseSite(VariantAndReads vr, PhasingStats phaseStats) { UnfinishedVariantContext uvc = uvr.unfinishedVariant; // Perform per-sample phasing: - Map sampGenotypes = vc.getGenotypes(); + GenotypesContext sampGenotypes = vc.getGenotypes(); Map samplePhaseStats = new TreeMap(); - for (Map.Entry sampGtEntry : sampGenotypes.entrySet()) { - String samp = sampGtEntry.getKey(); - Genotype gt = sampGtEntry.getValue(); + for (final Genotype gt : sampGenotypes) { + String samp = gt.getSampleName(); if (DEBUG) logger.debug("sample = " + samp); if (isUnfilteredCalledDiploidGenotype(gt)) { if (gt.isHom()) { // Note that this Genotype may be replaced later to contain the PQ of a downstream het site that was phased relative to a het site lying upstream of this hom site: // true <-> can trivially phase a hom site relative to ANY previous site: - Genotype phasedGt = new 
Genotype(gt.getSampleName(), gt.getAlleles(), gt.getNegLog10PError(), gt.getFilters(), gt.getAttributes(), true); + Genotype phasedGt = new Genotype(gt.getSampleName(), gt.getAlleles(), gt.getLog10PError(), gt.getFilters(), gt.getAttributes(), true); uvc.setGenotype(samp, phasedGt); } else if (gt.isHet()) { // Attempt to phase this het genotype relative to the previous het genotype @@ -401,7 +397,7 @@ else if (gt.isHet()) { // Attempt to phase this het genotype relative to the pre ensurePhasing(allelePair, prevAllelePair, pr.haplotype); Map gtAttribs = new HashMap(gt.getAttributes()); gtAttribs.put(PQ_KEY, pr.phaseQuality); - Genotype phasedGt = new Genotype(gt.getSampleName(), allelePair.getAllelesAsList(), gt.getNegLog10PError(), gt.getFilters(), gtAttribs, genotypesArePhased); + Genotype phasedGt = new Genotype(gt.getSampleName(), allelePair.getAllelesAsList(), gt.getLog10PError(), gt.getFilters(), gtAttribs, genotypesArePhased); uvc.setGenotype(samp, phasedGt); } @@ -421,7 +417,7 @@ else if (gt.isHet()) { // Attempt to phase this het genotype relative to the pre if (genotypesArePhased) { Map handledGtAttribs = new HashMap(handledGt.getAttributes()); handledGtAttribs.put(PQ_KEY, pr.phaseQuality); - Genotype phasedHomGt = new Genotype(handledGt.getSampleName(), handledGt.getAlleles(), handledGt.getNegLog10PError(), handledGt.getFilters(), handledGtAttribs, genotypesArePhased); + Genotype phasedHomGt = new Genotype(handledGt.getSampleName(), handledGt.getAlleles(), handledGt.getLog10PError(), handledGt.getFilters(), handledGtAttribs, genotypesArePhased); interiorUvc.setGenotype(samp, phasedHomGt); } } @@ -1055,7 +1051,7 @@ private void writeVcList(List varContList) { private void writeVCF(VariantContext vc) { if (samplesToPhase == null || vc.isNotFiltered()) //if ( samplesToPhase == null || (vc.isVariant() && vc.isNotFiltered())) // if we are only operating on specific samples, don't write out all sites, just those where the VC is variant - WriteVCF.writeVCF(vc, writer, logger); + writer.add(vc); } public static boolean processVariantInPhasing(VariantContext vc) { @@ -1126,25 +1122,34 @@ private class UnfinishedVariantContext implements HasGenomeLocation { private int start; private int stop; private Collection alleles; - private Map genotypes; - private double negLog10PError; + private Map genotypes; + private double log10PError; private Set filters; private Map attributes; + private String id; public UnfinishedVariantContext(VariantContext vc) { this.name = vc.getSource(); + this.id = vc.getID(); this.contig = vc.getChr(); this.start = vc.getStart(); this.stop = vc.getEnd(); this.alleles = vc.getAlleles(); - this.genotypes = new HashMap(vc.getGenotypes()); // since vc.getGenotypes() is unmodifiable - this.negLog10PError = vc.getNegLog10PError(); + + this.genotypes = new HashMap(); + for ( final Genotype g : vc.getGenotypes() ) { + this.genotypes.put(g.getSampleName(), g); + } + + this.log10PError = vc.getLog10PError(); this.filters = vc.filtersWereApplied() ? 
vc.getFilters() : null; this.attributes = new HashMap(vc.getAttributes()); } public VariantContext toVariantContext() { - return new VariantContext(name, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes); + GenotypesContext gc = GenotypesContext.copy(this.genotypes.values()); + return new VariantContextBuilder(name, contig, start, stop, alleles).id(id) + .genotypes(gc).log10PError(log10PError).filters(filters).attributes(attributes).make(); } public GenomeLoc getLocation() { @@ -1156,7 +1161,7 @@ public Genotype getGenotype(String sample) { } public void setGenotype(String sample, Genotype newGt) { - genotypes.put(sample, newGt); + this.genotypes.put(sample, newGt); } public void setPhasingInconsistent() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBase.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBase.java index ae15c3f128..bff45de7fb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBase.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBase.java @@ -23,7 +23,7 @@ */ package org.broadinstitute.sting.gatk.walkers.phasing; -public class ReadBase { +class ReadBase { public String readName; public byte base; public int mappingQual; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBasesAtPosition.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBasesAtPosition.java index e5652c56ef..813bc2e947 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBasesAtPosition.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBasesAtPosition.java @@ -28,7 +28,7 @@ import java.util.Iterator; import java.util.LinkedList; -public class ReadBasesAtPosition implements Iterable { +class ReadBasesAtPosition implements Iterable { // list of: private LinkedList bases; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java deleted file mode 100644 index f941408146..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.*; - -/* Some methods for extracting RefSeq-related data from annotated VCF INFO fields: - */ -public class RefSeqDataParser { - private static String REFSEQ_PREFIX = "refseq."; - - private static String NUM_RECORDS_KEY = REFSEQ_PREFIX + "numMatchingRecords"; - private static String NAME_KEY = REFSEQ_PREFIX + "name"; - private static String NAME2_KEY = REFSEQ_PREFIX + "name2"; - - private static String[] NAME_KEYS = {NAME_KEY, NAME2_KEY}; - - private static Map getRefSeqEntriesToNames(VariantContext vc, boolean getName2) { - String nameKeyToUse = getName2 ? NAME2_KEY : NAME_KEY; - String nameKeyToUseMultiplePrefix = nameKeyToUse + "_"; - - Map entriesToNames = new HashMap(); - int numRecords = vc.getAttributeAsInt(NUM_RECORDS_KEY, -1); - if (numRecords != -1) { - boolean done = false; - - if (numRecords == 1) { // Check if perhaps the single record doesn't end with "_1": - String name = vc.getAttributeAsString(nameKeyToUse, null); - if (name != null) { - entriesToNames.put(nameKeyToUse, name); - done = true; - } - } - - if (!done) { - for (int i = 1; i <= numRecords; i++) { - String key = nameKeyToUseMultiplePrefix + i; - String name = vc.getAttributeAsString(key, null); - if (name != null) - entriesToNames.put(key, name); - } - } - } - else { // no entry with the # of records: - String name = vc.getAttributeAsString(nameKeyToUse, null); - if (name != null) { - entriesToNames.put(nameKeyToUse, name); - } - else { // Check all INFO fields for a match (if there are multiple entries): - for (Map.Entry entry : vc.getAttributes().entrySet()) { - String key = entry.getKey(); - if (key.startsWith(nameKeyToUseMultiplePrefix)) - entriesToNames.put(key, entry.getValue().toString()); - } - } - } - return entriesToNames; - } - - private static Map getRefSeqEntriesToNames(VariantContext vc) { - return getRefSeqEntriesToNames(vc, false); - } - - public static Set getRefSeqNames(VariantContext vc, boolean getName2) { - return new TreeSet(getRefSeqEntriesToNames(vc, getName2).values()); - } - - public static Set getRefSeqNames(VariantContext vc) { - return getRefSeqNames(vc, false); - } - - public static Map getMergedRefSeqNameAttributes(VariantContext vc1, VariantContext vc2) { - Map refSeqNameAttribs = new HashMap(); - - Map entriesMap1 = getAllRefSeqEntriesByName(vc1); - Map entriesMap2 = getAllRefSeqEntriesByName(vc2); - - Set commonNames = entriesMap1.keySet(); - commonNames.retainAll(entriesMap2.keySet()); - boolean addSuffix = commonNames.size() > 1; - int nextCount = 1; - - for (String name : commonNames) { - RefSeqEntry refseq1 = entriesMap1.get(name); - RefSeqEntry refseq2 = entriesMap2.get(name); - - String keySuffix = ""; - if (addSuffix) - keySuffix = "_" + nextCount; - - boolean added = false; - for (String key : NAME_KEYS) { - Object obj1 = refseq1.info.get(key); - Object obj2 = refseq2.info.get(key); - if (obj1 != null && obj2 != null && obj1.equals(obj2)) { - added = true; - String useKey = key + keySuffix; - refSeqNameAttribs.put(useKey, obj1); - } - } - if (added) - nextCount++; - } - int totalCount = nextCount - 1; // since incremented count one extra time - if (totalCount > 1) - refSeqNameAttribs.put(NUM_RECORDS_KEY, totalCount); - - return refSeqNameAttribs; - } - - public static Map removeRefSeqAttributes(Map attributes) { - Map removedRefSeqAttributes = new HashMap(attributes); - - Iterator> attrIt = 
removedRefSeqAttributes.entrySet().iterator(); - while (attrIt.hasNext()) { - String key = attrIt.next().getKey(); - if (key.startsWith(REFSEQ_PREFIX)) - attrIt.remove(); - } - - return removedRefSeqAttributes; - } - - private static Map getAllRefSeqEntriesByName(VariantContext vc) { - Map nameToEntries = new TreeMap(); - - List allEntries = getAllRefSeqEntries(vc); - for (RefSeqEntry entry : allEntries) { - Object name = entry.info.get(NAME_KEY); - if (name != null) - nameToEntries.put(name.toString(), entry); - } - - return nameToEntries; - } - - // Returns a List of SEPARATE Map for EACH RefSeq annotation (i.e., each gene), stripping out the "_1", "_2", etc. - private static List getAllRefSeqEntries(VariantContext vc) { - List allRefSeq = new LinkedList(); - - for (Map.Entry entryToName : getRefSeqEntriesToNames(vc).entrySet()) { - String entry = entryToName.getKey(); - String entrySuffix = entry.replaceFirst(NAME_KEY, ""); - allRefSeq.add(new RefSeqEntry(vc, entrySuffix)); - } - - return allRefSeq; - } - - private static class RefSeqEntry { - public Map info; - - public RefSeqEntry(VariantContext vc, String entrySuffix) { - this.info = new HashMap(); - - for (Map.Entry attribEntry : vc.getAttributes().entrySet()) { - String key = attribEntry.getKey(); - if (key.startsWith(REFSEQ_PREFIX) && key.endsWith(entrySuffix)) { - String genericKey = key.replaceAll(entrySuffix, ""); - this.info.put(genericKey, attribEntry.getValue()); - } - } - } - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java index 153c4a23f0..6a2381e295 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java @@ -28,7 +28,7 @@ import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; -public class SNPallelePair extends AllelePair { +class SNPallelePair extends AllelePair { public SNPallelePair(Genotype gt) { super(gt); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java index e10334a777..6b4fec04e8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java @@ -6,9 +6,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import java.util.Arrays; import java.util.EnumSet; -import java.util.List; /* * Copyright (c) 2009 The Broad Institute diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java index fd55d78a02..f370e2818a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java @@ -39,8 +39,8 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import 
org.broadinstitute.sting.utils.variantcontext.MutableVariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.util.Map; @@ -365,7 +365,7 @@ public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, Alignm return counter; // Do not operate on variants that are not covered to the optional minimum depth - if (!context.hasReads() || (minDepth > 0 && context.getBasePileup().getBases().length < minDepth)) { + if (!context.hasReads() || !context.hasBasePileup() || (minDepth > 0 && context.getBasePileup().getBases().length < minDepth)) { counter.nUncovered = 1L; if (vcComp.getAttribute("GV").equals("T")) counter.nAltNotCalled = 1L; @@ -466,9 +466,7 @@ else if (vcComp.getAttribute("GV").equals("F")) if (vcfWriter != null && writeVariant) { if (!vcComp.hasAttribute("callStatus")) { - MutableVariantContext mvc = new MutableVariantContext(vcComp); - mvc.putAttribute("callStatus", call.isCalledAlt(callConf) ? "ALT" : "REF" ); - vcfWriter.add(mvc); + vcfWriter.add(new VariantContextBuilder(vcComp).attribute("callStatus", call.isCalledAlt(callConf) ? "ALT" : "REF").make()); } else vcfWriter.add(vcComp); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java index 035d8d2ca9..b27bef2650 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java @@ -7,10 +7,7 @@ import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; import org.broadinstitute.sting.alignment.bwa.BWTFiles; import org.broadinstitute.sting.alignment.bwa.c.BWACAligner; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -126,6 +123,11 @@ public class ValidationAmplicons extends RodWalker { @Argument(doc="Do not use BWA, lower-case repeats only",fullName="doNotUseBWA",required=false) boolean doNotUseBWA = false; + @Hidden + @Argument(doc="Use Sequenom output format instead of regular FASTA",fullName="sqnm",required=false) + boolean sequenomOutput = false; + + GenomeLoc prevInterval; GenomeLoc allelePos; String probeName; @@ -258,7 +260,7 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo } } } else /* (mask != null && validate == null ) */ { - if ( ! mask.isSNP() && ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphic() )) { + if ( ! mask.isSNP() && ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphicInSamples() )) { logger.warn("Mask Variant Context on the following warning line is not a SNP. Currently we can only mask out SNPs. This probe will not be designed."); logger.warn(String.format("%s:%d-%d\t%s\t%s",mask.getChr(),mask.getStart(),mask.getEnd(),mask.isSimpleInsertion() ? 
"INS" : "DEL", Utils.join(",",mask.getAlleles()))); sequenceInvalid = true; @@ -279,7 +281,7 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo sequence.append('N'); indelCounter--; rawSequence.append(Character.toUpperCase((char)ref.getBase())); - } else if ( ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphic() )){ + } else if ( ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphicInSamples() )){ logger.debug("SNP in mask found at " + ref.getLocus().toString()); if ( lowerCaseSNPs ) { @@ -485,6 +487,13 @@ public void print() { } String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D'); - out.printf(">%s %s %s%n%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity); + + if (!sequenomOutput) + out.printf(">%s %s %s%n%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity); + else { + seqIdentity = seqIdentity.replace("*",""); // identifier < 20 letters long, no * in ref allele, one line per record + probeName = probeName.replace("amplicon_","a"); + out.printf("%s_%s %s%n", allelePos != null ? allelePos.toString() : "multiple", probeName, seqIdentity); + } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/yaml/FieldOrderComparator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/FrequencyModeSelector.java similarity index 55% rename from public/java/src/org/broadinstitute/sting/utils/yaml/FieldOrderComparator.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/FrequencyModeSelector.java index 2a043466af..62305d3c07 100644 --- a/public/java/src/org/broadinstitute/sting/utils/yaml/FieldOrderComparator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/FrequencyModeSelector.java @@ -21,32 +21,27 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ +package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; -package org.broadinstitute.sting.utils.yaml; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -import org.yaml.snakeyaml.introspector.Property; - -import java.lang.reflect.Field; import java.util.ArrayList; -import java.util.Comparator; -import java.util.List; +import java.util.HashMap; -/** - * Orders properties based on the order of the fields in the Java Bean. 
- */ -class FieldOrderComparator implements Comparator { - private final List propertyOrder; +public abstract class FrequencyModeSelector implements Cloneable{ - public FieldOrderComparator(Class clazz) { - propertyOrder = new ArrayList(); - for (Field field : clazz.getDeclaredFields()) - propertyOrder.add(field.getName()); - } + protected GenomeLocParser parser; - @Override - public int compare(Property one, Property two) { - Integer index1 = propertyOrder.indexOf(one.getName()); - Integer index2 = propertyOrder.indexOf(two.getName()); - return index1.compareTo(index2); + protected FrequencyModeSelector(GenomeLocParser parser) { + this.parser = parser; } + protected void logCurrentSiteData(VariantContext vc, boolean passesCriteria) { + logCurrentSiteData(vc, passesCriteria, false, false); + } + protected abstract void logCurrentSiteData(VariantContext vc, boolean included, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC); + protected abstract ArrayList selectValidationSites(int numValidationSites); + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java new file mode 100644 index 0000000000..ff3fe65061 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ +package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; + +import org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult; +import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TreeSet; + + +public class GLBasedSampleSelector extends SampleSelector { + Map numAllelePriorMatrix = new HashMap(); + double referenceLikelihood; + public GLBasedSampleSelector(TreeSet sm, double refLik) { + super(sm); + referenceLikelihood = refLik; + } + + public boolean selectSiteInSamples(VariantContext vc) { + if ( samples == null || samples.isEmpty() ) + return true; + // want to include a site in the given samples if it is *likely* to be variant (via the EXACT model) + // first subset to the samples + VariantContext subContext = vc.subContextFromSamples(samples); + + // now check to see (using EXACT model) whether this should be variant + // do we want to apply a prior? maybe user-spec? + double[][] flatPrior = createFlatPrior(vc.getAlleles()); + AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size(),2*samples.size()); + ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPrior,result,true); + // do we want to let this qual go up or down? + if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) { + return true; + } + + return false; + } + + private double[][] createFlatPrior(List alleles) { + if ( ! numAllelePriorMatrix.containsKey(alleles.size()) ) { + numAllelePriorMatrix.put(alleles.size(), new double[alleles.size()][1+2*samples.size()]); + } + + return numAllelePriorMatrix.get(alleles.size()); + } +} diff --git a/public/java/src/org/broadinstitute/sting/pipeline/Pipeline.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java similarity index 54% rename from public/java/src/org/broadinstitute/sting/pipeline/Pipeline.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java index e0e75c3534..c3987b9dbe 100644 --- a/public/java/src/org/broadinstitute/sting/pipeline/Pipeline.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java @@ -22,41 +22,34 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.pipeline; +package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; + +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; -import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; -/** - * Java bean for storing a list of samples for a pipeline. - * - * NOTE: This class is used in a very similar way to the classes in - * org.broadinstitute.sting.gatk.datasources.sample. - * - * Both store / load sample information from the file system as YAML. 
- * - * This package will likely be refactored to share common functionality - * with the other at a future date as requirements coalesce. - * - * - kshakir September 22, 2010 - */ -public class Pipeline { - private PipelineProject project = new PipelineProject(); - private List samples = new ArrayList(); - public PipelineProject getProject() { - return project; +public class GTBasedSampleSelector extends SampleSelector{ + public GTBasedSampleSelector(TreeSet sm) { + super(sm); } - public void setProject(PipelineProject project) { - this.project = project; - } + public boolean selectSiteInSamples(VariantContext vc) { + // Super class already defined initialization which filled data structure "samples" with desired samples. + // We only need to check if current vc if polymorphic in that set of samples - public List getSamples() { - return samples; - } + if ( samples == null || samples.isEmpty() ) + return true; + + VariantContext subContext = vc.subContextFromSamples(samples, vc.getAlleles()); + if ( subContext.isPolymorphicInSamples() ) { + return true; + } - public void setSamples(List samples) { - this.samples = samples; + return false; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GenomeEvent.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GenomeEvent.java new file mode 100644 index 0000000000..af6a520026 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GenomeEvent.java @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; + +import java.util.HashMap; +import java.util.List; + + +public class GenomeEvent implements Comparable { + final protected GenomeLoc loc; + /** A set of the alleles segregating in this context */ + final protected List alleles; + final protected Byte refBase; +// final protected HashMap attributes; + + public GenomeEvent(GenomeLocParser parser, final String contig, final int start, final int stop, final List alleles, HashMap attributes, + byte base) { + this.loc = parser.createGenomeLoc(contig, start, stop); + this.alleles = alleles; + this.refBase = base; +// this.attributes = attributes; + } + + // Routine to compare two variant contexts (useful to sort collections of vc's). + // By default, we want to sort first by contig, then by start location + + public GenomeLoc getGenomeLoc() { + return loc; + } + public int compareTo(final Object o) { + if (!(o instanceof GenomeEvent)) + throw new ReviewedStingException("BUG: comparing variant context with non-VC object"); + + GenomeEvent otherEvent = (GenomeEvent)o; + + return loc.compareTo(otherEvent.getGenomeLoc()); + } + + public VariantContext createVariantContextFromEvent() { + return new VariantContextBuilder("event", loc.getContig(), loc.getStart(), loc.getStop(), alleles) + .log10PError(0.0).referenceBaseForIndel(refBase).make(); + + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java new file mode 100644 index 0000000000..15274d21c7 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ +package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; + +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; + + +public class KeepAFSpectrumFrequencySelector extends FrequencyModeSelector { + + private static final boolean DEBUG = true; + + private int NUM_BINS = 20; + + private int[] preSampleSelectionHistogram; + private int numTotalSites = 0; + private int[] postSampleSelectionHistogram; + private int numSampleSelectedSites = 0; + private ArrayList[] binnedEventArray; + + public KeepAFSpectrumFrequencySelector(int numBins, GenomeLocParser parser) { + super(parser); + NUM_BINS = numBins; + // initialize arrays dependent on NUM_BINS + binnedEventArray = new ArrayList[NUM_BINS]; + + for (int k=0; k < NUM_BINS; k++) + binnedEventArray[k] = new ArrayList(); + + preSampleSelectionHistogram = new int[NUM_BINS]; + postSampleSelectionHistogram = new int[NUM_BINS]; + } + + public void logCurrentSiteData(VariantContext vc, boolean selectedInTargetSamples, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC) { + + // this method is called for every variant of a selected type, regardless of whether it will be selectable or not + // get AC,AF,AN attributes from vc + HashMap attributes = new HashMap(); + double[] afArray = null; + + if (vc.hasGenotypes() && !IGNORE_GENOTYPES) { + // recompute AF,AC,AN based on genotypes: + // todo - - maybe too inefficient?? + VariantContextUtils.calculateChromosomeCounts(vc, attributes, false); + afArray = new double[] {Double.valueOf((String)attributes.get(VCFConstants.ALLELE_FREQUENCY_KEY))}; + } else { + // sites-only vc or we explicitly tell to ignore genotypes; we trust the AF field if present + if ( vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) { + String afo = vc.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY, null); + + if (afo.contains(",")) { + String[] afs = afo.split(","); + afs[0] = afs[0].substring(1,afs[0].length()); + afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1); + + afArray = new double[afs.length]; + + for (int k=0; k < afArray.length; k++) + afArray[k] = Double.valueOf(afs[k]); + } + else + afArray = new double[] {Double.valueOf(afo)}; + } + } + + + if (afArray == null ) + return; + + double af0 = MathUtils.arrayMax(afArray); + + int binIndex = (NUM_BINS-1) - (int) Math.floor(((1.0-af0)*NUM_BINS)); + // deal with round-off issue: low-AC sites with large samples can have AF rounded down to 0.000 + if (binIndex < 0) + binIndex = 0; +// System.out.format("Pre:%4.4f %d\n",af0, binIndex); + preSampleSelectionHistogram[binIndex]++; + numTotalSites++; + + // now process VC subsetted to samples of interest + if (! 
selectedInTargetSamples && !IGNORE_POLYMORPHIC) + return; + + //System.out.format("Post:%4.4f %d\n",af0, binIndex); + postSampleSelectionHistogram[binIndex]++; + numSampleSelectedSites++; + + // create bare-bones event and log in corresponding bin + // attributes contains AC,AF,AN pulled from original vc, and we keep them here and log in output file for bookkeeping purposes + GenomeEvent event = new GenomeEvent(parser, vc.getChr(), vc.getStart(), vc.getEnd(),vc.getAlleles(), attributes, vc.getReferenceBaseForIndel()); + + binnedEventArray[binIndex].add(event); + + } + + public ArrayList selectValidationSites(int numValidationSites) { + // number of sites to choose at random for each frequency bin = #desired validation sites/# total sites * #sites in original bin + int[] sitesToChoosePerBin = new int[NUM_BINS]; + int totalSites = 0; + for (int k=0; k < NUM_BINS; k++) { + int sites = (int)Math.round((double)numValidationSites * preSampleSelectionHistogram[k]/ (double)numTotalSites); + sitesToChoosePerBin[k] = sites; + totalSites += sites; + } + + // deal with rounding artifacts + while (totalSites > numValidationSites) { + // take off one from randomly selected bin + int k= GenomeAnalysisEngine.getRandomGenerator().nextInt(NUM_BINS); + sitesToChoosePerBin[k]--; + totalSites--; + } + while (totalSites < numValidationSites) { + // take off one from randomly selected bin + int k= GenomeAnalysisEngine.getRandomGenerator().nextInt( NUM_BINS); + sitesToChoosePerBin[k]++; + totalSites++; + } + + if (DEBUG) { + System.out.println("sitesToChoosePerBin:"); + for (int k=0; k < NUM_BINS; k++) + System.out.format("%d ", sitesToChoosePerBin[k]); + System.out.println(); + + System.out.println("preSampleSelectionHistogram:"); + for (int k=0; k < NUM_BINS; k++) + System.out.format("%d ", preSampleSelectionHistogram[k]); + System.out.println(); + + System.out.println("postSampleSelectionHistogram:"); + for (int k=0; k < NUM_BINS; k++) + System.out.format("%d ", postSampleSelectionHistogram[k]); + System.out.println(); + + } + + // take randomly sitesToChoosePerBin[k] elements from each bin + ArrayList selectedEvents = new ArrayList(); + + for (int k=0; k < NUM_BINS; k++) { + selectedEvents.addAll(MathUtils.randomSubset(binnedEventArray[k], sitesToChoosePerBin[k])); + } + + Collections.sort(selectedEvents); + + // now convert to VC + ArrayList selectedSites = new ArrayList(); + for (GenomeEvent event : selectedEvents) + selectedSites.add(event.createVariantContextFromEvent()); + + return selectedSites; + + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/WriteVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/NullSampleSelector.java similarity index 78% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/WriteVCF.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/NullSampleSelector.java index c10eaa2da6..a48bcb8a17 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/WriteVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/NullSampleSelector.java @@ -21,14 +21,19 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. 
*/ -package org.broadinstitute.sting.gatk.walkers.phasing; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; +package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; + import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.util.TreeSet; + + +public class NullSampleSelector extends SampleSelector{ + public NullSampleSelector(TreeSet sm) { + super(sm); + } -public class WriteVCF { - public static void writeVCF(VariantContext vc, VCFWriter writer, Logger logger) { - writer.add(vc); + public boolean selectSiteInSamples(VariantContext vc) { + return true; } } diff --git a/public/java/src/org/broadinstitute/sting/pipeline/PipelineSample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/SampleSelector.java similarity index 58% rename from public/java/src/org/broadinstitute/sting/pipeline/PipelineSample.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/SampleSelector.java index 7cd25fed55..afbff93d04 100644 --- a/public/java/src/org/broadinstitute/sting/pipeline/PipelineSample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/SampleSelector.java @@ -21,42 +21,20 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ +package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; -package org.broadinstitute.sting.pipeline; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.util.TreeSet; -import java.io.File; -import java.util.Map; -import java.util.TreeMap; -/** - * Java bean defining a sample for a pipeline. - */ -public class PipelineSample { - private String id; - private Map bamFiles = new TreeMap(); - private Map tags = new TreeMap(); - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } +public abstract class SampleSelector implements Cloneable { - public Map getBamFiles() { - return bamFiles; + TreeSet samples; + protected SampleSelector(TreeSet sm) { + samples = new TreeSet(sm); } - public void setBamFiles(Map bamFiles) { - this.bamFiles = bamFiles; - } + protected abstract boolean selectSiteInSamples(VariantContext vc); - public Map getTags() { - return tags; - } - public void setTags(Map tags) { - this.tags = tags; - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/UniformSamplingFrequencySelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/UniformSamplingFrequencySelector.java new file mode 100644 index 0000000000..66720a2528 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/UniformSamplingFrequencySelector.java @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or 
substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; + +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; + +public class UniformSamplingFrequencySelector extends FrequencyModeSelector { + private ArrayList binnedEventArray; + + public UniformSamplingFrequencySelector(GenomeLocParser parser) { + super(parser); + binnedEventArray = new ArrayList(); + + } + + public void logCurrentSiteData(VariantContext vc, boolean selectedInTargetSamples, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC) { + HashMap attributes = new HashMap(); + + + if (vc.hasGenotypes() && !IGNORE_GENOTYPES) { + // recompute AF,AC,AN based on genotypes: + VariantContextUtils.calculateChromosomeCounts(vc, attributes, false); + if (! selectedInTargetSamples && !IGNORE_POLYMORPHIC) + return; + } else { + if ( attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY) ) { + int ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0); + if (ac == 0) return; // site not polymorphic + } + else + return; + + } + // create bare-bones event and log in corresponding bin + // attributes contains AC,AF,AN pulled from original vc, and we keep them here and log in output file for bookkeeping purposes + GenomeEvent event = new GenomeEvent(parser, vc.getChr(), vc.getStart(), vc.getEnd(),vc.getAlleles(), attributes, vc.getReferenceBaseForIndel()); + binnedEventArray.add(event); + + } + + public ArrayList selectValidationSites(int numValidationSites) { + + // take randomly sitesToChoosePerBin[k] elements from each bin + ArrayList selectedEvents = new ArrayList(); + + selectedEvents.addAll(MathUtils.randomSubset(binnedEventArray, numValidationSites)); + + Collections.sort(selectedEvents); + + // now convert to VC + ArrayList selectedSites = new ArrayList(); + for (GenomeEvent event : selectedEvents) + selectedSites.add(event.createVariantContextFromEvent()); + + return selectedSites; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelectorWalker.java new file mode 100644 index 0000000000..ae11d8102e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelectorWalker.java @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the 
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
+
+import org.broadinstitute.sting.commandline.*;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.RodWalker;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.SampleUtils;
+import org.broadinstitute.sting.utils.codecs.vcf.*;
+import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
+
+import java.io.File;
+import java.util.*;
+
+
+/**
+ * Randomly selects VCF records according to specified options.
+ *
+ * <p>
+ * ValidationSiteSelectorWalker is intended for use in experiments where we sample data randomly from a set of variants, for example
+ * in order to choose sites for a follow-up validation study.
+ *
+ * Sites are selected randomly, but within certain restrictions. There are two main sources of restrictions:
+ * a) Sample restrictions. A user can specify a set of samples, and only sites that are polymorphic within that sample subset are
+ * considered. The sample restrictions can be given as a set of individual samples, a text file (each line containing one sample
+ * name), or a regular expression. The user can additionally specify whether samples are assessed based on their genotypes (a
+ * non-reference genotype means the sample is polymorphic at that variant, and hence the variant is considered for inclusion in
+ * the set) or based on their PLs.
+ * b) A user can additionally specify a sampling method based on allele frequency. Two sampling methods are currently supported:
+ * 1. Uniform sampling simply samples uniformly from the variants that are polymorphic in the selected samples (see the sketch below).
+ * 2. Sampling based on the allele frequency spectrum ensures that the output sites have the same AF distribution as the input set.
+ *
+ * The user can additionally restrict the output to a particular type of variant (SNP, Indel, etc.).
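(For orientation, a minimal editorial sketch of what the UNIFORM mode amounts to. This is not the walker's code: MathUtils.randomSubset, which UniformSamplingFrequencySelector delegates to, is assumed here to behave like shuffle-and-truncate, and the class and method names below are illustrative only.)

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;
    import java.util.Random;

    // Illustrative sketch, not GATK code: draw k items uniformly at random.
    final class UniformSubsetSketch {
        static <T> List<T> randomSubset(final List<T> items, final int k, final Random rng) {
            final List<T> copy = new ArrayList<T>(items);
            Collections.shuffle(copy, rng);                   // uniform random permutation
            return copy.subList(0, Math.min(k, copy.size())); // first k form a uniform subset
        }
    }

(Shuffle-and-truncate runs in O(n) and makes every size-k subset equally likely, which is the guarantee uniform sampling needs.)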

+ *
+ * <h2>Input</h2>
+ * <p>
+ * One or more variant sets to choose from.
+ * </p>
+ *
+ * <h2>Output</h2>
+ * <p>
+ * A sites-only VCF with the desired number of randomly selected sites.
+ * </p>
+ *
+ * <h2>Examples</h2>
+ * <pre>
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T ValidationSiteSelectorWalker \
+ *   --variant input1.vcf \
+ *   --variant input2.vcf \
+ *   -sn NA12878 \
+ *   -o output.vcf \
+ *   --numValidationSites 200   \
+ *   -sampleMode  POLY_BASED_ON_GT \
+ *   -freqMode KEEP_AF_SPECTRUM
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T ValidationSiteSelectorWalker \
+ *   --variant:foo input1.vcf \
+ *   --variant:bar input2.vcf \
+ *   --numValidationSites 200 \
+ *   -sf samples.txt \
+ *   -o output.vcf \
+ *   -sampleMode  POLY_BASED_ON_GT \
+ *   -freqMode UNIFORM \
+ *   -selectType INDEL
+ * </pre>
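(KeepAFSpectrumFrequencySelector is referenced but not included in this patch; the following editorial sketch shows, under assumed details, the binning idea that its name and the numFrequencyBins argument suggest: bucket the logged sites into allele-frequency bins, then draw from each bin in proportion to its share of the input, so the output mirrors the input AF spectrum. All names here are illustrative.)

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;
    import java.util.Random;

    // Illustrative sketch, not GATK code: per-bin proportional sampling.
    final class AFSpectrumSketch {
        static <T> List<T> select(final List<List<T>> afBins, final int numSites, final Random rng) {
            int total = 0;
            for (final List<T> bin : afBins) total += bin.size();

            final List<T> selected = new ArrayList<T>();
            if (total == 0) return selected; // nothing was logged

            for (final List<T> bin : afBins) {
                // each bin contributes in proportion to its share of all logged sites
                final int quota = (int) Math.round(numSites * (double) bin.size() / total);
                final List<T> copy = new ArrayList<T>(bin);
                Collections.shuffle(copy, rng);               // uniform within the bin
                selected.addAll(copy.subList(0, Math.min(quota, copy.size())));
            }
            return selected; // rounding can leave the total off by a site or two
        }
    }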
+ *
+ */
+public class ValidationSiteSelectorWalker extends RodWalker<Integer, Integer> {
+
+    public enum AF_COMPUTATION_MODE {
+        KEEP_AF_SPECTRUM,
+        UNIFORM
+    }
+
+    public enum SAMPLE_SELECTION_MODE {
+        NONE,
+        POLY_BASED_ON_GT,
+        POLY_BASED_ON_GL
+    }
+
+    @Input(fullName="variant", shortName = "V", doc="Input VCF file, can be specified multiple times", required=true)
+    public List<RodBinding<VariantContext>> variants;
+
+    @Output(doc="File to which variants should be written", required=true)
+    protected VCFWriter vcfWriter = null;
+
+    @Argument(fullName="sample_name", shortName="sn", doc="Include genotypes from this sample. Can be specified multiple times", required=false)
+    public Set<String> sampleNames = new HashSet<String>(0);
+
+    @Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times", required=false)
+    public Set<String> sampleExpressions;
+
+    @Input(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line) to include. Can be specified multiple times", required=false)
+    public Set<File> sampleFiles;
+
+    @Argument(fullName="sampleMode", shortName="sampleMode", doc="Sample selection mode", required=false)
+    private SAMPLE_SELECTION_MODE sampleMode = SAMPLE_SELECTION_MODE.NONE;
+
+    @Argument(shortName="samplePNonref", fullName="samplePNonref", doc="GL-based selection mode only: the probability" +
+            " that a site is non-reference in the samples for which to include the site", required=false)
+    private double samplePNonref = 0.99;
+
+    @Argument(fullName="numValidationSites", shortName="numSites", doc="Number of output validation sites", required=true)
+    private int numValidationSites;
+
+    @Argument(fullName="includeFilteredSites", shortName="ifs", doc="If true, will include filtered sites in the set to choose variants from", required=false)
+    private boolean INCLUDE_FILTERED_SITES = false;
+
+    @Argument(fullName="ignoreGenotypes", shortName="ignoreGenotypes", doc="If true, will ignore genotypes in the VCF, take AC,AF from the annotations and make no sample selection", required=false)
+    private boolean IGNORE_GENOTYPES = false;
+
+    @Argument(fullName="ignorePolymorphicStatus", shortName="ignorePolymorphicStatus", doc="If true, will ignore the polymorphic status in the VCF and take the VCF record directly without pre-selection", required=false)
+    private boolean IGNORE_POLYMORPHIC = false;
+
+    @Hidden
+    @Argument(fullName="numFrequencyBins", shortName="numBins", doc="Number of frequency bins if we're to match the AF distribution", required=false)
+    private int numFrequencyBins = 20;
+
+    /**
+     * This argument selects the allele frequency selection mode:
+     * KEEP_AF_SPECTRUM will choose variants so that the resulting allele frequency spectrum matches the input set as closely as possible;
+     * UNIFORM will choose variants uniformly, without regard to their allele frequency.
+     */
+    @Argument(fullName="frequencySelectionMode", shortName="freqMode", doc="Allele Frequency selection mode", required=false)
+    private AF_COMPUTATION_MODE freqMode = AF_COMPUTATION_MODE.KEEP_AF_SPECTRUM;
+
+    /**
+     * This argument selects particular kinds of variants. If left empty, there is no type selection and all variant types are considered for the other selection criteria.
+     * When specified one or more times, only the given types of variant are selected.
+     */
+    @Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times", required=false)
+    private List<VariantContext.Type> TYPES_TO_INCLUDE = new ArrayList<VariantContext.Type>();
+
+
+    private TreeSet<String> samples = new TreeSet<String>();
+    SampleSelector sampleSelector = null;
+    FrequencyModeSelector frequencyModeSelector = null;
+    private ArrayList<VariantContext.Type> selectedTypes = new ArrayList<VariantContext.Type>();
+
+    public void initialize() {
+        // Get the list of samples to include in the output
+        Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit());
+        TreeSet<String> vcfSamples = new TreeSet<String>(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE));
+
+        Collection<String> samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles);
+        Collection<String> samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions);
+
+        // first, add any requested samples
+        samples.addAll(samplesFromFile);
+        samples.addAll(samplesFromExpressions);
+        samples.addAll(sampleNames);
+
+        // if none were requested, we want all of them
+        if ( samples.isEmpty() ) {
+            samples.addAll(vcfSamples);
+        }
+
+        sampleSelector = getSampleSelectorObject(sampleMode, samples);
+
+        // initialize the frequency mode selector
+        frequencyModeSelector = getFrequencyModeSelectorObject(freqMode, getToolkit().getGenomeLocParser());
+
+        // if the user specified types to include, add these; otherwise, add all possible variant context types
+        if (TYPES_TO_INCLUDE.isEmpty()) {
+            for (VariantContext.Type t : VariantContext.Type.values())
+                selectedTypes.add(t);
+        }
+        else {
+            for (VariantContext.Type t : TYPES_TO_INCLUDE)
+                selectedTypes.add(t);
+        }
+
+        Set<VCFHeaderLine> headerLines = new HashSet<VCFHeaderLine>();
+        headerLines.add(new VCFHeaderLine("source", "ValidationSiteSelector"));
+        vcfWriter.writeHeader(new VCFHeader(headerLines));
+    }
+
+    @Override
+    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+        if ( tracker == null )
+            return 0;
+
+        Collection<VariantContext> vcs = tracker.getValues(variants, context.getLocation());
+
+        if ( vcs == null || vcs.size() == 0) {
+            return 0;
+        }
+
+        for (VariantContext vc : vcs) {
+            if (!selectedTypes.contains(vc.getType()))
+                continue;
+
+            // skip if the site isn't polymorphic and the user didn't request to ignore polymorphic status
+            if (!vc.isPolymorphicInSamples() && !IGNORE_POLYMORPHIC)
+                continue;
+
+            if (!INCLUDE_FILTERED_SITES && vc.filtersWereApplied() && vc.isFiltered())
+                continue;
+
+            // does this site pass the criteria for the samples we are interested in?
+            boolean passesSampleSelectionCriteria;
+            if (samples.isEmpty())
+                passesSampleSelectionCriteria = true;
+            else
+                passesSampleSelectionCriteria = sampleSelector.selectSiteInSamples(vc);
+
+            frequencyModeSelector.logCurrentSiteData(vc, passesSampleSelectionCriteria, IGNORE_GENOTYPES, IGNORE_POLYMORPHIC);
+        }
+        return 1;
+    }
+
+    @Override
+    public Integer reduceInit() { return 0; }
+
+    @Override
+    public Integer reduce(Integer value, Integer sum) { return value + sum; }
+
+    public void onTraversalDone(Integer result) {
+        logger.info("Outputting validation sites...");
+        ArrayList<VariantContext> selectedSites = frequencyModeSelector.selectValidationSites(numValidationSites);
+
+        for (VariantContext vc : selectedSites) {
+            vcfWriter.add(vc);
+        }
+        logger.info(result + " records processed.");
+    }
+
+    private SampleSelector getSampleSelectorObject(SAMPLE_SELECTION_MODE sampleMode, TreeSet<String> samples) {
+        SampleSelector sm;
+        switch ( sampleMode ) {
+            case POLY_BASED_ON_GL:
+                sm = new GLBasedSampleSelector(samples, Math.log10(1.0 - samplePNonref));
+                break;
+            case POLY_BASED_ON_GT:
+                sm = new GTBasedSampleSelector(samples);
+                break;
+            case NONE:
+                sm = new NullSampleSelector(samples);
+                break;
+            default:
+                throw new IllegalArgumentException("Unsupported Sample Selection Mode: " + sampleMode);
+        }
+
+        return sm;
+    }
+
+    private FrequencyModeSelector getFrequencyModeSelectorObject(AF_COMPUTATION_MODE freqMode, GenomeLocParser parser) {
+        FrequencyModeSelector fm;
+
+        switch (freqMode) {
+            case KEEP_AF_SPECTRUM:
+                fm = new KeepAFSpectrumFrequencySelector(numFrequencyBins, parser);
+                break;
+            case UNIFORM:
+                fm = new UniformSamplingFrequencySelector(parser);
+                break;
+            default: throw new IllegalArgumentException("Unexpected Frequency Selection Mode: " + freqMode);
+        }
+        return fm;
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
index 28f4f2a56a..74291e025b 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
@@ -1,9 +1,12 @@
 package org.broadinstitute.sting.gatk.walkers.varianteval;
 
 import net.sf.picard.reference.IndexedFastaSequenceFile;
+import net.sf.picard.util.IntervalTree;
 import net.sf.samtools.SAMSequenceRecord;
 import org.apache.log4j.Logger;
+import org.broad.tribble.Feature;
 import org.broadinstitute.sting.commandline.*;
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
 import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@@ -15,19 +18,24 @@
 import org.broadinstitute.sting.gatk.walkers.TreeReducible;
 import org.broadinstitute.sting.gatk.walkers.Window;
 import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator;
+import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.IntervalStratification;
 import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.JexlExpression;
 import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier;
 import org.broadinstitute.sting.gatk.walkers.varianteval.util.*;
 import org.broadinstitute.sting.gatk.walkers.variantrecalibration.Tranche;
 import org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator;
+import
org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.io.File; @@ -143,10 +151,10 @@ public class VariantEvalWalker extends RodWalker implements Tr /** * See the -list argument to view available modules. */ - @Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noE is specified)", required=false) + @Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noEV is specified)", required=false) protected String[] MODULES_TO_USE = {}; - @Argument(fullName="doNotUseAllStandardModules", shortName="noEV", doc="Do not use the standard modules by default (instead, only those that are specified with the -E option)", required=false) + @Argument(fullName="doNotUseAllStandardModules", shortName="noEV", doc="Do not use the standard modules by default (instead, only those that are specified with the -EV option)", required=false) protected Boolean NO_STANDARD_MODULES = false; // Other arguments @@ -156,13 +164,7 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="minPhaseQuality", shortName="mpq", doc="Minimum phasing quality", required=false) protected double MIN_PHASE_QUALITY = 10.0; - /** - * This argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined. - */ - @Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations", required=false) - protected String FAMILY_STRUCTURE; - - @Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false) + @Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation. Default is 50.", required=false) protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50; @Argument(fullName="ancestralAlignments", shortName="aa", doc="Fasta file with ancestral alleles", required=false) @@ -171,6 +173,26 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping", required=false) private boolean requireStrictAlleleMatch = false; + /** + * If true, VariantEval will treat -eval 1 -eval 2 as separate tracks from the same underlying + * variant set, and evaluate the union of the results. Useful when you want to do -eval chr1.vcf -eval chr2.vcf etc. 
+ */ + @Argument(fullName="mergeEvals", shortName="mergeEvals", doc="If provided, all -eval tracks will be merged into a single eval track", required=false) + public boolean mergeEvals = false; + + /** + * File containing tribble-readable features for the IntervalStratificiation + */ + @Input(fullName="stratIntervals", shortName="stratIntervals", doc="File containing tribble-readable features for the IntervalStratificiation", required=false) + public IntervalBinding intervalsFile = null; + + /** + * File containing tribble-readable features containing known CNVs. For use with VariantSummary table. + */ + @Input(fullName="knownCNVs", shortName="knownCNVs", doc="File containing tribble-readable features describing a known list of copy number variants", required=false) + public IntervalBinding knownCNVsFile = null; + Map> knownCNVsByContig = Collections.emptyMap(); + // Variables private Set jexlExpressions = new TreeSet(); @@ -224,13 +246,8 @@ public void initialize() { knowns.add(compRod); } - // Collect the eval rod names - Set evalNames = new TreeSet(); - for ( RodBinding evalRod : evals ) - evalNames.add(evalRod.getName()); - // Now that we have all the rods categorized, determine the sample list from the eval rods. - Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evalNames); + Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evals); Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); // Load the sample list @@ -252,12 +269,22 @@ public void initialize() { stratificationObjects = variantEvalUtils.initializeStratificationObjects(this, NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE); Set> evaluationObjects = variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE); for ( VariantStratifier vs : getStratificationObjects() ) { - if ( vs.getClass().getSimpleName().equals("Filter") ) + if ( vs.getName().equals("Filter") ) byFilterIsEnabled = true; - else if ( vs.getClass().getSimpleName().equals("Sample") ) + else if ( vs.getName().equals("Sample") ) perSampleIsEnabled = true; } + if ( intervalsFile != null ) { + boolean fail = true; + for ( final VariantStratifier vs : stratificationObjects ) { + if ( vs.getClass().equals(IntervalStratification.class) ) + fail = false; + } + if ( fail ) + throw new UserException.BadArgumentValue("ST", "stratIntervals argument provided but -ST IntervalStratification not provided"); + } + // Initialize the evaluation contexts evaluationContexts = variantEvalUtils.initializeEvaluationContexts(stratificationObjects, evaluationObjects, null, null); @@ -272,6 +299,28 @@ else if ( vs.getClass().getSimpleName().equals("Sample") ) throw new ReviewedStingException(String.format("The ancestral alignments file, '%s', could not be found", ancestralAlignmentsFile.getAbsolutePath())); } } + + + // initialize CNVs + if ( knownCNVsFile != null ) { + knownCNVsByContig = createIntervalTreeByContig(knownCNVsFile); + } + } + + public final Map> createIntervalTreeByContig(final IntervalBinding intervals) { + final Map> byContig = new HashMap>(); + + final List locs = intervals.getIntervals(getToolkit()); + + // set up the map from contig -> interval tree + for ( final String contig : getContigNames() ) + byContig.put(contig, new IntervalTree()); + + for ( final GenomeLoc loc : locs ) { + byContig.get(loc.getContig()).put(loc.getStart(), loc.getStop(), loc); + } + + return byContig; } /** @@ -289,16 +338,17 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, 
AlignmentCo String aastr = (ancestralAlignments == null) ? null : new String(ancestralAlignments.getSubsequenceAt(ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop()).getBases()); // --------- track --------- sample - VariantContexts - - HashMap, HashMap>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled); - HashMap, HashMap>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false); + HashMap, HashMap>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals); + HashMap, HashMap>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false, false); // for each eval track for ( final RodBinding evalRod : evals ) { - final HashMap> evalSet = evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : new HashMap>(0); + final Map> emptyEvalMap = Collections.emptyMap(); + final Map> evalSet = evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : emptyEvalMap; // for each sample stratifier for ( final String sampleName : sampleNamesForStratification ) { - Set evalSetBySample = evalSet.get(sampleName); + Collection evalSetBySample = evalSet.get(sampleName); if ( evalSetBySample == null ) { evalSetBySample = new HashSet(1); evalSetBySample.add(null); @@ -308,16 +358,14 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo for ( VariantContext eval : evalSetBySample ) { // deal with ancestral alleles if requested if ( eval != null && aastr != null ) { - HashMap newAts = new HashMap(eval.getAttributes()); - newAts.put("ANCESTRALALLELE", aastr); - eval = VariantContext.modifyAttributes(eval, newAts); + eval = new VariantContextBuilder(eval).attribute("ANCESTRALALLELE", aastr).make(); } // for each comp track for ( final RodBinding compRod : comps ) { // no sample stratification for comps - final HashMap> compSetHash = compVCs.get(compRod); - final Set compSet = (compSetHash == null || compSetHash.size() == 0) ? new HashSet(0) : compVCs.get(compRod).values().iterator().next(); + final HashMap> compSetHash = compVCs.get(compRod); + final Collection compSet = (compSetHash == null || compSetHash.size() == 0) ? Collections.emptyList() : compVCs.get(compRod).values().iterator().next(); // find the comp final VariantContext comp = findMatchingComp(eval, compSet); @@ -353,13 +401,15 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo } } } + + if ( mergeEvals ) break; // stop processing the eval tracks } } return null; } - private VariantContext findMatchingComp(final VariantContext eval, final Set comps) { + private VariantContext findMatchingComp(final VariantContext eval, final Collection comps) { // if no comps, return null if ( comps == null || comps.isEmpty() ) return null; @@ -424,20 +474,18 @@ public void onTraversalDone(Integer result) { TableType t = (TableType) field.get(ve); String subTableName = ve.getClass().getSimpleName() + "." 
+ field.getName(); - String subTableDesc = datamap.get(field).description(); + final DataPoint dataPointAnn = datamap.get(field); GATKReportTable table; if (!report.hasTable(subTableName)) { - report.addTable(subTableName, subTableDesc); + report.addTable(subTableName, dataPointAnn.description()); table = report.getTable(subTableName); table.addPrimaryKey("entry", false); table.addColumn(subTableName, subTableName); for ( VariantStratifier vs : stratificationObjects ) { - String columnName = vs.getClass().getSimpleName(); - - table.addColumn(columnName, "unknown"); + table.addColumn(vs.getName(), "unknown"); } table.addColumn("row", "unknown"); @@ -461,9 +509,8 @@ public void onTraversalDone(Integer result) { String r = (String) t.getRowKeys()[row]; for ( VariantStratifier vs : stratificationObjects ) { - String columnName = vs.getClass().getSimpleName(); - - table.set(stateKey.toString() + r, columnName, stateKey.get(vs.getClass().getSimpleName())); + final String columnName = vs.getName(); + table.set(stateKey.toString() + r, columnName, stateKey.get(columnName)); } for (int col = 0; col < t.getColumnKeys().length; col++) { @@ -484,9 +531,9 @@ public void onTraversalDone(Integer result) { GATKReportTable table = report.getTable(ve.getClass().getSimpleName()); for ( VariantStratifier vs : stratificationObjects ) { - String columnName = vs.getClass().getSimpleName(); + String columnName = vs.getName(); - table.set(stateKey.toString(), columnName, stateKey.get(vs.getClass().getSimpleName())); + table.set(stateKey.toString(), columnName, stateKey.get(vs.getName())); } table.set(stateKey.toString(), field.getName(), field.get(ve)); @@ -508,8 +555,6 @@ public void onTraversalDone(Integer result) { public double getMinPhaseQuality() { return MIN_PHASE_QUALITY; } - public String getFamilyStructure() { return FAMILY_STRUCTURE; } - public double getMendelianViolationQualThreshold() { return MENDELIAN_VIOLATION_QUAL_THRESHOLD; } public TreeSet getStratificationObjects() { return stratificationObjects; } @@ -536,4 +581,11 @@ public Set getContigNames() { return contigs; } + public GenomeLocParser getGenomeLocParser() { + return getToolkit().getGenomeLocParser(); + } + + public GenomeAnalysisEngine getToolkit() { + return super.getToolkit(); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompEvalGenotypes.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompEvalGenotypes.java deleted file mode 100755 index 925bff9c09..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompEvalGenotypes.java +++ /dev/null @@ -1,35 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.variantcontext.Genotype; - -class NewCompEvalGenotypes { - private GenomeLoc loc; - private Genotype compGt; - private Genotype evalGt; - - public NewCompEvalGenotypes(GenomeLoc loc, Genotype compGt, Genotype evalGt) { - this.loc = loc; - this.compGt = compGt; - this.evalGt = evalGt; - } - - public GenomeLoc getLocus() { - return loc; - } - - public Genotype getCompGenotpye() { - return compGt; - } - public Genotype getEvalGenotype() { - return evalGt; - } - - public void setCompGenotype(Genotype compGt) { - this.compGt = compGt; - } - - public void setEvalGenotype(Genotype evalGt) { - this.evalGt = evalGt; - } -} diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java index 9facb11b58..89d137ea98 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java @@ -28,13 +28,13 @@ public class CompOverlap extends VariantEvaluator implements StandardEval { @DataPoint(description = "number of eval sites at comp sites") long nVariantsAtComp = 0; - @DataPoint(description = "percentage of eval sites at comp sites") + @DataPoint(description = "percentage of eval sites at comp sites", format = "%.2f" ) double compRate = 0.0; @DataPoint(description = "number of concordant sites") long nConcordant = 0; - @DataPoint(description = "the concordance rate") + @DataPoint(description = "the concordance rate", format = "%.2f") double concordantRate = 0.0; public int getComparisonOrder() { @@ -72,7 +72,7 @@ public boolean discordantP(VariantContext eval, VariantContext comp) { } public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - boolean evalIsGood = eval != null && eval.isPolymorphic(); + boolean evalIsGood = eval != null && eval.isPolymorphicInSamples(); boolean compIsGood = comp != null && comp.isNotFiltered(); if (evalIsGood) nEvalVariants++; // count the number of eval events diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java index e834340370..e5e8dfaf56 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java @@ -41,6 +41,9 @@ public class CountVariants extends VariantEvaluator implements StandardEval { public long nDeletions = 0; @DataPoint(description = "Number of complex indels") public long nComplex = 0; + @DataPoint(description = "Number of symbolic events") + public long nSymbolic = 0; + @DataPoint(description = "Number of mixed loci (loci that can't be classified as a SNP, Indel or MNP)") public long nMixed = 0; @@ -59,17 +62,17 @@ public class CountVariants extends VariantEvaluator implements StandardEval { public long nHomDerived = 0; // calculations that get set in the finalizeEvaluation method - @DataPoint(description = "heterozygosity per locus rate") + @DataPoint(description = "heterozygosity per locus rate", format = "%.2e") public double heterozygosity = 0; - @DataPoint(description = "heterozygosity per base pair") + @DataPoint(description = "heterozygosity per base pair", format = "%.2f") public double heterozygosityPerBp = 0; - @DataPoint(description = "heterozygosity to homozygosity ratio") + @DataPoint(description = "heterozygosity to homozygosity ratio", format = "%.2f") public double hetHomRatio = 0; - @DataPoint(description = "indel rate (insertion count + deletion count)") + @DataPoint(description = "indel rate (insertion count + deletion count)", format = "%.2e") public double indelRate = 0; - @DataPoint(description = "indel rate per base pair") + @DataPoint(description = "indel rate per base pair", format = "%.2f") public double indelRatePerBp = 0; - @DataPoint(description = "deletion to insertion ratio") + 
@DataPoint(description = "deletion to insertion ratio", format = "%.2f") public double deletionInsertionRatio = 0; private double perLocusRate(long n) { @@ -100,7 +103,7 @@ public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceC // So in order to maintain consistency with the previous implementation (and the intention of the original author), I've // added in a proxy check for monomorphic status here. // Protect against case when vc only as no-calls too - can happen if we strafity by sample and sample as a single no-call. - if ( vc1.isMonomorphic() ) { + if ( vc1.isMonomorphicInSamples() ) { nRefLoci++; } else { switch (vc1.getType()) { @@ -131,8 +134,7 @@ else if (vc1.isSimpleDeletion()) nMixed++; break; case SYMBOLIC: - // ignore symbolic alleles, but don't fail - // todo - consistent way of treating symbolic alleles thgoughout codebase? + nSymbolic++; break; default: throw new ReviewedStingException("Unexpected VariantContext type " + vc1.getType()); @@ -155,8 +157,8 @@ else if (vc1.isSimpleDeletion()) // A C A // A C C - for (Genotype g : vc1.getGenotypes().values()) { - String altStr = vc1.getAlternateAlleles().size() > 0 ? vc1.getAlternateAllele(0).getBaseString().toUpperCase() : null; + for (final Genotype g : vc1.getGenotypes()) { + final String altStr = vc1.getAlternateAlleles().size() > 0 ? vc1.getAlternateAllele(0).getBaseString().toUpperCase() : null; switch (g.getType()) { case NO_CALL: @@ -180,6 +182,8 @@ else if (vc1.isSimpleDeletion()) nHomDerived++; } + break; + case MIXED: break; default: throw new ReviewedStingException("BUG: Unexpected genotype type: " + g); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java index bbd3f5f548..4f5aeed61e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java @@ -209,7 +209,7 @@ public void organizeHistogramTables() { //public GenotypeConcordance(VariantEvalWalker parent) { // super(parent); - // discordantInteresting = parent.DISCORDANT_INTERESTING; + // discordantInteresting = parent.DISCORDANT_INTERESTING; //} public String getName() { @@ -277,8 +277,9 @@ private String determineStats(final VariantContext eval, final VariantContext va // determine concordance for eval data if (eval != null) { - for (final String sample : eval.getGenotypes().keySet()) { - final Genotype.Type called = eval.getGenotype(sample).getType(); + for (final Genotype g : eval.getGenotypes() ) { + final String sample = g.getSampleName(); + final Genotype.Type called = g.getType(); final Genotype.Type truth; if (!validationIsValidVC || !validation.hasGenotype(sample)) { @@ -299,9 +300,9 @@ private String determineStats(final VariantContext eval, final VariantContext va else { final Genotype.Type called = Genotype.Type.NO_CALL; - for (final String sample : validation.getGenotypes().keySet()) { - final Genotype.Type truth = validation.getGenotype(sample).getType(); - detailedStats.incrValue(sample, truth, called); + for (final Genotype g : validation.getGenotypes()) { + final Genotype.Type truth = g.getType(); + detailedStats.incrValue(g.getSampleName(), truth, called); // print out interesting sites /* @@ -410,8 +411,8 @@ public Object[] getColumnKeys() { public SampleStats(VariantContext vc, int nGenotypeTypes) 
{ this.nGenotypeTypes = nGenotypeTypes; - for (String sample : vc.getGenotypes().keySet()) - concordanceStats.put(sample, new long[nGenotypeTypes][nGenotypeTypes]); + for (final Genotype g : vc.getGenotypes()) + concordanceStats.put(g.getSampleName(), new long[nGenotypeTypes][nGenotypeTypes]); } public SampleStats(int genotypeTypes) { @@ -444,39 +445,6 @@ public String getName() { } } -/** - * Sample stats, but for AC - */ -class ACStats extends SampleStats { - private String[] rowKeys; - - public ACStats(VariantContext evalvc, VariantContext compvc, int nGenotypeTypes) { - super(nGenotypeTypes); - rowKeys = new String[1+2*evalvc.getGenotypes().size()+1+2*compvc.getGenotypes().size()]; - for ( int i = 0; i <= 2*evalvc.getGenotypes().size(); i++ ) { // todo -- assuming ploidy 2 here... - concordanceStats.put(String.format("evalAC%d",i),new long[nGenotypeTypes][nGenotypeTypes]); - rowKeys[i] = String.format("evalAC%d",i); - - } - - for ( int i = 0; i <= 2*compvc.getGenotypes().size(); i++ ) { - concordanceStats.put(String.format("compAC%d",i), new long[nGenotypeTypes][nGenotypeTypes]); - rowKeys[1+2*evalvc.getGenotypes().size()+i] = String.format("compAC%d",i); - } - } - - public String getName() { - return "Allele Count Statistics"; - } - - public Object[] getRowKeys() { - if ( rowKeys == null ) { - throw new StingException("RowKeys is null!"); - } - return rowKeys; - } -} - /** * a table of sample names to genotype concordance summary statistics */ @@ -511,8 +479,8 @@ public Object[] getColumnKeys() { public SampleSummaryStats(final VariantContext vc) { concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]); - for( final String sample : vc.getGenotypes().keySet() ) { - concordanceSummary.put(sample, new double[COLUMN_KEYS.length]); + for( final Genotype g : vc.getGenotypes() ) { + concordanceSummary.put(g.getSampleName(), new double[COLUMN_KEYS.length]); } } @@ -636,79 +604,3 @@ public String getName() { } } -/** - * SampleSummaryStats .. 
but for allele counts - */ -class ACSummaryStats extends SampleSummaryStats { - private String[] rowKeys; - - public ACSummaryStats (final VariantContext evalvc, final VariantContext compvc) { - concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]); - rowKeys = new String[3+2*evalvc.getGenotypes().size() + 2*compvc.getGenotypes().size()]; - rowKeys[0] = ALL_SAMPLES_KEY; - for( int i = 0; i <= 2*evalvc.getGenotypes().size() ; i ++ ) { - concordanceSummary.put(String.format("evalAC%d",i), new double[COLUMN_KEYS.length]); - rowKeys[i+1] = String.format("evalAC%d",i); - } - for( int i = 0; i <= 2*compvc.getGenotypes().size() ; i ++ ) { - concordanceSummary.put(String.format("compAC%d",i), new double[COLUMN_KEYS.length]); - rowKeys[2+2*evalvc.getGenotypes().size()+i] = String.format("compAC%d",i); - } - - } - - public String getName() { - return "Allele Count Summary Statistics"; - } - - public Object[] getRowKeys() { - if ( rowKeys == null) { - throw new StingException("rowKeys is null!!"); - } - return rowKeys; - } -} - -class CompACNames implements Comparator{ - - final Logger myLogger; - private boolean info = true; - - public CompACNames(Logger l) { - myLogger = l; - } - - public boolean equals(Object o) { - return ( o.getClass() == CompACNames.class ); - } - - public int compare(Object o1, Object o2) { - if ( info ) { - myLogger.info("Sorting AC names"); - info = false; - } - //System.out.printf("Objects %s %s get ranks %d %d%n",o1.toString(),o2.toString(),getRank(o1),getRank(o2)); - return getRank(o1) - getRank(o2); - } - - public int getRank(Object o) { - if ( o.getClass() != String.class ) { - return Integer.MIN_VALUE/4; - } else { - String s = (String) o; - if ( s.startsWith("eval") ) { - return Integer.MIN_VALUE/4 + 1 + parseAC(s); - } else if ( s.startsWith("comp") ) { - return 1+ parseAC(s); - } else { - return Integer.MIN_VALUE/4; - } - } - } - - public int parseAC(String s) { - String[] g = s.split("AC"); - return Integer.parseInt(g[1]); - } -} - diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java index e69dbfb28e..ea12ada484 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java @@ -14,6 +14,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.HashMap; @@ -91,13 +92,13 @@ public String update2(VariantContext eval, VariantContext comp, RefMetaDataTrack Set allSamples = new HashSet(); - Map compSampGenotypes = null; + GenotypesContext compSampGenotypes = null; if (isRelevantToPhasing(comp)) { allSamples.addAll(comp.getSampleNames()); compSampGenotypes = comp.getGenotypes(); } - Map evalSampGenotypes = null; + GenotypesContext evalSampGenotypes = null; if (isRelevantToPhasing(eval)) { allSamples.addAll(eval.getSampleNames()); evalSampGenotypes = eval.getGenotypes(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java index ffe7c185fd..ccec9af126 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java @@ -91,7 +91,7 @@ private int len2index(int len) { public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( vc1.isIndel() && vc1.isPolymorphic() ) { + if ( vc1.isIndel() && vc1.isPolymorphicInSamples() ) { if ( ! vc1.isBiallelic() ) { //veWalker.getLogger().warn("[IndelLengthHistogram] Non-biallelic indel at "+ref.getLocus()+" ignored."); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java index f70e6c2de4..87b453ae38 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java @@ -8,11 +8,9 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType; import org.broadinstitute.sting.utils.IndelUtils; -import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; -import java.util.HashMap; /* * Copyright (c) 2010 The Broad Institute @@ -270,7 +268,7 @@ public String toString() { public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (eval != null && eval.isPolymorphic()) { + if (eval != null && eval.isPolymorphicInSamples()) { if ( indelStats == null ) { indelStats = new IndelStats(eval); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java index a0cc393d92..363f5665fe 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; +import org.broadinstitute.sting.gatk.samples.Sample; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -7,9 +9,11 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; import org.broadinstitute.sting.utils.MendelianViolation; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Map; +import java.util.Set; /** * Mendelian violation detection and counting @@ -40,12 +44,25 @@ @Analysis(name = "Mendelian Violation 
Evaluator", description = "Mendelian Violation Evaluator") public class MendelianViolationEvaluator extends VariantEvaluator { - @DataPoint(description = "Number of mendelian variants found") + @DataPoint(description = "Number of variants found with at least one family having genotypes") long nVariants; + @DataPoint(description = "Number of variants found with no family having genotypes -- these sites do not count in the nNoCall") + long nSkipped; + @DataPoint(description="Number of variants x families called (no missing genotype or lowqual)") + long nFamCalled; + @DataPoint(description="Number of variants x families called (no missing genotype or lowqual) that contain at least one var allele.") + long nVarFamCalled; + @DataPoint(description="Number of variants x families discarded as low quality") + long nLowQual; + @DataPoint(description="Number of variants x families discarded as no call") + long nNoCall; + @DataPoint(description="Number of loci with mendelian violations") + long nLociViolations; @DataPoint(description = "Number of mendelian violations found") long nViolations; - @DataPoint(description = "number of child hom ref calls where the parent was hom variant") + + /*@DataPoint(description = "number of child hom ref calls where the parent was hom variant") long KidHomRef_ParentHomVar; @DataPoint(description = "number of child het calls where the parent was hom ref") long KidHet_ParentsHomRef; @@ -53,11 +70,65 @@ public class MendelianViolationEvaluator extends VariantEvaluator { long KidHet_ParentsHomVar; @DataPoint(description = "number of child hom variant calls where the parent was hom ref") long KidHomVar_ParentHomRef; + */ + + @DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_REF -> HOM_VAR") + long mvRefRef_Var; + @DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_REF -> HET") + long mvRefRef_Het; + @DataPoint(description="Number of mendelian violations of the type HOM_REF/HET -> HOM_VAR") + long mvRefHet_Var; + @DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_VAR -> HOM_VAR") + long mvRefVar_Var; + @DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_VAR -> HOM_REF") + long mvRefVar_Ref; + @DataPoint(description="Number of mendelian violations of the type HOM_VAR/HET -> HOM_REF") + long mvVarHet_Ref; + @DataPoint(description="Number of mendelian violations of the type HOM_VAR/HOM_VAR -> HOM_REF") + long mvVarVar_Ref; + @DataPoint(description="Number of mendelian violations of the type HOM_VAR/HOM_VAR -> HET") + long mvVarVar_Het; + + + /*@DataPoint(description ="Number of inherited var alleles from het parents") + long nInheritedVar; + @DataPoint(description ="Number of inherited ref alleles from het parents") + long nInheritedRef;*/ + + @DataPoint(description="Number of HomRef/HomRef/HomRef trios") + long HomRefHomRef_HomRef; + @DataPoint(description="Number of Het/Het/Het trios") + long HetHet_Het; + @DataPoint(description="Number of Het/Het/HomRef trios") + long HetHet_HomRef; + @DataPoint(description="Number of Het/Het/HomVar trios") + long HetHet_HomVar; + @DataPoint(description="Number of HomVar/HomVar/HomVar trios") + long HomVarHomVar_HomVar; + @DataPoint(description="Number of HomRef/HomVar/Het trios") + long HomRefHomVAR_Het; + @DataPoint(description="Number of ref alleles inherited from het/het parents") + long HetHet_inheritedRef; + @DataPoint(description="Number of var alleles inherited from het/het parents") + long HetHet_inheritedVar; + 
@DataPoint(description="Number of ref alleles inherited from homRef/het parents") + long HomRefHet_inheritedRef; + @DataPoint(description="Number of var alleles inherited from homRef/het parents") + long HomRefHet_inheritedVar; + @DataPoint(description="Number of ref alleles inherited from homVar/het parents") + long HomVarHet_inheritedRef; + @DataPoint(description="Number of var alleles inherited from homVar/het parents") + long HomVarHet_inheritedVar; MendelianViolation mv; + PrintStream mvFile; + Map> families; public void initialize(VariantEvalWalker walker) { - mv = new MendelianViolation(walker.getFamilyStructure(), walker.getMendelianViolationQualThreshold()); + //Changed by Laurent Francioli - 2011-06-07 + //mv = new MendelianViolation(walker.getFamilyStructure(), walker.getMendelianViolationQualThreshold()); + mv = new MendelianViolation(walker.getMendelianViolationQualThreshold(),false); + families = walker.getSampleDB().getFamilies(); } public boolean enabled() { @@ -75,110 +146,48 @@ public int getComparisonOrder() { public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if (vc.isBiallelic() && vc.hasGenotypes()) { // todo -- currently limited to biallelic loci - if (mv.setAlleles(vc)) { - nVariants++; - - Genotype momG = vc.getGenotype(mv.getSampleMom()); - Genotype dadG = vc.getGenotype(mv.getSampleDad()); - Genotype childG = vc.getGenotype(mv.getSampleChild()); - - if (mv.isViolation()) { - nViolations++; - - String label; - if (childG.isHomRef() && (momG.isHomVar() || dadG.isHomVar())) { - label = "KidHomRef_ParentHomVar"; - KidHomRef_ParentHomVar++; - } else if (childG.isHet() && (momG.isHomRef() && dadG.isHomRef())) { - label = "KidHet_ParentsHomRef"; - KidHet_ParentsHomRef++; - } else if (childG.isHet() && (momG.isHomVar() && dadG.isHomVar())) { - label = "KidHet_ParentsHomVar"; - KidHet_ParentsHomVar++; - } else if (childG.isHomVar() && (momG.isHomRef() || dadG.isHomRef())) { - label = "KidHomVar_ParentHomRef"; - KidHomVar_ParentHomRef++; - } else { - throw new ReviewedStingException("BUG: unexpected child genotype class " + childG); - } - - return "MendelViolation=" + label; - } - } - } - - return null; // we don't capture any intersting sites - } - - -/* - private double getQThreshold() { - //return getVEWalker().MENDELIAN_VIOLATION_QUAL_THRESHOLD / 10; // we aren't 10x scaled in the GATK a la phred - return mendelianViolationQualThreshold / 10; // we aren't 10x scaled in the GATK a la phred - //return 0.0; - } - - TrioStructure trio; - double mendelianViolationQualThreshold; - - private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)"); - - public static class TrioStructure { - public String mom, dad, child; - } - - public static TrioStructure parseTrioDescription(String family) { - Matcher m = FAMILY_PATTERN.matcher(family); - if (m.matches()) { - TrioStructure trio = new TrioStructure(); - //System.out.printf("Found a family pattern: %s%n", parent.FAMILY_STRUCTURE); - trio.mom = m.group(1); - trio.dad = m.group(2); - trio.child = m.group(3); - return trio; - } else { - throw new IllegalArgumentException("Malformatted family structure string: " + family + " required format is mom+dad=child"); - } - } - public void initialize(VariantEvalWalker walker) { - trio = parseTrioDescription(walker.getFamilyStructure()); - mendelianViolationQualThreshold = walker.getMendelianViolationQualThreshold(); - } + if(mv.countViolations(families,vc)>0){ + nLociViolations++; + nViolations += 
mv.getViolationsCount(); + mvRefRef_Var += mv.getParentsRefRefChildVar(); + mvRefRef_Het += mv.getParentsRefRefChildHet(); + mvRefHet_Var += mv.getParentsRefHetChildVar(); + mvRefVar_Var += mv.getParentsRefVarChildVar(); + mvRefVar_Ref += mv.getParentsRefVarChildRef(); + mvVarHet_Ref += mv.getParentsVarHetChildRef(); + mvVarVar_Ref += mv.getParentsVarVarChildRef(); + mvVarVar_Het += mv.getParentsVarVarChildHet(); - private boolean includeGenotype(Genotype g) { - return g.getNegLog10PError() > getQThreshold() && g.isCalled(); - } - - public static boolean isViolation(VariantContext vc, Genotype momG, Genotype dadG, Genotype childG) { - return isViolation(vc, momG.getAlleles(), dadG.getAlleles(), childG.getAlleles()); - } + } + HomRefHomRef_HomRef += mv.getRefRefRef(); + HetHet_Het += mv.getHetHetHet(); + HetHet_HomRef += mv.getHetHetHomRef(); + HetHet_HomVar += mv.getHetHetHomVar(); + HomVarHomVar_HomVar += mv.getVarVarVar(); + HomRefHomVAR_Het += mv.getRefVarHet(); + HetHet_inheritedRef += mv.getParentsHetHetInheritedRef(); + HetHet_inheritedVar += mv.getParentsHetHetInheritedVar(); + HomRefHet_inheritedRef += mv.getParentsRefHetInheritedRef(); + HomRefHet_inheritedVar += mv.getParentsRefHetInheritedVar(); + HomVarHet_inheritedRef += mv.getParentsVarHetInheritedRef(); + HomVarHet_inheritedVar += mv.getParentsVarHetInheritedVar(); + + if(mv.getFamilyCalledCount()>0){ + nVariants++; + nFamCalled += mv.getFamilyCalledCount(); + nLowQual += mv.getFamilyLowQualsCount(); + nNoCall += mv.getFamilyNoCallCount(); + nVarFamCalled += mv.getVarFamilyCalledCount(); + } + else{ + nSkipped++; + } - public static boolean isViolation(VariantContext vc, TrioStructure trio ) { - return isViolation(vc, vc.getGenotype(trio.mom), vc.getGenotype(trio.dad), vc.getGenotype(trio.child) ); - } - public static boolean isViolation(VariantContext vc, List momA, List dadA, List childA) { - //VariantContext momVC = vc.subContextFromGenotypes(momG); - //VariantContext dadVC = vc.subContextFromGenotypes(dadG); - int i = 0; - Genotype childG = new Genotype("kidG", childA); - for (Allele momAllele : momA) { - for (Allele dadAllele : dadA) { - if (momAllele.isCalled() && dadAllele.isCalled()) { - Genotype possibleChild = new Genotype("possibleGenotype" + i, Arrays.asList(momAllele, dadAllele)); - if (childG.sameGenotype(possibleChild)) { - return false; - } - } - } + return null; } - return true; + return null; // we don't capture any interesting sites } - - -*/ - - } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java deleted file mode 100755 index 2d01632063..0000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java +++ /dev/null @@ -1,194 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; -import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.Degeneracy; -import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.Sample; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; -import 
org.broadinstitute.sting.gatk.walkers.varianteval.util.StateKey; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; - -import java.util.ArrayList; - -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * @author depristo - * @since Apr 11, 2010 - */ - -@Analysis(name = "Quality Metrics by allele count", description = "Shows various stats binned by allele count") -public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval { - // a mapping from quality score histogram bin to Ti/Tv ratio - @DataPoint(description = "TiTv by allele count") - MetricsByAc metrics = null; - - private final static Object[] METRIC_COLUMNS = {"AC", "nTi", "nTv", "n", "TiTv"}; - private int numSamples; - - class MetricsAtAC { - public int ac = -1, nTi = 0, nTv = 0; - - public MetricsAtAC(int ac) { this.ac = ac; } - - public void update(VariantContext eval) { - if ( VariantContextUtils.isTransition(eval) ) - nTi++; - else - nTv++; - } - - // corresponding to METRIC_COLUMNS - public String getColumn(int i) { - switch (i) { - case 0: return String.valueOf(ac); - case 1: return String.valueOf(nTi); - case 2: return String.valueOf(nTv); - case 3: return String.valueOf(nTi + nTv); - case 4: return String.valueOf(ratio(nTi, nTv)); - default: - throw new ReviewedStingException("Unexpected column " + i); - } - } - } - - class MetricsByAc implements TableType { - ArrayList metrics = new ArrayList(); - Object[] rows = null; - - public MetricsByAc( int nchromosomes ) { - rows = new Object[nchromosomes+1]; - metrics = new ArrayList(nchromosomes+1); - for ( int i = 0; i < nchromosomes + 1; i++ ) { - metrics.add(new MetricsAtAC(i)); - rows[i] = "ac" + i; - } - } - - public Object[] getRowKeys() { - return rows; - } - - public Object[] getColumnKeys() { - return METRIC_COLUMNS; - } - - public String getName() { - return "MetricsByAc"; - } - - public String getCell(int ac, int y) { - return metrics.get(ac).getColumn(y); - } - - public String toString() { - return ""; - } - - public void incrValue( VariantContext eval ) { - int ac = -1; - - if ( eval.hasGenotypes() ) - ac = eval.getChromosomeCount(eval.getAlternateAllele(0)); - else if ( eval.hasAttribute("AC") ) { - ac = 
eval.getAttributeAsInt("AC", -1); - } - - if ( ac != -1 ) { - metrics.get(ac).update(eval); - } - } - } - - public void initialize(VariantEvalWalker walker) { - numSamples = walker.getNumSamples(); - metrics = new MetricsByAc(2*numSamples); - } - - public String getName() { - return "SimpleMetricsByAC"; - } - - public int getComparisonOrder() { - return 1; // we only need to see each eval track - } - - public boolean enabled() { - return true; - } - - public String toString() { - return getName(); - } - - public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (numSamples == 0) { - return null; - } - - final String interesting = null; - - if (eval != null) { - if ( metrics == null ) { - int nSamples = numSamples; - - if ( nSamples != -1 ) { - metrics = new MetricsByAc(2 * nSamples); - } - } - - if ( eval.isSNP() && eval.isBiallelic() && eval.isPolymorphic() && metrics != null ) { - metrics.incrValue(eval); - } - } - - return interesting; // This module doesn't capture any interesting sites, so return null - } - - @Override - public boolean stateIsApplicable(StateKey stateKey) { - String sampleClassName = Sample.class.getSimpleName(); - String degeneracyClassName = Degeneracy.class.getSimpleName(); - - //return !(stateKey.containsKey(sampleClassName) && !stateKey.get(sampleClassName).equalsIgnoreCase("all")); - - if (stateKey.containsKey(sampleClassName) && !stateKey.get(sampleClassName).equalsIgnoreCase("all")) { - return false; - } - - if (stateKey.containsKey(degeneracyClassName) && !stateKey.get(degeneracyClassName).equalsIgnoreCase("all")) { - return false; - } - - return true; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java index e51623c3cb..bb7843361b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java @@ -37,7 +37,7 @@ public int getComparisonOrder() { } public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (vc == null || !vc.isSNP() || !vc.hasGenotypes() || vc.isMonomorphic()) { + if (vc == null || !vc.isSNP() || !vc.hasGenotypes() || vc.isMonomorphicInSamples()) { return null; //no interesting sites } @@ -48,7 +48,7 @@ public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceCo float numGenosHere = 0; int numIndsHere = 0; - for (Genotype genotype : vc.getGenotypes().values()) { + for (final Genotype genotype : vc.getGenotypes()) { numIndsHere++; if (!genotype.isNoCall()) { //increment stats for heterozygosity diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java index 9b6e145e6a..9de850d82a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java @@ -16,19 +16,19 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv long nTi = 0; @DataPoint(description = "number of transversion loci") long nTv = 0; - 
@DataPoint(description = "the transition to transversion ratio") + @DataPoint(description = "the transition to transversion ratio", format = "%.2f") double tiTvRatio = 0.0; @DataPoint(description = "number of comp transition sites") long nTiInComp = 0; @DataPoint(description = "number of comp transversion sites") long nTvInComp = 0; - @DataPoint(description = "the transition to transversion ratio for comp sites") + @DataPoint(description = "the transition to transversion ratio for comp sites", format = "%.2f") double TiTvRatioStandard = 0.0; @DataPoint(description = "number of derived transition loci") long nTiDerived = 0; @DataPoint(description = "number of derived transversion loci") long nTvDerived = 0; - @DataPoint(description = "the derived transition to transversion ratio") + @DataPoint(description = "the derived transition to transversion ratio", format = "%.2f") double tiTvDerivedRatio = 0.0; public boolean enabled() { @@ -40,7 +40,7 @@ public int getComparisonOrder() { } public void updateTiTv(VariantContext vc, boolean updateStandard) { - if (vc != null && vc.isSNP() && vc.isBiallelic() && vc.isPolymorphic()) { + if (vc != null && vc.isSNP() && vc.isBiallelic() && vc.isPolymorphicInSamples()) { if (VariantContextUtils.isTransition(vc)) { if (updateStandard) nTiInComp++; else nTi++; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java index 3b4967cad7..86d3467fb8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java @@ -11,7 +11,6 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Collection; -import java.util.Set; /** * The Broad Institute @@ -31,10 +30,10 @@ public class ValidationReport extends VariantEvaluator implements StandardEval { @DataPoint(description = "FN") int FN = 0; @DataPoint(description = "TN") int TN = 0; - @DataPoint(description = "Sensitivity") double sensitivity = 0; - @DataPoint(description = "Specificity") double specificity = 0; - @DataPoint(description = "PPV") double PPV = 0; - @DataPoint(description = "FDR") double FDR = 0; + @DataPoint(description = "Sensitivity", format = "%.2f") double sensitivity = 0; + @DataPoint(description = "Specificity", format = "%.2f") double specificity = 0; + @DataPoint(description = "PPV", format = "%.2f") double PPV = 0; + @DataPoint(description = "FDR", format = "%.2f") double FDR = 0; @DataPoint(description = "CompMonoEvalNoCall") int CompMonoEvalNoCall = 0; @DataPoint(description = "CompMonoEvalFiltered") int CompMonoEvalFiltered = 0; @@ -118,8 +117,8 @@ public String update2(VariantContext eval, VariantContext comp, RefMetaDataTrack public SiteStatus calcSiteStatus(VariantContext vc) { if ( vc == null ) return SiteStatus.NO_CALL; if ( vc.isFiltered() ) return SiteStatus.FILTERED; - if ( vc.isMonomorphic() ) return SiteStatus.MONO; - if ( vc.hasGenotypes() ) return SiteStatus.POLY; // must be polymorphic if isMonomorphic was false and there are genotypes + if ( vc.isMonomorphicInSamples() ) return SiteStatus.MONO; + if ( vc.hasGenotypes() ) return SiteStatus.POLY; // must be polymorphic if isMonomorphicInSamples was false and there are genotypes if ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) { int ac = 0; diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java index 2632279380..ce9e45c9b6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java @@ -232,14 +232,14 @@ public String toString() { public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { final String interesting = null; - if( eval != null && eval.isSNP() && eval.isBiallelic() && eval.isPolymorphic() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites) + if( eval != null && eval.isSNP() && eval.isBiallelic() && eval.isPolymorphicInSamples() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites) if( titvStats == null ) { titvStats = new TiTvStats(); } titvStats.incrValue(eval.getPhredScaledQual(), VariantContextUtils.isTransition(eval)); if( alleleCountStats == null ) { alleleCountStats = new AlleleCountStats(); } int alternateAlleleCount = 0; for (final Allele a : eval.getAlternateAlleles()) { - alternateAlleleCount += eval.getChromosomeCount(a); + alternateAlleleCount += eval.getCalledChrCount(a); } alleleCountStats.incrValue(eval.getPhredScaledQual(), alternateAlleleCount); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java new file mode 100644 index 0000000000..a271d3c35b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
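The format attribute being added to @DataPoint in these hunks (and declared later in this patch in DataPoint.java) is a printf-style string that VariantEvalUtils hands through to the report table's addColumn() call, so ratio-valued columns print with fixed precision instead of full double precision. An illustrative field inside any evaluator class (the name and value here are made up):

    // Rendered as e.g. "2.15" rather than "2.1538461538461537".
    @DataPoint(description = "het to hom ratio", format = "%.2f")
    double hetHomRatio = 0.0;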
+ */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; + +import net.sf.picard.util.IntervalTree; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.interval.IntervalUtils; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.util.*; + +@Analysis(description = "1000 Genomes Phase I summary of variants table") +public class VariantSummary extends VariantEvaluator implements StandardEval { + final protected static Logger logger = Logger.getLogger(VariantSummary.class); + + private final static int MAX_INDEL_LENGTH = 50; + private final static double MIN_CNV_OVERLAP = 0.5; + private VariantEvalWalker walker; + + public enum Type { + SNP, INDEL, CNV + } + + Map> knownCNVs = null; + + // basic counts on various rates found + @DataPoint(description = "Number of samples") + public long nSamples = 0; + + @DataPoint(description = "Number of processed loci") + public long nProcessedLoci = 0; + + @DataPoint(description = "Number of SNPs") + public long nSNPs = 0; + @DataPoint(description = "Overall TiTv ratio", format = "%.2f") + public double TiTvRatio = 0; + @DataPoint(description = "SNP Novelty Rate") + public String SNPNoveltyRate = "NA"; + @DataPoint(description = "Mean number of SNPs per individual") + public long nSNPsPerSample = 0; + @DataPoint(description = "Mean TiTv ratio per individual", format = "%.2f") + public double TiTvRatioPerSample = 0; + @DataPoint(description = "Mean depth of coverage per sample at SNPs", format = "%.1f") + public double SNPDPPerSample = 0; + + @DataPoint(description = "Number of Indels") + public long nIndels = 0; + @DataPoint(description = "Indel Novelty Rate") + public String IndelNoveltyRate = "NA"; + @DataPoint(description = "Mean number of Indels per individual") + public long nIndelsPerSample = 0; + @DataPoint(description = "Mean depth of coverage per sample at Indels", format = "%.1f") + public double IndelDPPerSample = 0; + + @DataPoint(description = "Number of SVs") + public long nSVs = 0; + @DataPoint(description = "SV Novelty Rate") + public String SVNoveltyRate = "NA"; + @DataPoint(description = "Mean number of SVs per individual") + public long nSVsPerSample = 0; + + TypeSampleMap allVariantCounts, knownVariantCounts; + TypeSampleMap countsPerSample; + TypeSampleMap transitionsPerSample, transversionsPerSample; + TypeSampleMap depthPerSample; + + private final static String ALL = "ALL"; + + private class TypeSampleMap extends EnumMap> { + public TypeSampleMap(final Collection samples) { + super(Type.class); + for ( Type type : Type.values() ) { + Map bySample = new HashMap(samples.size()); + for ( final String sample : samples ) { + bySample.put(sample, 0); + } + bySample.put(ALL, 0); + this.put(type, bySample); + } + } + + public final void inc(final 
Type type, final String sample) { + final int count = this.get(type).get(sample); + get(type).put(sample, count + 1); + } + + public final int all(Type type) { + return get(type).get(ALL); + } + + public final int meanValue(Type type) { + long sum = 0; + int n = 0; + for ( final Map.Entry pair : get(type).entrySet() ) { + if ( pair.getKey() != ALL) { + n++; + sum += pair.getValue(); + } + } + return (int)(Math.round(sum / (1.0 * n))); + } + + public final double ratioValue(Type type, TypeSampleMap denoms, boolean allP) { + double sum = 0; + int n = 0; + for ( final String sample : get(type).keySet() ) { + if ( (allP && sample == ALL) || (!allP && sample != ALL) ) { + final long num = get(type).get(sample); + final long denom = denoms.get(type).get(sample); + sum += ratio(num, denom); + n++; + } + } + + return n > 0 ? sum / (1.0 * n) : 0.0; + } + } + + + public void initialize(VariantEvalWalker walker) { + this.walker = walker; + + nSamples = walker.getSampleNamesForEvaluation().size(); + countsPerSample = new TypeSampleMap(walker.getSampleNamesForEvaluation()); + transitionsPerSample = new TypeSampleMap(walker.getSampleNamesForEvaluation()); + transversionsPerSample = new TypeSampleMap(walker.getSampleNamesForEvaluation()); + allVariantCounts = new TypeSampleMap(walker.getSampleNamesForEvaluation()); + knownVariantCounts = new TypeSampleMap(walker.getSampleNamesForEvaluation()); + depthPerSample = new TypeSampleMap(walker.getSampleNamesForEvaluation()); + + if ( walker.knownCNVsFile != null ) { + knownCNVs = walker.createIntervalTreeByContig(walker.knownCNVsFile); + final List locs = walker.knownCNVsFile.getIntervals(walker.getToolkit()); + logger.info(String.format("Creating known CNV list %s containing %d intervals covering %d bp", + walker.knownCNVsFile.getSource(), locs.size(), IntervalUtils.intervalSize(locs))); + } + } + + @Override public boolean enabled() { return true; } + + public int getComparisonOrder() { + return 2; // we only need to see each eval track + } + + public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + nProcessedLoci += context.getSkippedBases() + (ref == null ? 
0 : 1); } + + private final Type getType(VariantContext vc) { + switch (vc.getType()) { + case SNP: + return Type.SNP; + case INDEL: + for ( int l : vc.getIndelLengths() ) + if ( Math.abs(l) > MAX_INDEL_LENGTH ) + return Type.CNV; + return Type.INDEL; + case SYMBOLIC: + return Type.CNV; + default: + throw new UserException.BadInput("Unexpected variant context type: " + vc); + } + } + + private final boolean overlapsKnownCNV(VariantContext cnv) { + final GenomeLoc loc = walker.getGenomeLocParser().createGenomeLoc(cnv, true); + IntervalTree<GenomeLoc> intervalTree = knownCNVs.get(loc.getContig()); + + final Iterator<IntervalTree.Node<GenomeLoc>> nodeIt = intervalTree.overlappers(loc.getStart(), loc.getStop()); + while ( nodeIt.hasNext() ) { + final double overlapP = loc.reciprocialOverlapFraction(nodeIt.next().getValue()); + if ( overlapP > MIN_CNV_OVERLAP ) + return true; + } + + return false; + } + + public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( eval == null || eval.isMonomorphicInSamples() ) return null; + + final Type type = getType(eval); + + TypeSampleMap titvTable = null; + + // update DP, if possible + if ( eval.hasAttribute(VCFConstants.DEPTH_KEY) ) + depthPerSample.inc(type, ALL); + + // update counts + allVariantCounts.inc(type, ALL); + + // type specific calculations + if ( type == Type.SNP ) { + titvTable = VariantContextUtils.isTransition(eval) ? transitionsPerSample : transversionsPerSample; + titvTable.inc(type, ALL); + } + + // novelty calculation + if ( comp != null || (type == Type.CNV && overlapsKnownCNV(eval))) + knownVariantCounts.inc(type, ALL); + + // per sample metrics + for (final Genotype g : eval.getGenotypes()) { + if ( ! g.isNoCall() && ! g.isHomRef() ) { + countsPerSample.inc(type, g.getSampleName()); + + // update transition / transversion ratio + if ( titvTable != null ) titvTable.inc(type, g.getSampleName()); + + if ( g.hasAttribute(VCFConstants.DEPTH_KEY) ) + depthPerSample.inc(type, g.getSampleName()); + } + } + + return null; // we don't capture any interesting sites + } + + private final String noveltyRate(Type type) { + final int all = allVariantCounts.all(type); + final int known = knownVariantCounts.all(type); + final int novel = all - known; + final double rate = (novel / (1.0 * all)); + return all == 0 ? 
"NA" : String.format("%.2f", rate); + } + + public void finalizeEvaluation() { + nSNPs = allVariantCounts.all(Type.SNP); + nIndels = allVariantCounts.all(Type.INDEL); + nSVs = allVariantCounts.all(Type.CNV); + + TiTvRatio = transitionsPerSample.ratioValue(Type.SNP, transversionsPerSample, true); + TiTvRatioPerSample = transitionsPerSample.ratioValue(Type.SNP, transversionsPerSample, false); + + nSNPsPerSample = countsPerSample.meanValue(Type.SNP); + nIndelsPerSample = countsPerSample.meanValue(Type.INDEL); + nSVsPerSample = countsPerSample.meanValue(Type.CNV); + + SNPNoveltyRate = noveltyRate(Type.SNP); + IndelNoveltyRate = noveltyRate(Type.INDEL); + SVNoveltyRate = noveltyRate(Type.CNV); + + SNPDPPerSample = depthPerSample.meanValue(Type.SNP); + IndelDPPerSample = depthPerSample.meanValue(Type.INDEL); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java index c7bea93b28..2f342e1203 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java @@ -47,7 +47,7 @@ public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker t AC = eval.getAttributeAsInt("AC", 0); } else if ( eval.isVariant() ) { for (Allele allele : eval.getAlternateAlleles()) - AC = Math.max(AC, eval.getChromosomeCount(allele)); + AC = Math.max(AC, eval.getCalledChrCount(allele)); } else // by default, the site is considered monomorphic AC = 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java index e276adc324..b2b6d41653 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; /** @@ -15,8 +16,11 @@ public class EvalRod extends VariantStratifier implements RequiredStratification @Override public void initialize() { states = new ArrayList(); - for ( RodBinding rod : getVariantEvalWalker().getEvals() ) + for ( RodBinding rod : getVariantEvalWalker().getEvals() ) { states.add(rod.getName()); + if ( getVariantEvalWalker().mergeEvals ) + break; + } } public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java new file mode 100644 index 0000000000..d91422a7e2 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including 
without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; + +import net.sf.picard.util.IntervalTree; +import org.apache.log4j.Logger; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.interval.IntervalUtils; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.*; + +/** + * Stratifies the variants by whether they overlap an interval in the set provided on the command line. + * + * The primary use of this stratification is to provide a mechanism to divide assessment of a call set up + * by whether a variant overlaps an interval or not. I use this to differentiate between variants occurring + * in CCDS exons vs. those in non-coding regions, in the 1000G call set, using a command line that looks like: + * + * -T VariantEval -R human_g1k_v37.fasta -eval 1000G.vcf -stratIntervals:BED ccds.bed -ST IntervalStratification + * + * Note that the overlap algorithm properly handles symbolic alleles with an INFO field END value. In order to + * safely use this module you should provide entire contigs worth of variants, and let the interval strat decide + * overlap, as opposed to using -L which will not properly work with symbolic variants. + */ +public class IntervalStratification extends VariantStratifier { + final protected static Logger logger = Logger.getLogger(IntervalStratification.class); + Map<String, IntervalTree<GenomeLoc>> intervalTreeByContig = null; + + @Override + public void initialize() { + if ( getVariantEvalWalker().intervalsFile == null ) + throw new UserException.MissingArgument("stratIntervals", "Must be provided when IntervalStratification is enabled"); + + final List<GenomeLoc> locs = getVariantEvalWalker().intervalsFile.getIntervals(getVariantEvalWalker().getToolkit()); + + if ( locs.isEmpty() ) + throw new UserException.BadArgumentValue("stratIntervals", "Contains no intervals. 
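VariantSummary's TypeSampleMap above keeps one counter per (variant type, sample) plus an ALL pseudo-sample for site-level totals, and its per-sample means skip ALL; note that the original compares sample == ALL, which only works because ALL is a single interned constant. A sketch of the same bookkeeping using equals(), assuming nothing beyond the JDK:

    import java.util.Map;

    // Mean count over real samples, excluding the "ALL" pseudo-sample.
    static double meanOverSamples(final Map<String, Integer> bySample) {
        long sum = 0;
        int n = 0;
        for ( final Map.Entry<String, Integer> e : bySample.entrySet() ) {
            if ( !"ALL".equals(e.getKey()) ) {
                sum += e.getValue();
                n++;
            }
        }
        return n > 0 ? sum / (double) n : 0.0;
    }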
Perhaps the file is malformed or empty?"); + + intervalTreeByContig = getVariantEvalWalker().createIntervalTreeByContig(getVariantEvalWalker().intervalsFile); + + logger.info(String.format("Creating IntervalStratification %s containing %d intervals covering %d bp", + getVariantEvalWalker().intervalsFile.getSource(), locs.size(), IntervalUtils.intervalSize(locs))); + + states = new ArrayList(Arrays.asList("all", "overlaps.intervals", "outside.intervals")); + } + + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + final ArrayList relevantStates = new ArrayList(Arrays.asList("all")); + + if (eval != null) { + final GenomeLoc loc = getVariantEvalWalker().getGenomeLocParser().createGenomeLoc(eval, true); + IntervalTree intervalTree = intervalTreeByContig.get(loc.getContig()); + IntervalTree.Node node = intervalTree.minOverlapper(loc.getStart(), loc.getStop()); + //logger.info(String.format("Overlap %s found %s", loc, node)); + relevantStates.add( node != null ? "overlaps.intervals" : "outside.intervals"); + } + + return relevantStates; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java index 5cae2fb155..119a1b83f7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java @@ -9,10 +9,15 @@ import java.util.Arrays; import java.util.List; -public abstract class VariantStratifier implements Comparable { +public abstract class VariantStratifier implements Comparable { private VariantEvalWalker variantEvalWalker; + final private String name; protected ArrayList states = new ArrayList(); + protected VariantStratifier() { + name = this.getClass().getSimpleName(); + } + /** * @return a reference to the parent VariantEvalWalker running this stratification */ @@ -34,8 +39,12 @@ public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker t return null; } - public int compareTo(Object o1) { - return this.getClass().getSimpleName().compareTo(o1.getClass().getSimpleName()); + public int compareTo(VariantStratifier o1) { + return this.getName().compareTo(o1.getName()); + } + + public final String getName() { + return name; } public ArrayList getAllStates() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/DataPoint.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/DataPoint.java index 396843252d..90a6b97e0a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/DataPoint.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/DataPoint.java @@ -6,4 +6,5 @@ @Retention(RetentionPolicy.RUNTIME) public @interface DataPoint { String description() default ""; // the description, optional + String format() default ""; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java index 8112ae97f2..c34e445165 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java @@ -21,7 +21,7 @@ public String toString() { String value = ""; for ( VariantStratifier key : this.keySet() ) { - value += "\t" + key.getClass().getSimpleName() + ":" + this.get(key) + "\n"; + value += "\t" + key.getName() + ":" + this.get(key) + "\n"; } return value; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java index 2cccb0d358..96bd9a9b75 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java @@ -1,24 +1,23 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.util; +import java.util.Map; import java.util.TreeMap; public class StateKey extends TreeMap { - public int hashCode() { - int hashCode = 1; - - for (String key : this.keySet()) { - String value = this.get(key); - - hashCode *= key.hashCode() + value.hashCode(); - } - - return hashCode; - } +// public int hashCode() { +// int hashCode = 1; +// +// for (final Map.Entry pair : this.entrySet()) { +// hashCode *= pair.getKey().hashCode() + pair.getValue().hashCode(); +// } +// +// return hashCode; +// } public String toString() { String value = ""; - for ( String key : this.keySet() ) { + for ( final String key : this.keySet() ) { //value += "\tstate " + key + ":" + this.get(key) + "\n"; value += String.format("%s:%s;", key, this.get(key)); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index 6a057a456c..cb44ca5222 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -16,6 +16,7 @@ import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.lang.reflect.Field; @@ -195,7 +196,7 @@ public HashMap initializeEvaluationContexts(Set< for (VariantStratifier vs : ec.keySet()) { String state = ec.get(vs); - stateKey.put(vs.getClass().getSimpleName(), state); + stateKey.put(vs.getName(), state); } ec.addEvaluationClassList(variantEvalWalker, stateKey, evaluationObjects); @@ -229,7 +230,7 @@ public GATKReport initializeGATKReport(Set stratificationObje table.addColumn(tableName, tableName); for (VariantStratifier vs : stratificationObjects) { - String columnName = vs.getClass().getSimpleName(); + String columnName = vs.getName(); table.addColumn(columnName, "unknown"); } @@ -245,7 +246,7 @@ public GATKReport initializeGATKReport(Set stratificationObje field.setAccessible(true); if (!(field.get(vei) instanceof TableType)) { - table.addColumn(field.getName(), 0.0); + table.addColumn(field.getName(), 0.0, datamap.get(field).format()); } } } catch (InstantiationException e) { @@ -266,7 +267,7 @@ public GATKReport initializeGATKReport(Set stratificationObje * @return a new VariantContext with just the requested sample */ public VariantContext 
getSubsetOfVariantContext(VariantContext vc, String sampleName) { - return getSubsetOfVariantContext(vc, Arrays.asList(sampleName)); + return getSubsetOfVariantContext(vc, Collections.singleton(sampleName)); } /** @@ -276,24 +277,19 @@ public VariantContext getSubsetOfVariantContext(VariantContext vc, String sample * @param sampleNames the samples to pull out of the VariantContext * @return a new VariantContext with just the requested samples */ - public VariantContext getSubsetOfVariantContext(VariantContext vc, Collection sampleNames) { - VariantContext vcsub = vc.subContextFromGenotypes(vc.getGenotypes(sampleNames).values(), vc.getAlleles()); + public VariantContext getSubsetOfVariantContext(VariantContext vc, Set sampleNames) { + VariantContext vcsub = vc.subContextFromSamples(sampleNames, vc.getAlleles()); + VariantContextBuilder builder = new VariantContextBuilder(vcsub); - HashMap newAts = new HashMap(vcsub.getAttributes()); - - int originalAlleleCount = vc.getHetCount() + 2 * vc.getHomVarCount(); - int newAlleleCount = vcsub.getHetCount() + 2 * vcsub.getHomVarCount(); + final int originalAlleleCount = vc.getHetCount() + 2 * vc.getHomVarCount(); + final int newAlleleCount = vcsub.getHetCount() + 2 * vcsub.getHomVarCount(); if (originalAlleleCount == newAlleleCount && newAlleleCount == 1) { - newAts.put("ISSINGLETON", true); + builder.attribute("ISSINGLETON", true); } - VariantContextUtils.calculateChromosomeCounts(vcsub, newAts, true); - vcsub = VariantContext.modifyAttributes(vcsub, newAts); - - //VariantEvalWalker.logger.debug(String.format("VC %s subset to %s AC%n", vc.getSource(), vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY))); - - return vcsub; + VariantContextUtils.calculateChromosomeCounts(builder, true); + return builder.make(); } /** @@ -301,6 +297,7 @@ public VariantContext getSubsetOfVariantContext(VariantContext vc, Collection, HashMap>> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, List> tracks, boolean byFilter, boolean subsetBySample, boolean trackPerSample) { + public HashMap, HashMap>> + bindVariantContexts(RefMetaDataTracker tracker, + ReferenceContext ref, + List> tracks, + boolean byFilter, + boolean subsetBySample, + boolean trackPerSample, + boolean mergeTracks) { if ( tracker == null ) return null; - HashMap, HashMap>> bindings = new HashMap, HashMap>>(); + HashMap, HashMap>> bindings = new HashMap, HashMap>>(); + RodBinding firstTrack = tracks.isEmpty() ? 
null : tracks.get(0); for ( RodBinding track : tracks ) { - HashMap> mapping = new HashMap>(); + HashMap> mapping = new HashMap>(); for ( VariantContext vc : tracker.getValues(track, ref.getLocus()) ) { @@ -346,15 +351,28 @@ public HashMap, HashMap>> } } - bindings.put(track, mapping); + if ( mergeTracks && bindings.containsKey(firstTrack) ) { + // go through each binding of sample -> value and add all of the bindings from this entry + HashMap> firstMapping = bindings.get(firstTrack); + for ( Map.Entry> elt : mapping.entrySet() ) { + Collection firstMappingSet = firstMapping.get(elt.getKey()); + if ( firstMappingSet != null ) { + firstMappingSet.addAll(elt.getValue()); + } else { + firstMapping.put(elt.getKey(), elt.getValue()); + } + } + } else { + bindings.put(track, mapping); + } } return bindings; } - private void addMapping(HashMap> mappings, String sample, VariantContext vc) { + private void addMapping(HashMap> mappings, String sample, VariantContext vc) { if ( !mappings.containsKey(sample) ) - mappings.put(sample, new LinkedHashSet()); + mappings.put(sample, new ArrayList(1)); mappings.get(sample).add(vc); } @@ -393,7 +411,7 @@ public ArrayList initializeStateKeys(HashMap attrs = new HashMap(vc.getAttributes()); + final Double lod = (Double) lodMap.get( vc.getChr(), vc.getStart(), vc.getEnd() ); final String worstAnnotation = (String) annotationMap.get( vc.getChr(), vc.getStart(), vc.getEnd() ); if( lod == null ) { @@ -211,8 +213,8 @@ public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentC } // Annotate the new record with its VQSLOD and the worst performing annotation - attrs.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", lod)); - attrs.put(VariantRecalibrator.CULPRIT_KEY, worstAnnotation); + builder.attribute(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", lod)); + builder.attribute(VariantRecalibrator.CULPRIT_KEY, worstAnnotation); for( int i = tranches.size() - 1; i >= 0; i-- ) { final Tranche tranche = tranches.get(i); @@ -231,11 +233,10 @@ public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentC } if( !filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) { - final Set filters = new HashSet(); - filters.add(filterString); - vc = VariantContext.modifyFilters(vc, filters); + builder.filters(filterString); } - vcfWriter.add( VariantContext.modifyPErrorFiltersAndAttributes(vc, vc.getNegLog10PError(), vc.getFilters(), attrs) ); + + vcfWriter.add( builder.make() ); } else { // valid VC but not compatible with this mode, so just emit the variant untouched vcfWriter.add( vc ); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java index 3fa9c3883c..82776ca2e4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java @@ -52,6 +52,7 @@ public class GaussianMixtureModel { private final double[] empiricalMu; private final Matrix empiricalSigma; public boolean isModelReadyForEvaluation; + public boolean failedToConverge = false; public GaussianMixtureModel( final int numGaussians, final int numAnnotations, final double shrinkage, final double dirichletParameter, final double priorCounts ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index e04bfab768..a2782fe343 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.GenomeLoc; @@ -38,7 +37,6 @@ import java.io.PrintStream; import java.util.ArrayList; import java.util.Collections; -import java.util.HashMap; import java.util.List; /** @@ -284,7 +282,7 @@ public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) { return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() && ((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) && - (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic()); + (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphicInSamples()); } public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 520393898e..7cc5b16252 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -175,7 +175,7 @@ public class VariantRecalibrator extends RodWalker reduceSum ) engine.evaluateData( dataManager.getData(), goodModel, false ); // Generate the negative model using the worst performing data and evaluate each variant contrastively - final GaussianMixtureModel badModel = engine.generateModel( dataManager.selectWorstVariants( VRAC.PERCENT_BAD_VARIANTS, VRAC.MIN_NUM_BAD_VARIANTS ) ); + final ExpandingArrayList negativeTrainingData = dataManager.selectWorstVariants( VRAC.PERCENT_BAD_VARIANTS, VRAC.MIN_NUM_BAD_VARIANTS ); + GaussianMixtureModel badModel = engine.generateModel( negativeTrainingData ); engine.evaluateData( dataManager.getData(), badModel, true ); + + // Detect if the negative model failed to converge because of too few points and/or too many Gaussians and try again + while( badModel.failedToConverge && VRAC.MAX_GAUSSIANS > 4 ) { + logger.info("Negative model failed to converge. Retrying..."); + VRAC.MAX_GAUSSIANS--; + badModel = engine.generateModel( negativeTrainingData ); + engine.evaluateData( dataManager.getData(), goodModel, false ); + engine.evaluateData( dataManager.getData(), badModel, true ); + } + + if( badModel.failedToConverge || goodModel.failedToConverge ) { + throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. 
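The ApplyRecalibration hunk above shows the VariantContextBuilder idiom this patch adopts throughout: copy an existing context into a builder, set attributes and filters, then make() a fresh immutable context, replacing the old VariantContext.modifyAttributes/modifyFilters static helpers. A minimal sketch (the attribute keys are illustrative; the walker uses its VQS_LOD_KEY and CULPRIT_KEY constants):

    import org.broadinstitute.sting.utils.variantcontext.VariantContext;
    import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;

    static VariantContext annotateWithLod(final VariantContext vc, final double lod, final String culprit) {
        final VariantContextBuilder builder = new VariantContextBuilder(vc);
        builder.attribute("VQSLOD", String.format("%.4f", lod)); // illustrative key names
        builder.attribute("culprit", culprit);
        return builder.make(); // vc itself is left untouched
    }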
Please consider raising the number of variants used to train the negative model (via --percentBadVariants 0.05, for example) or lowering the maximum number of Gaussians to use in the model (via --maxGaussians 4, for example)"); + } + engine.calculateWorstPerformingAnnotation( dataManager.getData(), goodModel, badModel ); // Find the VQSLOD cutoff values which correspond to the various tranches of calls requested by the user diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java index adfb38a251..6d2ac643ba 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java @@ -67,14 +67,20 @@ public GaussianMixtureModel generateModel( final List data ) { public void evaluateData( final List data, final GaussianMixtureModel model, final boolean evaluateContrastively ) { if( !model.isModelReadyForEvaluation ) { - model.precomputeDenominatorForEvaluation(); + try { + model.precomputeDenominatorForEvaluation(); + } catch( Exception e ) { + model.failedToConverge = true; + return; + } } logger.info("Evaluating full set of " + data.size() + " variants..."); for( final VariantDatum datum : data ) { final double thisLod = evaluateDatum( datum, model ); if( Double.isNaN(thisLod) ) { - throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider raising the number of variants used to train the negative model (via --percentBadVariants 0.05, for example) or lowering the maximum number of Gaussians to use in the model (via --maxGaussians 4, for example)"); + model.failedToConverge = true; + return; } datum.lod = ( evaluateContrastively ? diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index ce03dfffe4..096085330d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.util.*; @@ -162,7 +163,7 @@ public class CombineVariants extends RodWalker { private boolean sitesOnlyVCF = false; public void initialize() { - Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), null); + Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit()); if ( PRIORITY_STRING == null ) { PRIORITY_STRING = Utils.join(",", vcfRods.keySet()); @@ -221,7 +222,7 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo for ( final VariantContext vc : vcs ) { vcfWriter.add(vc); } - + return vcs.isEmpty() ? 
0 : 1; } @@ -244,18 +245,17 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); } - for ( VariantContext mergedVC : mergedVCs ) { + for ( VariantContext mergedVC : mergedVCs ) { // only operate at the start of events if ( mergedVC == null ) continue; - HashMap attributes = new HashMap(mergedVC.getAttributes()); + final VariantContextBuilder builder = new VariantContextBuilder(mergedVC); // re-compute chromosome counts - VariantContextUtils.calculateChromosomeCounts(mergedVC, attributes, false); - VariantContext annotatedMergedVC = VariantContext.modifyAttributes(mergedVC, attributes); + VariantContextUtils.calculateChromosomeCounts(builder, false); if ( minimalVCF ) - annotatedMergedVC = VariantContextUtils.pruneVariantContext(annotatedMergedVC, Arrays.asList(SET_KEY)); - vcfWriter.add(annotatedMergedVC); + VariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY)); + vcfWriter.add(builder.make()); } return vcs.isEmpty() ? 0 : 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java index c9f330db57..edbfb557ab 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java @@ -38,9 +38,7 @@ import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @@ -160,7 +158,7 @@ private int writeLeftAlignedIndel(final VariantContext vc, final ReferenceContex // update if necessary and write if ( !newCigar.equals(originalCigar) && newCigar.numCigarElements() > 1 ) { int difference = originalIndex - newCigar.getCigarElement(0).getLength(); - VariantContext newVC = VariantContext.modifyLocation(vc, vc.getChr(), vc.getStart()-difference, vc.getEnd()-difference); + VariantContext newVC = new VariantContextBuilder(vc).start(vc.getStart()-difference).stop(vc.getEnd()-difference).make(); //System.out.println("Moving record from " + vc.getChr()+":"+vc.getStart() + " to " + vc.getChr()+":"+(vc.getStart()-difference)); int indelIndex = originalIndex-difference; @@ -210,18 +208,18 @@ public static VariantContext updateAllele(VariantContext vc, Allele newAllele, B } // create new Genotype objects - Map newGenotypes = new HashMap(vc.getNSamples()); - for ( Map.Entry genotype : vc.getGenotypes().entrySet() ) { + GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype genotype : vc.getGenotypes() ) { List newAlleles = new ArrayList(); - for ( Allele allele : genotype.getValue().getAlleles() ) { + for ( Allele allele : genotype.getAlleles() ) { Allele newA = alleleMap.get(allele); if ( newA == null ) newA = Allele.NO_CALL; newAlleles.add(newA); } - newGenotypes.put(genotype.getKey(), Genotype.modifyAlleles(genotype.getValue(), newAlleles)); + newGenotypes.add(Genotype.modifyAlleles(genotype, newAlleles)); } - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), alleleMap.values(), newGenotypes, 
vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, vc.getAttributes(), refBaseForIndel); + return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).referenceBaseForIndel(refBaseForIndel).make(); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java index a932d44ed2..50fafa2023 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java @@ -39,6 +39,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.io.File; @@ -117,16 +118,15 @@ private void convertAndWrite(VariantContext vc, ReferenceContext ref) { vc = VariantContextUtils.reverseComplement(vc); } - vc = VariantContext.modifyLocation(vc, toInterval.getSequence(), toInterval.getStart(), toInterval.getStart() + length); + vc = new VariantContextBuilder(vc).loc(toInterval.getSequence(), toInterval.getStart(), toInterval.getStart() + length).make(); if ( RECORD_ORIGINAL_LOCATION ) { - HashMap attrs = new HashMap(vc.getAttributes()); - attrs.put("OriginalChr", fromInterval.getSequence()); - attrs.put("OriginalStart", fromInterval.getStart()); - vc = VariantContext.modifyAttributes(vc, attrs); + vc = new VariantContextBuilder(vc) + .attribute("OriginalChr", fromInterval.getSequence()) + .attribute("OriginalStart", fromInterval.getStart()).make(); } - VariantContext newVC = VariantContext.createVariantContextWithPaddedAlleles(vc, false); + VariantContext newVC = VariantContextUtils.createVariantContextWithPaddedAlleles(vc, false); if ( originalVC.isSNP() && originalVC.isBiallelic() && VariantContextUtils.getSNPSubstitutionType(originalVC) != VariantContextUtils.getSNPSubstitutionType(newVC) ) { logger.warn(String.format("VCF at %s / %d => %s / %d is switching substitution type %s/%s to %s/%s", originalVC.getChr(), originalVC.getStart(), newVC.getChr(), newVC.getStart(), diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 609593acc7..6d94ffe6da 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -24,16 +24,16 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; -import org.apache.poi.hpsf.Variant; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.gatk.samples.Sample; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.MendelianViolation; -import 
org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.text.XReadLines; +import org.broadinstitute.sting.utils.variantcontext.*; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -41,9 +41,6 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.io.File; import java.io.FileNotFoundException; @@ -145,7 +142,7 @@ * -R ref.fasta \ * -T SelectVariants \ * --variant input.vcf \ - * -family NA12891+NA12892=NA12878 \ + * -bed family.ped \ * -mvq 50 \ * -o violations.vcf * @@ -185,7 +182,7 @@ * * */ -public class SelectVariants extends RodWalker<Integer, Integer> { +public class SelectVariants extends RodWalker<Integer, Integer> implements TreeReducible<Integer> { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); /** @@ -255,16 +252,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> { @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't update the AC, AF, or AN values in the INFO field after selecting", required=false) private boolean KEEP_ORIGINAL_CHR_COUNTS = false; - @Hidden - @Argument(fullName="family_structure_file", shortName="familyFile", doc="use -family unless you know what you're doing", required=false) - private File FAMILY_STRUCTURE_FILE = null; - - /** - * String formatted as dad+mom=child where these parameters determine which sample names are examined. - */ - @Argument(fullName="family_structure", shortName="family", doc="string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) - private String FAMILY_STRUCTURE = ""; - /** * This activates the mendelian violation module that will select all variants that correspond to a mendelian violation following the rules given by the family structure. */ @@ -275,8 +262,8 @@ public class SelectVariants extends RodWalker<Integer, Integer> { private double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0; /** - * Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so use it only for a reasonable - * number of variants. Use --select_random_fraction for larger numbers of variants. + * Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so make sure you supply the program with enough memory + * given your input set. This option will NOT work well for large callsets; use --select_random_fraction for sets with a large number of variants. 
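The caveat in that javadoc exists because guaranteeing exactly n uniformly random picks from a stream requires holding candidates in memory. For reference, classic reservoir sampling bounds that memory at n records; this is a textbook sketch, not what the walker implements (the walker buffers its picks and emits them at traversal end):

    import java.util.ArrayList;
    import java.util.List;
    import java.util.Random;

    // Keeps exactly n items, each retained with probability n/seen,
    // while storing only n records at any time.
    static <T> List<T> reservoir(final Iterable<T> stream, final int n, final Random rng) {
        final List<T> picks = new ArrayList<T>(n);
        int seen = 0;
        for ( final T item : stream ) {
            seen++;
            if ( picks.size() < n )
                picks.add(item);
            else {
                final int j = rng.nextInt(seen); // uniform in [0, seen)
                if ( j < n )
                    picks.set(j, item);
            }
        }
        return picks;
    }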
*/ @Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false) private int numRandom = 0; @@ -287,14 +274,25 @@ public class SelectVariants extends RodWalker { @Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false) private double fractionRandom = 0; + @Argument(fullName="remove_fraction_genotypes", shortName="fractionGenotypes", doc="Selects a fraction (a number between 0 and 1) of the total genotypes at random from the variant track and sets them to nocall", required=false) + private double fractionGenotypes = 0; + /** - * This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria. + * This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria. * When specified one or more times, a particular type of variant is selected. * - */ + */ @Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times", required=false) private List TYPES_TO_INCLUDE = new ArrayList(); + /** + * If provided, we will only include variants whose ID field is present in this list of ids. The matching + * is exact string matching. The file format is just one ID per line + * + */ + @Argument(fullName="keepIDs", shortName="IDs", doc="Only emit sites whose ID is found in this file (one ID per line)", required=false) + private File rsIDFile = null; + @Hidden @Argument(fullName="outMVFile", shortName="outMVFile", doc="", required=false) @@ -315,9 +313,9 @@ public void set (VariantContext vcP) { } public enum NumberAlleleRestriction { - ALL, - BIALLELIC, - MULTIALLELIC + ALL, + BIALLELIC, + MULTIALLELIC } private ArrayList selectedTypes = new ArrayList(); @@ -330,7 +328,7 @@ public enum NumberAlleleRestriction { private boolean DISCORDANCE_ONLY = false; private boolean CONCORDANCE_ONLY = false; - private Set mvSet = new HashSet(); + private MendelianViolation mv; /* variables used by the SELECT RANDOM modules */ @@ -341,14 +339,12 @@ public enum NumberAlleleRestriction { private int positionToAdd = 0; private RandomVariantStructure [] variantArray; - - /* Variables used for random selection with AF boosting */ - private ArrayList afBreakpoints = null; - private ArrayList afBoosts = null; - double bkDelta = 0.0; - private PrintStream outMVFileStream = null; + //Random number generator for the genotypes to remove + private Random randomGenotypes = new Random(); + + private Set IDsToKeep = null; /** * Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher @@ -385,8 +381,6 @@ public void initialize() { for ( String sample : samples ) logger.info("Including sample '" + sample + "'"); - - // if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include if (TYPES_TO_INCLUDE.isEmpty()) { @@ -426,29 +420,7 @@ public void initialize() { if (CONCORDANCE_ONLY) logger.info("Selecting only variants concordant with the track: " + concordanceTrack.getName()); if (MENDELIAN_VIOLATIONS) { - if ( FAMILY_STRUCTURE_FILE != null) { - try { - for ( final 
String line : new XReadLines( FAMILY_STRUCTURE_FILE ) ) { - MendelianViolation mv = new MendelianViolation(line, MENDELIAN_VIOLATION_QUAL_THRESHOLD); - if (samples.contains(mv.getSampleChild()) && samples.contains(mv.getSampleDad()) && samples.contains(mv.getSampleMom())) - mvSet.add(mv); - } - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(FAMILY_STRUCTURE_FILE, e); - } - if (outMVFile != null) - try { - outMVFileStream = new PrintStream(outMVFile); - } - catch (FileNotFoundException e) { - throw new UserException.CouldNotCreateOutputFile(outMVFile, "Can't open output file", e); } - } - else - mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD)); - } - else if (!FAMILY_STRUCTURE.isEmpty()) { - mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD)); - MENDELIAN_VIOLATIONS = true; + mv = new MendelianViolation(MENDELIAN_VIOLATION_QUAL_THRESHOLD,false,true); } SELECT_RANDOM_NUMBER = numRandom > 0; @@ -461,7 +433,18 @@ else if (!FAMILY_STRUCTURE.isEmpty()) { if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track"); - + /** load in the IDs file to a hashset for matching */ + if ( rsIDFile != null ) { + IDsToKeep = new HashSet(); + try { + for ( final String line : new XReadLines(rsIDFile).readLines() ) { + IDsToKeep.add(line.trim()); + } + logger.info("Selecting only variants with one of " + IDsToKeep.size() + " IDs from " + rsIDFile); + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(rsIDFile, e); + } + } } /** @@ -484,35 +467,38 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo } for (VariantContext vc : vcs) { - if (MENDELIAN_VIOLATIONS) { - boolean foundMV = false; - for (MendelianViolation mv : mvSet) { - if (mv.isViolation(vc)) { - foundMV = true; - //System.out.println(vc.toString()); - if (outMVFile != null) + if ( IDsToKeep != null && ! IDsToKeep.contains(vc.getID()) ) + continue; + + if (MENDELIAN_VIOLATIONS && mv.countViolations(this.getSampleDB().getFamilies(samples),vc) < 1) + break; + + if (outMVFile != null){ + for( String familyId : mv.getViolationFamilies()){ + for(Sample sample : this.getSampleDB().getFamily(familyId)){ + if(sample.getParents().size() > 0){ outMVFileStream.format("MV@%s:%d. 
REF=%s, ALT=%s, AC=%d, momID=%s, dadID=%s, childID=%s, momG=%s, momGL=%s, dadG=%s, dadGL=%s, " + - "childG=%s childGL=%s\n",vc.getChr(), vc.getStart(), - vc.getReference().getDisplayString(), vc.getAlternateAllele(0).getDisplayString(), vc.getChromosomeCount(vc.getAlternateAllele(0)), - mv.getSampleMom(), mv.getSampleDad(), mv.getSampleChild(), - vc.getGenotype(mv.getSampleMom()).toBriefString(), vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(), - vc.getGenotype(mv.getSampleDad()).toBriefString(), vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(), - vc.getGenotype(mv.getSampleChild()).toBriefString(),vc.getGenotype(mv.getSampleChild()).getLikelihoods().getAsString() ); + "childG=%s childGL=%s\n",vc.getChr(), vc.getStart(), + vc.getReference().getDisplayString(), vc.getAlternateAllele(0).getDisplayString(), vc.getCalledChrCount(vc.getAlternateAllele(0)), + sample.getMaternalID(), sample.getPaternalID(), sample.getID(), + vc.getGenotype(sample.getMaternalID()).toBriefString(), vc.getGenotype(sample.getMaternalID()).getLikelihoods().getAsString(), + vc.getGenotype(sample.getPaternalID()).toBriefString(), vc.getGenotype(sample.getPaternalID()).getLikelihoods().getAsString(), + vc.getGenotype(sample.getID()).toBriefString(),vc.getGenotype(sample.getID()).getLikelihoods().getAsString() ); + + } } } - - if (!foundMV) - break; } + if (DISCORDANCE_ONLY) { Collection compVCs = tracker.getValues(discordanceTrack, context.getLocation()); if (!isDiscordant(vc, compVCs)) - return 0; + continue; } if (CONCORDANCE_ONLY) { Collection compVCs = tracker.getValues(concordanceTrack, context.getLocation()); if (!isConcordant(vc, compVCs)) - return 0; + continue; } if (alleleRestriction.equals(NumberAlleleRestriction.BIALLELIC) && !vc.isBiallelic()) @@ -525,22 +511,23 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo continue; VariantContext sub = subsetRecord(vc, samples); - if ( (sub.isPolymorphic() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED) ) { + if ( (sub.isPolymorphicInSamples() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED) ) { + boolean failedJexlMatch = false; for ( VariantContextUtils.JexlVCMatchExp jexl : jexls ) { if ( !VariantContextUtils.match(sub, jexl) ) { - return 0; + failedJexlMatch = true; + break; } } - if (SELECT_RANDOM_NUMBER) { - randomlyAddVariant(++variantNumber, sub, ref.getBase()); - } - else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) { - vcfWriter.add(sub); + if ( !failedJexlMatch ) { + if (SELECT_RANDOM_NUMBER) { + randomlyAddVariant(++variantNumber, sub); + } + else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) { + vcfWriter.add(sub); + } } - - } - } return 1; @@ -561,8 +548,8 @@ private boolean isDiscordant (VariantContext vc, Collection comp return (compVCs == null || compVCs.isEmpty()); // check if we find it in the variant rod - Map genotypes = vc.getGenotypes(samples); - for (Genotype g : genotypes.values()) { + GenotypesContext genotypes = vc.getGenotypes(samples); + for (final Genotype g : genotypes) { if (sampleHasVariant(g)) { // There is a variant called (or filtered with not exclude filtered option set) that is not HomRef for at least one of the samples. 
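The keepIDs path above is deliberately simple: initialize() loads the file into a HashSet once, and map() drops any variant context whose ID field has no exact match in that set, so the per-site cost is O(1). A self-contained sketch of the same pattern, with illustrative class and file names that are not part of the walker:

    import java.io.BufferedReader;
    import java.io.FileReader;
    import java.io.IOException;
    import java.util.HashSet;
    import java.util.Set;

    public class IdFilterSketch {
        public static void main(String[] args) throws IOException {
            // one ID per line, trimmed, as the -IDs file is read in initialize()
            Set<String> idsToKeep = new HashSet<String>();
            BufferedReader in = new BufferedReader(new FileReader(args[0]));
            for (String line = in.readLine(); line != null; line = in.readLine())
                idsToKeep.add(line.trim());
            in.close();

            // a site survives only on an exact string match of its ID field
            for (String id : new String[]{ "rs6054257", "rs6040355", "." })
                System.out.println(id + " -> " + (idsToKeep.contains(id) ? "kept" : "skipped"));
        }
    }

Note that an unannotated site carries "." as its ID, so it is only kept if "." is listed in the file explicitly.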
if (compVCs == null) @@ -634,6 +621,11 @@ private boolean haveSameGenotypes(Genotype g1, Genotype g2) { @Override public Integer reduce(Integer value, Integer sum) { return value + sum; } + @Override + public Integer treeReduce(Integer lhs, Integer rhs) { + return lhs + rhs; + } + public void onTraversalDone(Integer result) { logger.info(result + " records processed."); @@ -659,19 +651,34 @@ private VariantContext subsetRecord(VariantContext vc, Set samples) { if ( samples == null || samples.isEmpty() ) return vc; - ArrayList genotypes = new ArrayList(); - for ( Map.Entry genotypePair : vc.getGenotypes().entrySet() ) { - if ( samples.contains(genotypePair.getKey()) ) - genotypes.add(genotypePair.getValue()); - } + final VariantContext sub = vc.subContextFromSamples(samples, vc.getAlleles()); + VariantContextBuilder builder = new VariantContextBuilder(sub); - VariantContext sub = vc.subContextFromGenotypes(genotypes, vc.getAlleles()); + GenotypesContext newGC = sub.getGenotypes(); // if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs (because they are no longer accurate) if ( vc.getAlleles().size() != sub.getAlleles().size() ) - sub = VariantContext.modifyGenotypes(sub, VariantContextUtils.stripPLs(vc.getGenotypes())); + newGC = VariantContextUtils.stripPLs(sub.getGenotypes()); + + //Remove a fraction of the genotypes if needed + if(fractionGenotypes>0){ + ArrayList genotypes = new ArrayList(); + for ( Genotype genotype : newGC ) { + //Set genotype to no call if it falls in the fraction. + if(fractionGenotypes>0 && randomGenotypes.nextDouble() < fractionGenotypes){ + List alleles = new ArrayList(2); + alleles.add(Allele.create((byte)'.')); + alleles.add(Allele.create((byte)'.')); + genotypes.add(new Genotype(genotype.getSampleName(),alleles, Genotype.NO_LOG10_PERROR,genotype.getFilters(),new HashMap(),false)); + } + else{ + genotypes.add(genotype); + } + } + newGC = GenotypesContext.create(genotypes); + } - HashMap attributes = new HashMap(sub.getAttributes()); + builder.genotypes(newGC); int depth = 0; for (String sample : sub.getSampleNames()) { @@ -688,24 +695,21 @@ private VariantContext subsetRecord(VariantContext vc, Set samples) { if (KEEP_ORIGINAL_CHR_COUNTS) { - if ( attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY) ) - attributes.put("AC_Orig",attributes.get(VCFConstants.ALLELE_COUNT_KEY)); - if ( attributes.containsKey(VCFConstants.ALLELE_FREQUENCY_KEY) ) - attributes.put("AF_Orig",attributes.get(VCFConstants.ALLELE_FREQUENCY_KEY)); - if ( attributes.containsKey(VCFConstants.ALLELE_NUMBER_KEY) ) - attributes.put("AN_Orig",attributes.get(VCFConstants.ALLELE_NUMBER_KEY)); - + if ( sub.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) + builder.attribute("AC_Orig", sub.getAttribute(VCFConstants.ALLELE_COUNT_KEY)); + if ( sub.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) + builder.attribute("AF_Orig", sub.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)); + if ( sub.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) + builder.attribute("AN_Orig", sub.getAttribute(VCFConstants.ALLELE_NUMBER_KEY)); } - VariantContextUtils.calculateChromosomeCounts(sub,attributes,false); - attributes.put("DP", depth); - - sub = VariantContext.modifyAttributes(sub, attributes); + VariantContextUtils.calculateChromosomeCounts(builder, false); + builder.attribute("DP", depth); - return sub; + return builder.make(); } - private void randomlyAddVariant(int rank, VariantContext vc, byte refBase) { + private void randomlyAddVariant(int rank, VariantContext vc) { if (nVariantsAdded
< numRandom) variantArray[nVariantsAdded++] = new RandomVariantStructure(vc); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java index 4e6cc722db..31aa8963b5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java @@ -36,6 +36,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.util.*; @@ -227,24 +228,24 @@ private VariantContext addVariantInformationToCall(VariantContext vContext) { numHomVarViolations++; isViolation = true; } - vContext = VariantContext.modifyFilters(vContext, filters); + + VariantContextBuilder builder = new VariantContextBuilder(vContext).filters(filters); numRecords++; // add the info fields - HashMap infoMap = new HashMap(); - infoMap.put("NoCallPct", String.format("%.1f", 100.0*noCallProp)); - infoMap.put("HomRefPct", String.format("%.1f", 100.0*homRefProp)); - infoMap.put("HomVarPct", String.format("%.1f", 100.0*homVarProp)); - infoMap.put("HetPct", String.format("%.1f", 100.0*hetProp)); - infoMap.put("HW", String.format("%.2f", hwScore)); + builder.attribute("NoCallPct", String.format("%.1f", 100.0 * noCallProp)); + builder.attribute("HomRefPct", String.format("%.1f", 100.0 * homRefProp)); + builder.attribute("HomVarPct", String.format("%.1f", 100.0 * homVarProp)); + builder.attribute("HetPct", String.format("%.1f", 100.0 * hetProp)); + builder.attribute("HW", String.format("%.2f", hwScore)); Collection altAlleles = vContext.getAlternateAlleles(); - int altAlleleCount = altAlleles.size() == 0 ? 0 : vContext.getChromosomeCount(altAlleles.iterator().next()); + int altAlleleCount = altAlleles.size() == 0 ? 0 : vContext.getCalledChrCount(altAlleles.iterator().next()); if ( !isViolation && altAlleleCount > 0 ) numTrueVariants++; - infoMap.put(VCFConstants.ALLELE_COUNT_KEY, String.format("%d", altAlleleCount)); - infoMap.put(VCFConstants.ALLELE_NUMBER_KEY, String.format("%d", vContext.getChromosomeCount())); + builder.attribute(VCFConstants.ALLELE_COUNT_KEY, String.format("%d", altAlleleCount)); + builder.attribute(VCFConstants.ALLELE_NUMBER_KEY, String.format("%d", vContext.getCalledChrCount())); - return VariantContext.modifyAttributes(vContext, infoMap); + return builder.make(); } private double hardyWeinbergCalculation(VariantContext vc) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index 4549096340..4b3aa4864b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -127,12 +127,12 @@ public class VariantsToTable extends RodWalker { * multi-allelic INFO field values can be lists of values. 
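The fractionGenotypes loop in subsetRecord above (just before the VariantValidationAssessor diff) masks each genotype independently: one uniform draw per genotype, and any draw below the fraction is rewritten as a diploid no-call. A stand-alone sketch of the sampling, with plain strings standing in for Genotype objects:

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;
    import java.util.Random;

    public class NoCallMaskSketch {
        public static void main(String[] args) {
            double fractionGenotypes = 0.25;        // mirrors -fractionGenotypes
            Random randomGenotypes = new Random();  // the walker keeps its own Random field
            List<String> input = Arrays.asList("0/0", "0/1", "1/1", "0/1");
            List<String> masked = new ArrayList<String>();
            for (String g : input)
                // each genotype is set to no-call independently of the others
                masked.add(randomGenotypes.nextDouble() < fractionGenotypes ? "./." : g);
            System.out.println(masked);
        }
    }

On average this leaves 100 * fractionGenotypes percent of genotypes as ./., but any single site can deviate, since there is no exact quota.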
*/ @Advanced - @Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false) - public boolean keepMultiAllelic = false; + @Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false) + public boolean keepMultiAllelic = false; @Hidden @Argument(fullName="logACSum", shortName="logACSum", doc="Log sum of AC instead of max value in case of multiallelic variants", required=false) - public boolean logACSum = false; + public boolean logACSum = false; /** * By default, this tool throws a UserException when it encounters a field without a value in some record. This @@ -315,7 +315,7 @@ public String get(VariantContext vc) { getters.put("FILTER", new Getter() { public String get(VariantContext vc) { return vc.isNotFiltered() ? "PASS" : Utils.join(",", vc.getFilters()); } }); - getters.put("ID", new Getter() { public String get(VariantContext vc) { return vc.hasID() ? vc.getID() : "."; } }); + getters.put("ID", new Getter() { public String get(VariantContext vc) { return vc.getID(); } }); getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } }); getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } }); getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } }); @@ -326,7 +326,7 @@ public String get(VariantContext vc) { getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } }); getters.put("GQ", new Getter() { public String get(VariantContext vc) { if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF"); - return String.format("%.2f", 10 * vc.getGenotype(0).getNegLog10PError()); + return String.format("%.2f", -10 * vc.getGenotype(0).getLog10PError()); }}); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java index 9b33f85374..f5928b7234 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java @@ -42,10 +42,7 @@ import org.broadinstitute.sting.utils.codecs.hapmap.RawHapMapFeature; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.File; import java.util.*; @@ -124,25 +121,22 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo Collection contexts = getVariantContexts(tracker, ref); for ( VariantContext vc : contexts ) { - Map attrs = new HashMap(vc.getAttributes()); - if ( rsID != null && !vc.hasID() ) { - attrs.put(VariantContext.ID_KEY, rsID); - vc = VariantContext.modifyAttributes(vc, attrs); + VariantContextBuilder builder = new VariantContextBuilder(vc); + if ( rsID != null && vc.emptyID() ) { + builder.id(rsID).make(); } // 
set the appropriate sample name if necessary if ( sampleName != null && vc.hasGenotypes() && vc.hasGenotype(variants.getName()) ) { Genotype g = Genotype.modifyName(vc.getGenotype(variants.getName()), sampleName); - Map genotypes = new HashMap(); - genotypes.put(sampleName, g); - vc = VariantContext.modifyGenotypes(vc, genotypes); + builder.genotypes(g); } if ( fixReferenceBase ) { - vc = VariantContext.modifyReferencePadding(vc, ref.getBase()); + builder.referenceBaseForIndel(ref.getBase()); } - writeRecord(vc, tracker, ref.getLocus()); + writeRecord(builder.make(), tracker, ref.getLocus()); } return 1; @@ -207,7 +201,7 @@ private VariantContext getDbsnp(String rsID) { while ( dbsnpIterator.hasNext() ) { GATKFeature feature = dbsnpIterator.next(); VariantContext vc = (VariantContext)feature.getUnderlyingObject(); - if ( vc.hasID() && vc.getID().equals(rsID) ) + if ( vc.getID().equals(rsID) ) return vc; } diff --git a/public/java/src/org/broadinstitute/sting/pipeline/PipelineProject.java b/public/java/src/org/broadinstitute/sting/pipeline/PipelineProject.java deleted file mode 100644 index 8d33047bfa..0000000000 --- a/public/java/src/org/broadinstitute/sting/pipeline/PipelineProject.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.pipeline; - -import java.io.File; -import java.util.Map; -import java.util.TreeMap; - -/** - * Java bean defining the project for a pipeline. 
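The VariantsToVCF and VariantValidationAssessor hunks above show the idiom this patch applies throughout: the old VariantContext.modifyX(...) helpers each produced a full immutable copy per change, while VariantContextBuilder accumulates edits and allocates once in make(). A toy stand-in of that shape (Site and its Builder are hypothetical types, not GATK classes):

    import java.util.Collections;
    import java.util.HashMap;
    import java.util.Map;

    final class Site {
        final String id;
        final Map<String, Object> attributes;
        Site(String id, Map<String, Object> attributes) {
            this.id = id;
            this.attributes = Collections.unmodifiableMap(new HashMap<String, Object>(attributes));
        }
        static final class Builder {
            private String id;
            private final Map<String, Object> attributes;
            Builder(Site source) {
                this.id = source.id;
                this.attributes = new HashMap<String, Object>(source.attributes);
            }
            Builder id(String id) { this.id = id; return this; }
            Builder attribute(String key, Object value) { this.attributes.put(key, value); return this; }
            Site make() { return new Site(id, attributes); }  // the only new Site allocated
        }
    }

    public class BuilderSketch {
        public static void main(String[] args) {
            Site vc = new Site(".", new HashMap<String, Object>());
            Site updated = new Site.Builder(vc).id("rs123").attribute("DP", 30).make();
            System.out.println(updated.id + " " + updated.attributes);
        }
    }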
- */ -public class PipelineProject { - private String name; - private File referenceFile; - private File intervalList; - private File genotypeDbsnp; - private File evalDbsnp; - private File refseqTable; - private Map tags = new TreeMap(); - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public File getIntervalList() { - return intervalList; - } - - public void setIntervalList(File intervalList) { - this.intervalList = intervalList; - } - - public File getReferenceFile() { - return referenceFile; - } - - public void setReferenceFile(File referenceFile) { - this.referenceFile = referenceFile; - } - - public File getGenotypeDbsnp() { - return genotypeDbsnp; - } - - public void setGenotypeDbsnp(File genotypeDbsnp) { - this.genotypeDbsnp = genotypeDbsnp; - } - - public String getGenotypeDbsnpType() { - return getDbsnpType(genotypeDbsnp); - } - - public File getEvalDbsnp() { - return evalDbsnp; - } - - public void setEvalDbsnp(File evalDbsnp) { - this.evalDbsnp = evalDbsnp; - } - - public String getEvalDbsnpType() { - return getDbsnpType(evalDbsnp); - } - - public File getRefseqTable() { - return refseqTable; - } - - public void setRefseqTable(File refseqTable) { - this.refseqTable = refseqTable; - } - - public Map getTags() { - return tags; - } - - public void setTags(Map tags) { - this.tags = tags; - } - - private String getDbsnpType(File file) { - if (file == null) - return null; - else if (file.getName().toLowerCase().endsWith(".vcf")) - return "vcf"; - else - return "dbsnp"; - } -} diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java index cb5bad4aed..cdfc329e81 100644 --- a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java +++ b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java @@ -83,10 +83,10 @@ else if(getShortFieldGetter().equals(getFieldName())) getShortFieldSetter()); } - protected static final String REQUIRED_TEMPLATE = " + \" %1$s \" + %2$s.format(%3$s)"; - protected static final String REPEAT_TEMPLATE = " + repeat(\" %1$s \", %3$s, format=formatValue(%2$s))"; - protected static final String OPTIONAL_TEMPLATE = " + optional(\" %1$s \", %3$s, format=formatValue(%2$s))"; - protected static final String FLAG_TEMPLATE = " + (if (%3$s) \" %1$s\" else \"\")"; + protected static final String REQUIRED_TEMPLATE = " + required(\"%1$s\", %3$s, spaceSeparated=true, escape=true, format=%2$s)"; + protected static final String REPEAT_TEMPLATE = " + repeat(\"%1$s\", %3$s, spaceSeparated=true, escape=true, format=%2$s)"; + protected static final String OPTIONAL_TEMPLATE = " + optional(\"%1$s\", %3$s, spaceSeparated=true, escape=true, format=%2$s)"; + protected static final String FLAG_TEMPLATE = " + conditional(%3$s, \"%1$s\", escape=true, format=%2$s)"; public final String getCommandLineAddition() { return String.format(getCommandLineTemplate(), getCommandLineParam(), getCommandLineFormat(), getFieldName()); @@ -136,7 +136,7 @@ private static List getArgumentFields(ArgumentDefinitio new IntervalStringArgumentField(argumentDefinition)); // ROD Bindings are set by the RodBindField - } else if (RodBindField.ROD_BIND_FIELD.equals(argumentDefinition.fullName) && argumentDefinition.ioType == ArgumentIOType.INPUT) { + } else if (RodBindArgumentField.ROD_BIND_FIELD.equals(argumentDefinition.fullName) && 
argumentDefinition.ioType == ArgumentIOType.INPUT) { // TODO: Once everyone is using @Allows and @Requires correctly, we can stop blindly allowing Triplets return Arrays.asList(new RodBindArgumentField(argumentDefinition), new InputIndexesArgumentField(argumentDefinition, Tribble.STANDARD_INDEX_EXTENSION)); //return Collections.emptyList(); @@ -337,6 +337,8 @@ public DefaultArgumentField(ArgumentDefinition argumentDefinition, boolean useFo // Allows the user to specify the track name, track type, and the file. public static class RodBindArgumentField extends ArgumentDefinitionField { + public static final String ROD_BIND_FIELD = "rodBind"; + public RodBindArgumentField(ArgumentDefinition argumentDefinition) { super(argumentDefinition); } @@ -344,7 +346,7 @@ public RodBindArgumentField(ArgumentDefinition argumentDefinition) { @Override protected String getFieldType() { return "List[RodBind]"; } @Override protected String getDefaultValue() { return "Nil"; } @Override protected String getCommandLineTemplate() { - return " + repeat(\"\", %3$s, format=RodBind.formatCommandLine(\"%1$s\"))"; + return " + repeat(\"%1$s\", %3$s, formatPrefix=RodBind.formatCommandLineParameter, spaceSeparated=true, escape=true, format=%2$s)"; } } @@ -358,11 +360,11 @@ public InputTaggedFileDefinitionField(ArgumentDefinition argumentDefinition) { @Override protected String getDefaultValue() { return argumentDefinition.isMultiValued ? "Nil" : "_"; } @Override protected String getCommandLineTemplate() { if (argumentDefinition.isMultiValued) { - return " + repeat(\"\", %3$s, format=TaggedFile.formatCommandLine(\"%1$s\"))"; + return " + repeat(\"%1$s\", %3$s, formatPrefix=TaggedFile.formatCommandLineParameter, spaceSeparated=true, escape=true, format=%2$s)"; } else if (!argumentDefinition.required) { - return " + optional(\"\", %3$s, format=TaggedFile.formatCommandLine(\"%1$s\"))"; + return " + optional(TaggedFile.formatCommandLineParameter(\"%1$s\", %3$s), %3$s, spaceSeparated=true, escape=true, format=%2$s)"; } else { - return " + TaggedFile.formatCommandLine(\"%1$s\")(\"\", %3$s, \"\")"; + return " + required(TaggedFile.formatCommandLineParameter(\"%1$s\", %3$s), %3$s, spaceSeparated=true, escape=true, format=%2$s)"; } } } diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java index 9578eda84a..9c40fb976a 100644 --- a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java +++ b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java @@ -147,7 +147,7 @@ protected int execute() { String clpConstructor = String.format("analysisName = \"%s\"%njavaMainClass = \"%s\"%n", clpClassName, clp.getName()); writeClass("org.broadinstitute.sting.queue.function.JavaCommandLineFunction", clpClassName, - false, clpConstructor, ArgumentDefinitionField.getArgumentFields(parser,clp), dependents); + false, clpConstructor, ArgumentDefinitionField.getArgumentFields(parser,clp), dependents, false); if (clp == CommandLineGATK.class) { for (Entry>> walkersByPackage: walkerManager.getWalkerNamesByPackage(false).entrySet()) { @@ -169,7 +169,7 @@ protected int execute() { } writeClass(GATK_EXTENSIONS_PACKAGE_NAME + "." 
+ clpClassName, walkerName, - isScatter, constructor, argumentFields, dependents); + isScatter, constructor, argumentFields, dependents, true); } catch (Exception e) { throw new ReviewedStingException("Error generating wrappers for walker " + walkerType, e); } @@ -241,8 +241,9 @@ private String getScatterClass(Class walkerType) { * @throws IOException If the file cannot be written. */ private void writeClass(String baseClass, String className, boolean isScatter, - String constructor, List argumentFields, Set> dependents) throws IOException { - String content = getContent(CLASS_TEMPLATE, baseClass, className, constructor, isScatter, "", argumentFields, dependents); + String constructor, List argumentFields, + Set> dependents, boolean isGATKWalker) throws IOException { + String content = getContent(CLASS_TEMPLATE, baseClass, className, constructor, isScatter, "", argumentFields, dependents, isGATKWalker); writeFile(GATK_EXTENSIONS_PACKAGE_NAME + "." + className, content); } @@ -256,7 +257,7 @@ private void writeClass(String baseClass, String className, boolean isScatter, */ private void writeFilter(String className, List argumentFields, Set> dependents) throws IOException { String content = getContent(TRAIT_TEMPLATE, "org.broadinstitute.sting.queue.function.CommandLineFunction", - className, "", false, String.format(" + \" -read_filter %s\"", className), argumentFields, dependents); + className, "", false, String.format(" + \" -read_filter %s\"", className), argumentFields, dependents, false); writeFile(GATK_EXTENSIONS_PACKAGE_NAME + "." + className, content); } @@ -350,7 +351,8 @@ private void writeFile(String fullClassName, String content) throws IOException */ private static String getContent(String scalaTemplate, String baseClass, String className, String constructor, boolean isScatter, String commandLinePrefix, - List argumentFields, Set> dependents) { + List argumentFields, Set> dependents, + boolean isGATKWalker) { StringBuilder arguments = new StringBuilder(); StringBuilder commandLine = new StringBuilder(commandLinePrefix); @@ -374,6 +376,9 @@ private static String getContent(String scalaTemplate, String baseClass, String if (isGather) importSet.add("import org.broadinstitute.sting.commandline.Gather"); + // Needed for ShellUtils.escapeShellArgument() + importSet.add("import org.broadinstitute.sting.queue.util.ShellUtils"); + // Sort the imports so that the are always in the same order. 
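Each generated wrapper's commandLine is assembled by String.format over the ArgumentDefinitionField templates above, where %1$s is the command-line parameter, %2$s the format string, and %3$s the field name; the rewritten templates hand quoting to Queue's required/optional/repeat/conditional helpers with escape=true instead of pasting raw strings. A runnable sketch of what one expansion emits (the argument name, formatter, and field below are made-up examples):

    public class TemplateSketch {
        // copies of two of the new templates from ArgumentDefinitionField
        static final String REQUIRED_TEMPLATE =
                " + required(\"%1$s\", %3$s, spaceSeparated=true, escape=true, format=%2$s)";
        static final String FLAG_TEMPLATE =
                " + conditional(%3$s, \"%1$s\", escape=true, format=%2$s)";

        public static void main(String[] args) {
            // prints the Scala fragment appended to a generated wrapper's commandLine
            System.out.println(String.format(REQUIRED_TEMPLATE, "-I", "\"%s\"", "input_file"));
            System.out.println(String.format(FLAG_TEMPLATE, "-keep_program_records", "\"%s\"", "keep_program_records"));
        }
    }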
List sortedImports = new ArrayList(importSet); Collections.sort(sortedImports); @@ -381,8 +386,10 @@ private static String getContent(String scalaTemplate, String baseClass, String StringBuffer freezeFieldOverride = new StringBuffer(); for (String freezeField: freezeFields) freezeFieldOverride.append(freezeField); - if (freezeFieldOverride.length() > 0) { + if (freezeFieldOverride.length() > 0 || isGATKWalker) { freezeFieldOverride.insert(0, String.format("override def freezeFieldValues = {%nsuper.freezeFieldValues%n")); + if ( isGATKWalker ) + freezeFieldOverride.append(String.format("if ( num_threads.isDefined ) nCoresRequest = num_threads%n")); freezeFieldOverride.append(String.format("}%n%n")); } diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/RodBindField.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/RodBindField.java deleted file mode 100644 index baf0835756..0000000000 --- a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/RodBindField.java +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.queue.extensions.gatk; - -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.gatk.WalkerManager; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; -import org.broadinstitute.sting.gatk.walkers.RMD; -import org.broadinstitute.sting.gatk.walkers.Walker; - -import java.io.File; -import java.lang.annotation.Annotation; -import java.util.ArrayList; -import java.util.List; - -/** - * Allows user to specify the rod file but locks in the track name and the track type. 
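The isGATKWalker flag threaded through writeClass and getContent above exists for one generated override: every GATK walker wrapper now gets a freezeFieldValues body that copies num_threads into nCoresRequest, so a run launched with -nt reserves a matching number of cores from the scheduler. A sketch of just that string assembly, lifted from the getContent hunk:

    public class FreezeFieldSketch {
        public static void main(String[] args) {
            boolean isGATKWalker = true;              // true for generated walker wrappers
            StringBuffer freezeFieldOverride = new StringBuffer();
            if (freezeFieldOverride.length() > 0 || isGATKWalker) {
                freezeFieldOverride.insert(0, String.format("override def freezeFieldValues = {%nsuper.freezeFieldValues%n"));
                if (isGATKWalker)
                    freezeFieldOverride.append(String.format("if ( num_threads.isDefined ) nCoresRequest = num_threads%n"));
                freezeFieldOverride.append(String.format("}%n%n"));
            }
            System.out.print(freezeFieldOverride);    // the Scala override written into the wrapper
        }
    }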
- */ -public class RodBindField extends ArgumentField { - public static final String ROD_BIND_FIELD = "rodBind"; - - private final String trackName; - private final String typeName; - private final List relatedFields; - private final boolean isRequired; - - public RodBindField(String trackName, String typeName, List relatedFields, boolean isRequired) { - this.trackName = trackName; - this.typeName = typeName; - this.relatedFields = relatedFields; - this.isRequired = isRequired; - } - - @SuppressWarnings("unchecked") - @Override protected Class getAnnotationIOClass() { return Input.class; } - @Override protected Class getInnerType() { return File.class; } - @Override protected String getFullName() { return escape(getRawFieldName()); } - @Override protected String getFieldType() { return "File"; } - @Override protected String getDefaultValue() { return "_"; } - @Override protected String getRawFieldName() { return this.trackName + this.typeName; } - @Override protected String getDoc() { return escape(this.typeName + " " + this.trackName); } - @Override protected boolean isRequired() { return this.isRequired; } - - @Override public String getCommandLineAddition() { - // TODO: Stop allowing the generic "rodBind" triplets to satisfy the requirement after @Requires are fixed. - return String.format(" + optional(\" -B:%s,%s \", %s)", - /* - return String.format(this.useOption() - ? " + optional(\" -B:%s,%s \", %s)" - : " + \" -B:%s,%s \" + %s", - */ - this.trackName, this.typeName, getFieldName()); - } - - private boolean useOption() { - return !this.isRequired || (relatedFields.size() > 1); - } - - @Override protected String getExclusiveOf() { - StringBuilder exclusiveOf = new StringBuilder(); - // TODO: Stop allowing the generic "rodBind" triplets to satisfy the requirement after @Requires are fixed. 
- if (this.isRequired) - exclusiveOf.append(ROD_BIND_FIELD); - for (RodBindField relatedField: relatedFields) - if (relatedField != this) { - if (exclusiveOf.length() > 0) - exclusiveOf.append(","); - exclusiveOf.append(relatedField.getFieldName()); - } - return exclusiveOf.toString(); - } -// -// public static List getRodArguments(Class walkerClass, RMDTrackBuilder trackBuilder) { -// List argumentFields = new ArrayList(); -// -// List requires = WalkerManager.getRequiredMetaData(walkerClass); -// List allows = WalkerManager.getAllowsMetaData(walkerClass); -// -// for (RMD required: requires) { -// List fields = new ArrayList(); -// String trackName = required.name(); -// if ("*".equals(trackName)) { -// // TODO: Add the field triplet for name=* after @Allows and @Requires are fixed on walkers -// //fields.add(new RodBindArgumentField(argumentDefinition, true)); -// } else { -// for (String typeName: trackBuilder.getFeatureManager().getTrackRecordTypeNames(required.type())) -// fields.add(new RodBindField(trackName, typeName, fields, true)); -// } -// argumentFields.addAll(fields); -// } -// -// for (RMD allowed: allows) { -// List fields = new ArrayList(); -// String trackName = allowed.name(); -// if ("*".equals(trackName)) { -// // TODO: Add the field triplet for name=* after @Allows and @Requires are fixed on walkers -// //fields.add(new RodBindArgumentField(argumentDefinition, false)); -// } else { -// for (String typeName: trackBuilder.getFeatureManager().getTrackRecordTypeNames(allowed.type())) -// fields.add(new RodBindField(trackName, typeName, fields, true)); -// } -// argumentFields.addAll(fields); -// } -// -// return argumentFields; -// } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index c1479bc691..345161416b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -440,4 +440,29 @@ public long size() { return stop - start + 1; } + /** + * reciprocialOverlap: what is the min. 
percent of gl1 and gl2 covered by both + * + * gl1.s ---------- gl1.e + * gl2.s ---------- gl2.e + * 100% + * + * gl1.s ---------- gl1.e + * gl2.s ---------- gl2.e + * 50% + * + * gl1.s ---------- gl1.e + * gl2.s -------------------- gl2.e + * 25% (50% for gl1 but only 25% for gl2) + */ + public final double reciprocialOverlapFraction(final GenomeLoc o) { + if ( overlapsP(o) ) + return Math.min(overlapPercent(this, o), overlapPercent(o, this)); + else + return 0.0; + } + + private final static double overlapPercent(final GenomeLoc gl1, final GenomeLoc gl2) { + return (1.0 * gl1.intersect(gl2).size()) / gl1.size(); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index a06a7166f1..4a5032936e 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -35,8 +35,10 @@ import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; import org.broad.tribble.Feature; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; /** * Factory class for creating GenomeLocs @@ -85,12 +87,12 @@ public final int getNSequences() { @Requires("contig != null") public synchronized boolean hasContig(final String contig) { - return lastContig == contig || dict.getSequence(contig) != null; + return contig.equals(lastContig) || dict.getSequence(contig) != null; } @Requires("index >= 0") public synchronized boolean hasContig(final int index) { - return lastIndex == index|| dict.getSequence(index) != null; + return lastIndex == index || dict.getSequence(index) != null; } @Requires("contig != null") @@ -453,6 +455,28 @@ public GenomeLoc createGenomeLoc(final Feature feature) { return createGenomeLoc(feature.getChr(), feature.getStart(), feature.getEnd()); } + /** + * Creates a GenomeLoc corresponding to the variant context vc. If includeSymbolicEndIfPossible + * is true and the VC contains a symbolic allele, the end of the created genome loc will be the value + * of the END info field key if it exists, or vc.getEnd() if not. + * + * @param vc the variant context to span + * @param includeSymbolicEndIfPossible whether to use the END attribute of symbolic alleles + * @return a GenomeLoc covering vc + */ + public GenomeLoc createGenomeLoc(final VariantContext vc, boolean includeSymbolicEndIfPossible) { + if ( includeSymbolicEndIfPossible && vc.isSymbolic() ) { + int end = vc.getAttributeAsInt(VCFConstants.END_KEY, vc.getEnd()); + return createGenomeLoc(vc.getChr(), vc.getStart(), end); + } + else + return createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd()); + } + + public GenomeLoc createGenomeLoc(final VariantContext vc) { + return createGenomeLoc(vc, false); + } + /** * create a new genome loc, given the contig name, and a single position. Must be on the reference * @@ -530,4 +554,54 @@ public GenomeLoc createOverEntireContig(String contigName) { return createGenomeLoc(contigName,contig.getSequenceIndex(),1,contig.getSequenceLength(), true); } + /** + * Creates a loc to the left of the given loc (ending at the loc start - 1) of up to maxBasePairs size. + * @param loc The original loc + * @param maxBasePairs The maximum number of basePairs + * @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the start of the contig.
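reciprocialOverlapFraction is symmetric by construction: each side's coverage is the intersection size divided by that side's own size, and the reported fraction is the smaller of the two, so a small interval inside a large one scores low. A self-contained re-implementation over plain 1-based inclusive intervals, reproducing the three cases drawn in the javadoc:

    public class ReciprocalOverlapSketch {
        // fraction of [s1,e1] covered by its intersection with [s2,e2]
        static double overlapPercent(int s1, int e1, int s2, int e2) {
            int interSize = Math.max(0, Math.min(e1, e2) - Math.max(s1, s2) + 1);
            return (1.0 * interSize) / (e1 - s1 + 1);
        }

        static double reciprocalOverlapFraction(int s1, int e1, int s2, int e2) {
            // min of the two one-sided fractions, 0.0 when the intervals are disjoint
            return Math.min(overlapPercent(s1, e1, s2, e2), overlapPercent(s2, e2, s1, e1));
        }

        public static void main(String[] args) {
            System.out.println(reciprocalOverlapFraction(1, 10, 1, 10));  // identical loci -> 1.0
            System.out.println(reciprocalOverlapFraction(1, 10, 6, 15));  // half shared -> 0.5
            System.out.println(reciprocalOverlapFraction(1, 10, 6, 25));  // 0.5 for gl1, 0.25 for gl2 -> 0.25
        }
    }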
+ */ + @Requires({"loc != null", "maxBasePairs > 0"}) + public GenomeLoc createGenomeLocAtStart(GenomeLoc loc, int maxBasePairs) { + if (GenomeLoc.isUnmapped(loc)) + return null; + String contigName = loc.getContig(); + SAMSequenceRecord contig = contigInfo.getSequence(contigName); + int contigIndex = contig.getSequenceIndex(); + + int start = loc.getStart() - maxBasePairs; + int stop = loc.getStart() - 1; + + if (start < 1) + start = 1; + if (stop < 1) + return null; + + return createGenomeLoc(contigName, contigIndex, start, stop, true); + } + + /** + * Creates a loc to the right (starting at the loc stop + 1) of maxBasePairs size. + * @param loc The original loc + * @param maxBasePairs The maximum number of basePairs + * @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the end of the contig. + */ + @Requires({"loc != null", "maxBasePairs > 0"}) + public GenomeLoc createGenomeLocAtStop(GenomeLoc loc, int maxBasePairs) { + if (GenomeLoc.isUnmapped(loc)) + return null; + String contigName = loc.getContig(); + SAMSequenceRecord contig = contigInfo.getSequence(contigName); + int contigIndex = contig.getSequenceIndex(); + int contigLength = contig.getSequenceLength(); + + int start = loc.getStop() + 1; + int stop = loc.getStop() + maxBasePairs; + + if (start > contigLength) + return null; + if (stop > contigLength) + stop = contigLength; + + return createGenomeLoc(contigName, contigIndex, start, stop, true); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index ce2ca2c287..df682f215f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -33,8 +33,8 @@ import java.util.List; public class Haplotype { - protected byte[] bases = null; - protected double[] quals = null; + protected final byte[] bases; + protected final double[] quals; private GenomeLoc genomeLocation = null; private boolean isReference = false; @@ -69,6 +69,11 @@ public Haplotype(byte[] bases, GenomeLoc loc, boolean isRef) { this.isReference = isRef; } + @Override + public boolean equals( Object h ) { + return h instanceof Haplotype && Arrays.equals(bases, ((Haplotype) h).bases); + } + public double getQualitySum() { double s = 0; for (int k=0; k < bases.length; k++) { @@ -88,7 +93,7 @@ public String toString() { public double[] getQuals() { return quals; } - public byte[] getBasesAsBytes() { + public byte[] getBases() { return bases; } @@ -100,7 +105,6 @@ public long getStopPosition() { return genomeLocation.getStop(); } - public boolean isReference() { return isReference; } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 17f458f31b..759e1649df 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -188,6 +188,10 @@ public static boolean wellFormedDouble(double val) { return ! Double.isInfinite(val) && ! 
Double.isNaN(val); } + public static double bound(double value, double minBoundary, double maxBoundary) { + return Math.max(Math.min(value, maxBoundary), minBoundary); + } + public static boolean isBounded(double val, double lower, double upper) { return val >= lower && val <= upper; } @@ -1029,14 +1033,15 @@ public void merge(RunningAverage other) { public static final double JACOBIAN_LOG_TABLE_STEP = 0.1; public static final double INV_JACOBIAN_LOG_TABLE_STEP = 1.0/JACOBIAN_LOG_TABLE_STEP; public static final double MAX_JACOBIAN_TOLERANCE = 10.0; - private static final int MAXN = 10000; + private static final int MAXN = 11000; + private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients static { - log10Cache = new double[2*MAXN]; + log10Cache = new double[LOG10_CACHE_SIZE]; jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE]; log10Cache[0] = Double.NEGATIVE_INFINITY; - for (int k=1; k < 2*MAXN; k++) + for (int k=1; k < LOG10_CACHE_SIZE; k++) log10Cache[k] = Math.log10(k); for (int k=0; k < JACOBIAN_LOG_TABLE_SIZE; k++) { diff --git a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java b/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java index cf45dab79c..b9c209e698 100755 --- a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java +++ b/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java @@ -1,147 +1,394 @@ package org.broadinstitute.sting.utils; import org.broadinstitute.sting.gatk.samples.Sample; -import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.Collection; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import java.util.*; /** - * User: carneiro + * User: carneiro / lfran * Date: 3/9/11 * Time: 12:38 PM + * + * Class for the identification and tracking of mendelian violation. 
It can be used in 2 distinct ways: + * - Either using an instance of the MendelianViolation class to track mendelian violations for each of the families while + * walking over the variants + * - Or using the static methods to directly get information about mendelian violations in a family at a given locus + * */ public class MendelianViolation { - String sampleMom; - String sampleDad; - String sampleChild; + //List of families with violations + private List violationFamilies; - List allelesMom; - List allelesDad; - List allelesChild; + //Call information + private int nocall = 0; + private int familyCalled = 0; + private int varFamilyCalled = 0; + private int lowQual = 0; - double minGenotypeQuality; + private boolean allCalledOnly = true; - static final int[] mvOffsets = new int[] { 1,2,5,6,8,11,15,18,20,21,24,25 }; - static final int[] nonMVOffsets = new int[]{ 0,3,4,7,9,10,12,13,14,16,17,19,22,23,26 }; + //Stores occurrences of inheritance + private EnumMap>> inheritance; + + private int violations_total=0; + + private double minGenotypeQuality; + + private boolean abortOnSampleNotFound; + + //Number of families with genotype information for all members + public int getFamilyCalledCount(){ + return familyCalled; + } + + //Number of families with genotype information for all members and at least one variant genotype + public int getVarFamilyCalledCount(){ + return varFamilyCalled; + } + + //Number of families missing genotypes for one or more of their members + public int getFamilyNoCallCount(){ + return nocall; + } + + //Number of families with genotypes below the set quality threshold + public int getFamilyLowQualsCount(){ + return lowQual; + } + + public int getViolationsCount(){ + return violations_total; + } + + //Count of alt alleles inherited from het parents (no violation) + public int getParentHetInheritedVar(){ + return getParentsHetHetInheritedVar() + getParentsRefHetInheritedVar() + getParentsVarHetInheritedVar(); + } + + //Count of ref alleles inherited from het parents (no violation) + public int getParentHetInheritedRef(){ + return getParentsHetHetInheritedRef() + getParentsRefHetInheritedRef() + getParentsVarHetInheritedRef(); + } + + //Count of HomRef/HomRef/HomRef trios + public int getRefRefRef(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF); + } + + //Count of HomVar/HomVar/HomVar trios + public int getVarVarVar(){ + return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR); + } + + //Count of HomRef/HomVar/Het trios + public int getRefVarHet(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET) + + inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF).get(Genotype.Type.HET); + } + + //Count of Het/Het/Het trios + public int getHetHetHet(){ + return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HET); + } + + //Count of Het/Het/HomRef trios + public int getHetHetHomRef(){ + return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF); + } + + //Count of Het/Het/HomVar trios + public int getHetHetHomVar(){ + return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR); + } + + //Count of ref alleles inherited from Het/Het parents (no violation) + public int getParentsHetHetInheritedRef(){ + return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HET) + +
2*inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF); + //return parentsHetHet_childRef; + } + + //Count of var alleles inherited from Het/Het parents (no violation) + public int getParentsHetHetInheritedVar(){ + return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HET) + + 2*inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR); + //return parentsHetHet_childVar; + } + + //Count of ref alleles inherited from HomRef/Het parents (no violation) + public int getParentsRefHetInheritedRef(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF) + + inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF); + //return parentsHomRefHet_childRef; + } + + //Count of var alleles inherited from HomRef/Het parents (no violation) + public int getParentsRefHetInheritedVar(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HET).get(Genotype.Type.HET) + + inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_REF).get(Genotype.Type.HET); + //return parentsHomRefHet_childVar; + } + + //Count of ref alleles inherited from HomVar/Het parents (no violation) + public int getParentsVarHetInheritedRef(){ + return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET).get(Genotype.Type.HET) + + inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET); + //return parentsHomVarHet_childRef; + } + + //Count of var alleles inherited from HomVar/Het parents (no violation) + public int getParentsVarHetInheritedVar(){ + return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR) + + inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR); + //return parentsHomVarHet_childVar; + } + + //Count of violations of the type HOM_REF/HOM_REF -> HOM_VAR + public int getParentsRefRefChildVar(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR); + } + + //Count of violations of the type HOM_REF/HOM_REF -> HET + public int getParentsRefRefChildHet(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF).get(Genotype.Type.HET); + } + + //Count of violations of the type HOM_REF/HET -> HOM_VAR + public int getParentsRefHetChildVar(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR) + + inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR); + } - private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)"); + //Count of violations of the type HOM_REF/HOM_VAR -> HOM_VAR + public int getParentsRefVarChildVar(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR) + + inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR); + } - public String getSampleMom() { - return sampleMom; + //Count of violations of the type HOM_REF/HOM_VAR -> HOM_REF + public int getParentsRefVarChildRef(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF) + + inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF); } - public String getSampleDad() { - return sampleDad; + + //Count of violations of the type HOM_VAR/HET -> HOM_REF + public int getParentsVarHetChildRef(){ + return 
inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF) + + inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF); } - public String getSampleChild() { - return sampleChild; + + //Count of violations of the type HOM_VAR/HOM_VAR -> HOM_REF + public int getParentsVarVarChildRef(){ + return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF); } + + //Count of violations of the type HOM_VAR/HOM_VAR -> HET + public int getParentsVarVarChildHet(){ + return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET); + } + + + //Count of violations of the type HOM_VAR/? -> HOM_REF + public int getParentVarChildRef(){ + return getParentsRefVarChildRef() + getParentsVarHetChildRef() +getParentsVarVarChildRef(); + } + + //Count of violations of the type HOM_REF/? -> HOM_VAR + public int getParentRefChildVar(){ + return getParentsRefVarChildVar() + getParentsRefHetChildVar() +getParentsRefRefChildVar(); + } + + //Returns a String containing all trios where a Mendelian violation was observed. + //The String is formatted "mom1+dad1=child1,mom2+dad2=child2,..." + public String getViolationFamiliesString(){ + if(violationFamilies.isEmpty()) + return ""; + + Iterator it = violationFamilies.iterator(); + String violationFams = it.next(); + while(it.hasNext()){ + violationFams += ","+it.next(); + } + return violationFams; + } + + public List getViolationFamilies(){ + return violationFamilies; + } + + static final int[] mvOffsets = new int[] { 1,2,5,6,8,11,15,18,20,21,24,25 }; + static final int[] nonMVOffsets = new int[]{ 0,3,4,7,9,10,12,13,14,16,17,19,22,23,26 }; + public double getMinGenotypeQuality() { return minGenotypeQuality; } - /** + /** + * Constructor + * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to assess mendelian violation * - * @param sampleMomP - sample name of mom - * @param sampleDadP - sample name of dad - * @param sampleChildP - sample name of child */ - public MendelianViolation (String sampleMomP, String sampleDadP, String sampleChildP) { - sampleMom = sampleMomP; - sampleDad = sampleDadP; - sampleChild = sampleChildP; + public MendelianViolation(double minGenotypeQualityP) { + this(minGenotypeQualityP,true); } /** - * - * @param family - the sample names string "mom+dad=child" + * Constructor * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to assess mendelian violation + * @param abortOnSampleNotFound - Whether to stop execution if a family is passed but no relevant genotypes are found. If false, then the family is ignored. */ - public MendelianViolation(String family, double minGenotypeQualityP) { + public MendelianViolation(double minGenotypeQualityP, boolean abortOnSampleNotFound) { minGenotypeQuality = minGenotypeQualityP; - - Matcher m = FAMILY_PATTERN.matcher(family); - if (m.matches()) { - sampleMom = m.group(1); - sampleDad = m.group(2); - sampleChild = m.group(3); - } - else - throw new IllegalArgumentException("Malformatted family structure string: " + family + " required format is mom+dad=child"); + this.abortOnSampleNotFound = abortOnSampleNotFound; + violationFamilies = new ArrayList(); + createInheritanceMap(); } /** - * An alternative to the more general constructor if you want to get the Sample information from the engine yourself. - * @param sample - the sample object extracted from the sample metadata YAML file given to the engine.
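The inheritance field above is a mom -> dad -> child table of trio genotype-state counts, so every getter reduces to a single chain of get() calls and every update is one read-modify-write. A compact stand-alone version (GType is a stand-in for Genotype.Type, whose exact constants may differ):

    import java.util.EnumMap;

    public class TrioCountSketch {
        enum GType { NO_CALL, HOM_REF, HET, HOM_VAR }

        public static void main(String[] args) {
            EnumMap<GType, EnumMap<GType, EnumMap<GType, Integer>>> inheritance =
                    new EnumMap<GType, EnumMap<GType, EnumMap<GType, Integer>>>(GType.class);
            // zero every mom/dad/child combination up front, as createInheritanceMap does
            for (GType m : GType.values()) {
                inheritance.put(m, new EnumMap<GType, EnumMap<GType, Integer>>(GType.class));
                for (GType d : GType.values()) {
                    inheritance.get(m).put(d, new EnumMap<GType, Integer>(GType.class));
                    for (GType c : GType.values())
                        inheritance.get(m).get(d).put(c, 0);
                }
            }

            // record one HET x HET -> HOM_VAR trio
            int count = inheritance.get(GType.HET).get(GType.HET).get(GType.HOM_VAR);
            inheritance.get(GType.HET).get(GType.HET).put(GType.HOM_VAR, count + 1);

            // queries such as getHetHetHomVar() are then a single lookup
            System.out.println(inheritance.get(GType.HET).get(GType.HET).get(GType.HOM_VAR));
        }
    }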
+ * Constructor * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to assess mendelian violation + * @param abortOnSampleNotFound - Whether to stop execution if a family is passed but no relevant genotypes are found. If false, then the family is ignored. + * @param completeTriosOnly - whether only complete trios are considered or parent/child pairs are too. */ - public MendelianViolation(Sample sample, double minGenotypeQualityP) { - sampleMom = sample.getMother().getID(); - sampleDad = sample.getFather().getID(); - sampleChild = sample.getID(); + public MendelianViolation(double minGenotypeQualityP, boolean abortOnSampleNotFound, boolean completeTriosOnly) { minGenotypeQuality = minGenotypeQualityP; + this.abortOnSampleNotFound = abortOnSampleNotFound; + violationFamilies = new ArrayList(); + createInheritanceMap(); + allCalledOnly = completeTriosOnly; } /** - * This method prepares the object to evaluate for violation. Typically you won't call it directly, a call to - * isViolation(vc) will take care of this. But if you want to know whether your site was a valid comparison site - * before evaluating it for mendelian violation, you can call setAlleles and then isViolation(). - * @param vc - the variant context to extract the genotypes and alleles for mom, dad and child. - * @return false if couldn't find the genotypes or context has empty alleles. True otherwise. + * @param families the families to be checked for Mendelian violations + * @param vc the variant context to extract the genotypes and alleles for mom, dad and child. + * @return the number of mendelian violations found at the site. */ - public boolean setAlleles (VariantContext vc) - { - Genotype gMom = vc.getGenotypes(sampleMom).get(sampleMom); - Genotype gDad = vc.getGenotypes(sampleDad).get(sampleDad); - Genotype gChild = vc.getGenotypes(sampleChild).get(sampleChild); - - if (gMom == null || gDad == null || gChild == null) - throw new IllegalArgumentException(String.format("Variant %s:%d didn't contain genotypes for all family members: mom=%s dad=%s child=%s", vc.getChr(), vc.getStart(), sampleMom, sampleDad, sampleChild)); + public int countViolations(Map> families, VariantContext vc){ - if (gMom.isNoCall() || gDad.isNoCall() || gChild.isNoCall() || - gMom.getPhredScaledQual() < minGenotypeQuality || - gDad.getPhredScaledQual() < minGenotypeQuality || - gChild.getPhredScaledQual() < minGenotypeQuality ) { + //Reset counts + nocall = 0; + lowQual = 0; + familyCalled = 0; + varFamilyCalled = 0; + violations_total=0; + violationFamilies.clear(); + clearInheritanceMap(); - return false; + for(Set family : families.values()){ + Iterator sampleIterator = family.iterator(); + Sample sample; + while(sampleIterator.hasNext()){ + sample = sampleIterator.next(); + if(sample.getParents().size() > 0) + updateViolations(sample.getFamilyID(),sample.getMaternalID(), sample.getPaternalID(), sample.getID() ,vc); + } } + return violations_total; + } - allelesMom = gMom.getAlleles(); - allelesDad = gDad.getAlleles(); - allelesChild = gChild.getAlleles(); - return !allelesMom.isEmpty() && !allelesDad.isEmpty() && !allelesChild.isEmpty(); + public boolean isViolation(Sample mother, Sample father, Sample child, VariantContext vc){ + + //Reset counts + nocall = 0; + lowQual = 0; + familyCalled = 0; + varFamilyCalled = 0; + violations_total=0; + violationFamilies.clear(); + clearInheritanceMap(); + updateViolations(mother.getFamilyID(),mother.getID(),father.getID(),child.getID(),vc); + return
violations_total>0; } - /** - * - * @param vc the variant context to extract the genotypes and alleles for mom, dad and child. - * @return False if we can't determine (lack of information), or it's not a violation. True if it is a violation. - * - */ - public boolean isViolation(VariantContext vc) - { - return setAlleles(vc) && isViolation(); + private void updateViolations(String familyId, String motherId, String fatherId, String childId, VariantContext vc){ + + int count; + Genotype gMom = vc.getGenotype(motherId); + Genotype gDad = vc.getGenotype(fatherId); + Genotype gChild = vc.getGenotype(childId); + + if (gMom == null || gDad == null || gChild == null){ + if(abortOnSampleNotFound) + throw new IllegalArgumentException(String.format("Variant %s:%d: Missing genotypes for family %s: mom=%s dad=%s child=%s", vc.getChr(), vc.getStart(), familyId, motherId, fatherId, childId)); + else + return; + } + //Count No calls + if(allCalledOnly && (!gMom.isCalled() || !gDad.isCalled() || !gChild.isCalled())){ + nocall++; + } + else if (!gMom.isCalled() && !gDad.isCalled() || !gChild.isCalled()){ + nocall++; + } + //Count lowQual. Note that if min quality is set to 0, even values with no quality associated are returned + else if (minGenotypeQuality>0 && (gMom.getPhredScaledQual() < minGenotypeQuality || + gDad.getPhredScaledQual() < minGenotypeQuality || + gChild.getPhredScaledQual() < minGenotypeQuality )) { + lowQual++; + } + else{ + //Count all families called at this locus + familyCalled++; + //If the family is all homref, not too interesting + if(!(gMom.isHomRef() && gDad.isHomRef() && gChild.isHomRef())) + { + varFamilyCalled++; + if(isViolation(gMom, gDad, gChild)){ + violationFamilies.add(familyId); + violations_total++; + } + } + count = inheritance.get(gMom.getType()).get(gDad.getType()).get(gChild.getType()); + inheritance.get(gMom.getType()).get(gDad.getType()).put(gChild.getType(),count+1); + + } }
- */ - public boolean isViolation() { - if (allelesMom.contains(allelesChild.get(0)) && allelesDad.contains(allelesChild.get(1)) || - allelesMom.contains(allelesChild.get(1)) && allelesDad.contains(allelesChild.get(0))) - return false; - return true; + private boolean isViolation(Genotype gMom, Genotype gDad, Genotype gChild) { + // one parent is a no-call: a violation can only be proven from opposite homozygotes in the called parent and the child + if(!gMom.isCalled()){ + return (gDad.isHomRef() && gChild.isHomVar()) || (gDad.isHomVar() && gChild.isHomRef()); + } + else if(!gDad.isCalled()){ + return (gMom.isHomRef() && gChild.isHomVar()) || (gMom.isHomVar() && gChild.isHomRef()); + } + // Both parents have genotype information: it is a violation unless one of the child's alleles can come from mom and the other from dad (e.g. mom=A/A, dad=A/T allows children A/A and A/T, but not T/T) + return !(gMom.getAlleles().contains(gChild.getAlleles().get(0)) && gDad.getAlleles().contains(gChild.getAlleles().get(1)) || + gMom.getAlleles().contains(gChild.getAlleles().get(1)) && gDad.getAlleles().contains(gChild.getAlleles().get(0))); + } + + private void createInheritanceMap(){ + + inheritance = new EnumMap<Genotype.Type, EnumMap<Genotype.Type, EnumMap<Genotype.Type, Integer>>>(Genotype.Type.class); + for(Genotype.Type mType : Genotype.Type.values()){ + inheritance.put(mType, new EnumMap<Genotype.Type, EnumMap<Genotype.Type, Integer>>(Genotype.Type.class)); + for(Genotype.Type dType : Genotype.Type.values()){ + inheritance.get(mType).put(dType, new EnumMap<Genotype.Type, Integer>(Genotype.Type.class)); + for(Genotype.Type cType : Genotype.Type.values()){ + inheritance.get(mType).get(dType).put(cType, 0); + } + } + } + + } + + private void clearInheritanceMap(){ + for(Genotype.Type mType : Genotype.Type.values()){ + for(Genotype.Type dType : Genotype.Type.values()){ + for(Genotype.Type cType : Genotype.Type.values()){ + inheritance.get(mType).get(dType).put(cType, 0); + } + } + } } /** * @return the likelihood ratio for a mendelian violation */ - public double violationLikelihoodRatio(VariantContext vc) { + public double violationLikelihoodRatio(VariantContext vc, String motherId, String fatherId, String childId) { double[] logLikAssignments = new double[27]; // the matrix to set up is // MOM DAD CHILD @@ -152,9 +399,9 @@ public double violationLikelihoodRatio(VariantContext vc) { // AA AB | AB // |- BB // etc.
The leaves are counted as 0-11 for MVs and 0-14 for non-MVs - double[] momGL = vc.getGenotype(sampleMom).getLikelihoods().getAsVector(); - double[] dadGL = vc.getGenotype(sampleDad).getLikelihoods().getAsVector(); - double[] childGL = vc.getGenotype(sampleChild).getLikelihoods().getAsVector(); + double[] momGL = vc.getGenotype(motherId).getLikelihoods().getAsVector(); + double[] dadGL = vc.getGenotype(fatherId).getLikelihoods().getAsVector(); + double[] childGL = vc.getGenotype(childId).getLikelihoods().getAsVector(); int offset = 0; for ( int oMom = 0; oMom < 3; oMom++ ) { for ( int oDad = 0; oDad < 3; oDad++ ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutor.java b/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutor.java index d8176ff4e5..d753da1c87 100644 --- a/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutor.java +++ b/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutor.java @@ -109,7 +109,7 @@ public boolean exec() { List tempFiles = new ArrayList(); try { - File tempLibDir = IOUtils.tempDir("R.", ".lib"); + File tempLibDir = IOUtils.tempDir("Rlib.", ""); tempFiles.add(tempLibDir); StringBuilder expression = new StringBuilder("tempLibDir = '").append(tempLibDir).append("';"); diff --git a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java index edc1413ba4..68b220aabd 100755 --- a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java @@ -150,7 +150,7 @@ public static void getUniquifiedSamplesFromRods(GenomeAnalysisEngine toolkit, Se // iterate to get all of the sample names - for ( Map.Entry pair : VCFUtils.getVCFHeadersFromRods(toolkit, null).entrySet() ) { + for ( Map.Entry pair : VCFUtils.getVCFHeadersFromRods(toolkit).entrySet() ) { Set vcfSamples = pair.getValue().getGenotypeSamples(); for ( String sample : vcfSamples ) addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, pair.getKey()); diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index f0eb5d3999..10bc050dac 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -388,6 +388,15 @@ public static int[] indexOfAll(String s, int ch) { return reallocate(pos, z); } + public static int countSetBits(boolean[] array) { + int counter = 0; + for ( int i = 0; i < array.length; i++ ) { + if ( array[i] ) + counter++; + } + return counter; + } + /** * Returns new (reallocated) integer array of the specified size, with content * of the original array orig copied into it. 
If newSize is @@ -645,4 +654,18 @@ public static final String resolveHostname() { // handle exception } } + + + public static byte [] arrayFromArrayWithLength(byte[] array, int length) { + byte [] output = new byte[length]; + for (int j = 0; j < length; j++) + output[j] = array[(j % array.length)]; + return output; + } + + public static void fillArrayWithByte(byte[] array, byte value) { + for (int i = 0; i < array.length; i++) + array[i] = value; + } […] + if (matchesCount > 0) { + unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH)); + matchesCount = 0; + unclippedCigar.add(element); + } + else + unclippedCigar.add(element); + } + if (matchesCount > 0) + unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH)); + + unclipped.setCigar(unclippedCigar); + unclipped.setAlignmentStart(read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), unclippedCigar)); + + return unclipped; + } + /** * Given a cigar string, get the number of bases hard or soft clipped at the start */ @@ -247,10 +283,12 @@ private Cigar softClip(final Cigar __cigar, final int __startClipEnd, final int return newCigar; } - @Requires({"start <= stop", "start == 0 || stop == read.getReadLength() - 1", "!read.getReadUnmappedFlag()"}) + @Requires({"start <= stop", "start == 0 || stop == read.getReadLength() - 1"}) private GATKSAMRecord hardClip (GATKSAMRecord read, int start, int stop) { if (start == 0 && stop == read.getReadLength() - 1) - return new GATKSAMRecord(read.getHeader()); + return GATKSAMRecord.emptyRead(read); +// return new GATKSAMRecord(read.getHeader()); + // If the read is unmapped there is no Cigar string, so we should not create a new one CigarShift cigarShift = (read.getReadUnmappedFlag()) ? new CigarShift(new Cigar(), 0, 0) : hardClipCigar(read.getCigar(), start, stop); @@ -373,6 +411,10 @@ else if (index + shift > stop + 1) { while(cigarElementIterator.hasNext()) { cigarElement = cigarElementIterator.next(); alignmentShift += calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength()); + + // if the read had a HardClip operator in the end, combine it with the Hard Clip we are adding + if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) + totalHardClipCount += cigarElement.getLength(); } newCigar.add(new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP)); } @@ -456,17 +498,20 @@ private int calculateAlignmentStartShift(Cigar oldCigar, Cigar newCigar) { int newShift = 0; int oldShift = 0; + boolean readHasStarted = false; // if the new cigar is composed of S and H only, we have to traverse the entire old cigar to calculate the shift for (CigarElement cigarElement : newCigar.getCigarElements()) { if (cigarElement.getOperator() == CigarOperator.HARD_CLIP || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) newShift += cigarElement.getLength(); - else + else { + readHasStarted = true; break; + } } for (CigarElement cigarElement : oldCigar.getCigarElements()) { if (cigarElement.getOperator() == CigarOperator.HARD_CLIP || cigarElement.getOperator() == CigarOperator.SOFT_CLIP ) - oldShift += Math.min(cigarElement.getLength(), newShift - oldShift); - else + oldShift += cigarElement.getLength(); + else if (readHasStarted) break; } return newShift - oldShift; diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingRepresentation.java similarity index 86% rename from
public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java rename to public/java/src/org/broadinstitute/sting/utils/clipping/ClippingRepresentation.java index d574ba2f06..f0765665aa 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingRepresentation.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.utils.clipreads; +package org.broadinstitute.sting.utils.clipping; /** * How should we represent clipped bases in a read? @@ -29,5 +29,10 @@ public enum ClippingRepresentation { * lossy) operation. Note that this can only be applied to cases where the clipped * bases occur at the start or end of a read. */ - HARDCLIP_BASES + HARDCLIP_BASES, + + /** + * Turn all soft-clipped bases into matches + */ + REVERT_SOFTCLIPPED_BASES, } diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java new file mode 100644 index 0000000000..afe7fa9753 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -0,0 +1,406 @@ +package org.broadinstitute.sting.utils.clipping; + +import com.google.java.contract.Requires; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.util.ArrayList; +import java.util.List; + +/** + * A comprehensive clipping tool. + * + * General Contract: + * - All clipping operations return a new read with the requested bases clipped; they never modify the original read. + * - If a read is fully clipped, return an empty GATKSAMRecord, never null. + * - When hard clipping, add cigar operator H for every *reference base* removed (i.e. Matches, SoftClips and Deletions, but *not* insertions). See Hard Clipping notes for details. + * + * + * There are several types of clipping to use: + * + * Write N's: + * Change the bases to N's in the desired region. This can be applied anywhere in the read. + * + * Write Q0's: + * Change the quality of the bases in the desired region to Q0. This can be applied anywhere in the read. + * + * Write both N's and Q0's: + * Same as the two independent operations, put together. + * + * Soft Clipping: + * Do not change the read; just mark the clipped bases as soft clipped in the Cigar String + * and adjust the alignment start and end of the read. + * + * Hard Clipping: + * Creates a new read without the hard clipped bases (and base qualities). The cigar string + * will be updated with the cigar operator H for every reference base removed (i.e. Matches, + * Soft clipped bases and deletions, but *not* insertions). This contract with the cigar + * is necessary to allow read.getUnclippedStart() / End() to recover the original alignment + * of the read (before clipping). + * + */ +public class ReadClipper { + final GATKSAMRecord read; + boolean wasClipped; + List<ClippingOp> ops = null; + + /** + * Initializes a ReadClipper object. + * + * You can set up your clipping operations using the addOp method. When you're ready to + * generate a new read with all the clipping operations, use clipRead(). + * + * Note: Use this if you want to set up multiple operations on the read using the ClippingOp + * class.
If you just want to apply one of the typical modes of clipping, use the static + * clipping functions available in this class instead. + * + * @param read the read to clip + */ + public ReadClipper(final GATKSAMRecord read) { + this.read = read; + this.wasClipped = false; + } + + /** + * Add clipping operation to the read. + * + * You can add as many operations as necessary to this read before clipping. Beware that the + * order in which you add these operations matters. For example, if you hard clip the beginning + * of a read first and then try to hard clip the end, the indices will have changed. Make sure you + * know what you're doing; otherwise, just use the static functions below that take care of the + * ordering for you. + * + * Note: You only choose the clipping mode when you use clipRead() + * + * @param op a ClippingOp object describing the area you want to clip. + */ + public void addOp(ClippingOp op) { + if (ops == null) ops = new ArrayList<ClippingOp>(); + ops.add(op); + } + + /** + * Check the list of operations set up for this read. + * + * @return a list of the operations set up for this read. + */ + public List<ClippingOp> getOps() { + return ops; + } + + /** + * Check whether or not this read has been clipped. + * @return true if this read has produced a clipped read, false otherwise. + */ + public boolean wasClipped() { + return wasClipped; + } + + /** + * The original read. + * + * @return the read to be clipped (the original) + */ + public GATKSAMRecord getRead() { + return read; + } + + /** + * Creates a new read that's been clipped according to ops and the chosen algorithm. + * The original read is unmodified. + * + * @param algorithm the mode of clipping to apply to the stacked operations. + * @return a new read with the clipping applied. + */ + public GATKSAMRecord clipRead(ClippingRepresentation algorithm) { + if (ops == null) + return getRead(); + else { + try { + GATKSAMRecord clippedRead = (GATKSAMRecord) read.clone(); + for (ClippingOp op : getOps()) { + //check if the clipped read can still be clipped in the range requested + if (op.start < clippedRead.getReadLength()) { + ClippingOp fixedOperation = op; + if (op.stop >= clippedRead.getReadLength()) + fixedOperation = new ClippingOp(op.start, clippedRead.getReadLength() - 1); + + clippedRead = fixedOperation.apply(algorithm, clippedRead); + } + } + wasClipped = true; + ops.clear(); + if ( clippedRead.isEmpty() ) + return GATKSAMRecord.emptyRead(clippedRead); +// return new GATKSAMRecord( clippedRead.getHeader() ); + return clippedRead; + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); // this should never happen + } + } + } + + + /** + * Hard clips the left tail of a read up to (and including) refStop using reference + * coordinates. + * + * @param refStop the last base to be hard clipped in the left tail of the read. + * @return a new read, without the left tail. + */ + @Requires("!read.getReadUnmappedFlag()") // can't handle unmapped reads, as we're using reference coordinates to clip + private GATKSAMRecord hardClipByReferenceCoordinatesLeftTail(int refStop) { + return hardClipByReferenceCoordinates(-1, refStop); + } + public static GATKSAMRecord hardClipByReferenceCoordinatesLeftTail(GATKSAMRecord read, int refStop) { + return (new ReadClipper(read)).hardClipByReferenceCoordinates(-1, refStop); + } + + + + /** + * Hard clips the right tail of a read starting at (and including) refStart using reference + * coordinates.
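The addOp/clipRead flow above is the general-purpose path: stack as many ClippingOp objects as needed, then materialize them all in one pass. A minimal sketch of that flow, assuming a 100bp GATKSAMRecord named read obtained elsewhere (all coordinates are invented for illustration):

    // Stack two hard-clip operations; per the addOp contract above, clip the
    // end of the read before the start so earlier ops don't shift later indices.
    ReadClipper clipper = new ReadClipper(read);
    clipper.addOp(new ClippingOp(90, 99));  // right tail first
    clipper.addOp(new ClippingOp(0, 4));    // then the left tail
    GATKSAMRecord clipped = clipper.clipRead(ClippingRepresentation.HARDCLIP_BASES);
    // 'read' is untouched; 'clipped' is a new read and clipper.wasClipped() is now true.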
+ * + * @param refStart the first base to be hard clipped in the right tail of the read. + * @return a new read, without the right tail. + */ + @Requires("!read.getReadUnmappedFlag()") // can't handle unmapped reads, as we're using reference coordinates to clip + private GATKSAMRecord hardClipByReferenceCoordinatesRightTail(int refStart) { + return hardClipByReferenceCoordinates(refStart, -1); + } + public static GATKSAMRecord hardClipByReferenceCoordinatesRightTail(GATKSAMRecord read, int refStart) { + return (new ReadClipper(read)).hardClipByReferenceCoordinates(refStart, -1); + } + + /** + * Hard clips a read using read coordinates. + * + * @param start the first base to clip (inclusive) + * @param stop the last base to clip (inclusive) + * @return a new read, without the clipped bases + */ + @Requires({"start >= 0 && stop <= read.getReadLength() - 1", // start and stop have to be within the read + "start == 0 || stop == read.getReadLength() - 1"}) // cannot clip the middle of the read + private GATKSAMRecord hardClipByReadCoordinates(int start, int stop) { + if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1)) + return GATKSAMRecord.emptyRead(read); +// return new GATKSAMRecord(read.getHeader()); + + this.addOp(new ClippingOp(start, stop)); + return clipRead(ClippingRepresentation.HARDCLIP_BASES); + } + public static GATKSAMRecord hardClipByReadCoordinates(GATKSAMRecord read, int start, int stop) { + return (new ReadClipper(read)).hardClipByReadCoordinates(start, stop); + } + + + /** + * Hard clips both tails of a read. + * Left tail goes from the beginning to the 'left' coordinate (inclusive) + * Right tail goes from the 'right' coordinate (inclusive) until the end of the read + * + * @param left the coordinate of the last base to be clipped in the left tail (inclusive) + * @param right the coordinate of the first base to be clipped in the right tail (inclusive) + * @return a new read, without the clipped bases + */ + @Requires({"left <= right", // tails cannot overlap + "left >= read.getAlignmentStart()", // coordinate has to be within the mapped read + "right <= read.getAlignmentEnd()"}) // coordinate has to be within the mapped read + private GATKSAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) { + if (read.isEmpty() || left == right) + return GATKSAMRecord.emptyRead(read); +// return new GATKSAMRecord(read.getHeader()); + GATKSAMRecord leftTailRead = hardClipByReferenceCoordinates(right, -1); + + // after clipping one tail, it is possible that the subsequent hard clipping of adjacent deletions + // makes the left cut index no longer part of the read. In that case, clip the read entirely. + if (left > leftTailRead.getAlignmentEnd()) + return GATKSAMRecord.emptyRead(read); +// return new GATKSAMRecord(read.getHeader()); + + ReadClipper clipper = new ReadClipper(leftTailRead); + return clipper.hardClipByReferenceCoordinatesLeftTail(left); + } + public static GATKSAMRecord hardClipBothEndsByReferenceCoordinates(GATKSAMRecord read, int left, int right) { + return (new ReadClipper(read)).hardClipBothEndsByReferenceCoordinates(left, right); + } + + + /** + * Hard clips any contiguous tail (left, right or both) with base quality lower than or equal to lowQual. + * + * This function will look for low quality tails and hard clip them away. A low quality tail + * ends when a base has base quality greater than lowQual.
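For one-shot use, the static wrappers shown above avoid managing a clipper instance by hand. A hypothetical usage sketch (the read and all coordinates are invented for illustration):

    // Each static helper builds a ReadClipper internally and returns a new read,
    // leaving the input read unmodified per the class contract.
    GATKSAMRecord headless = ReadClipper.hardClipByReadCoordinates(read, 0, 9); // drop the first 10 bases
    GATKSAMRecord trimmed = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, 1000010, 1000090);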
+ * + * @param lowQual every base quality lower than or equal to this in the tail of the read will be hard clipped + * @return a new read without low quality tails + */ + private GATKSAMRecord hardClipLowQualEnds(byte lowQual) { + if (read.isEmpty()) + return read; + + byte [] quals = read.getBaseQualities(); + int leftClipIndex = 0; + int rightClipIndex = read.getReadLength() - 1; + + // check how far we can clip both sides + while (rightClipIndex >= 0 && quals[rightClipIndex] <= lowQual) rightClipIndex--; + while (leftClipIndex < read.getReadLength() && quals[leftClipIndex] <= lowQual) leftClipIndex++; + + // if the entire read should be clipped, then return an empty read. + if (leftClipIndex > rightClipIndex) + return GATKSAMRecord.emptyRead(read); +// return (new GATKSAMRecord(read.getHeader())); + + if (rightClipIndex < read.getReadLength() - 1) { + this.addOp(new ClippingOp(rightClipIndex + 1, read.getReadLength() - 1)); + } + if (leftClipIndex > 0 ) { + this.addOp(new ClippingOp(0, leftClipIndex - 1)); + } + return this.clipRead(ClippingRepresentation.HARDCLIP_BASES); + } + public static GATKSAMRecord hardClipLowQualEnds(GATKSAMRecord read, byte lowQual) { + return (new ReadClipper(read)).hardClipLowQualEnds(lowQual); + } + + + /** + * Hard clips every soft clipped base in the read. + * + * @return a new read without the soft clipped bases + */ + private GATKSAMRecord hardClipSoftClippedBases () { + if (read.isEmpty()) + return read; + + int readIndex = 0; + int cutLeft = -1; // last position of the left tail to hard clip (inclusive) + int cutRight = -1; // first position of the right tail to hard clip (inclusive) + boolean rightTail = false; // trigger to stop clipping the left tail and start cutting the right tail + + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + if (cigarElement.getOperator() == CigarOperator.SOFT_CLIP) { + if (rightTail) { + cutRight = readIndex; + } + else { + cutLeft = readIndex + cigarElement.getLength() - 1; + } + } + else if (cigarElement.getOperator() != CigarOperator.HARD_CLIP) + rightTail = true; + + if (cigarElement.getOperator().consumesReadBases()) + readIndex += cigarElement.getLength(); + } + + // It is extremely important that we cut the end first, otherwise the read coordinates change. + if (cutRight >= 0) + this.addOp(new ClippingOp(cutRight, read.getReadLength() - 1)); + if (cutLeft >= 0) + this.addOp(new ClippingOp(0, cutLeft)); + + return clipRead(ClippingRepresentation.HARDCLIP_BASES); + } + public static GATKSAMRecord hardClipSoftClippedBases (GATKSAMRecord read) { + return (new ReadClipper(read)).hardClipSoftClippedBases(); + } + + + /** + * Checks if a read contains adaptor sequences. If it does, hard clips them out. + * + * Note: To see how a read is checked for adaptor sequence see ReadUtils.getAdaptorBoundary() + * + * @return a new read without adaptor sequence + */ + private GATKSAMRecord hardClipAdaptorSequence () { + final Integer adaptorBoundary = ReadUtils.getAdaptorBoundary(read); + + if (adaptorBoundary == null || !ReadUtils.isInsideRead(read, adaptorBoundary)) + return read; + + return read.getReadNegativeStrandFlag() ? hardClipByReferenceCoordinatesLeftTail(adaptorBoundary) : hardClipByReferenceCoordinatesRightTail(adaptorBoundary); + } + public static GATKSAMRecord hardClipAdaptorSequence (GATKSAMRecord read) { + return (new ReadClipper(read)).hardClipAdaptorSequence(); + } + + + /** + * Hard clips any leading insertions in the read. Only looks at the beginning of the read, not the end.
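To make the two-pointer scan in hardClipLowQualEnds above concrete, here is the same index arithmetic traced on an invented five-base quality array:

    // Trace of the hardClipLowQualEnds scan; quals and lowQual are invented for illustration.
    byte lowQual = 2;
    byte[] quals = {2, 2, 30, 40, 2};
    int left = 0, right = quals.length - 1;                        // right starts at 4
    while (right >= 0 && quals[right] <= lowQual) right--;         // stops at right = 3
    while (left < quals.length && quals[left] <= lowQual) left++;  // stops at left = 2
    // The clipper then queues ClippingOp(right + 1, 4) = (4,4) and ClippingOp(0, left - 1) = (0,1),
    // end first, exactly as the method does above.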
+ * + * @return a new read without leading insertions + */ + private GATKSAMRecord hardClipLeadingInsertions() { + if (read.isEmpty()) + return read; + + for(CigarElement cigarElement : read.getCigar().getCigarElements()) { + if (cigarElement.getOperator() != CigarOperator.HARD_CLIP && cigarElement.getOperator() != CigarOperator.SOFT_CLIP && + cigarElement.getOperator() != CigarOperator.INSERTION) + break; + + else if (cigarElement.getOperator() == CigarOperator.INSERTION) + this.addOp(new ClippingOp(0, cigarElement.getLength() - 1)); + + } + return clipRead(ClippingRepresentation.HARDCLIP_BASES); + } + public static GATKSAMRecord hardClipLeadingInsertions(GATKSAMRecord read) { + return (new ReadClipper(read)).hardClipLeadingInsertions(); + } + + + /** + * Turns soft clipped bases into matches + * + * @return a new read with every soft clip turned into a match + */ + private GATKSAMRecord revertSoftClippedBases() { + this.addOp(new ClippingOp(0, 0)); // REVERT_SOFTCLIPPED_BASES doesn't need coordinates + return this.clipRead(ClippingRepresentation.REVERT_SOFTCLIPPED_BASES); + } + public static GATKSAMRecord revertSoftClippedBases(GATKSAMRecord read) { + return (new ReadClipper(read)).revertSoftClippedBases(); + } + + /** + * Generic functionality to hard clip a read, used internally by hardClipByReferenceCoordinatesLeftTail + * and hardClipByReferenceCoordinatesRightTail. Should not be used directly. + * + * @param refStart first base to clip (inclusive) + * @param refStop last base to clip (inclusive) + * @return a new read, without the clipped bases + */ + @Requires("!read.getReadUnmappedFlag()") // can't handle unmapped reads, as we're using reference coordinates to clip + protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) { + int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL); + int stop = (refStop < 0) ?
read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL); + + if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1)) + return GATKSAMRecord.emptyRead(read); +// return new GATKSAMRecord(read.getHeader()); + + if (start < 0 || stop > read.getReadLength() - 1) + throw new ReviewedStingException("Trying to clip before the start or after the end of a read"); + + if ( start > stop ) + throw new ReviewedStingException("START > STOP -- this should never happen -- call Mauricio!"); + + if ( start > 0 && stop < read.getReadLength() - 1) + throw new ReviewedStingException(String.format("Trying to clip the middle of the read: start %d, stop %d, cigar: %s", start, stop, read.getCigarString())); + + this.addOp(new ClippingOp(start, stop)); + GATKSAMRecord clippedRead = clipRead(ClippingRepresentation.HARDCLIP_BASES); + this.ops = null; + return clippedRead; + } + + +} diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java deleted file mode 100644 index a6df986ba4..0000000000 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java +++ /dev/null @@ -1,200 +0,0 @@ -package org.broadinstitute.sting.utils.clipreads; - -import com.google.java.contract.Requires; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; - -import java.util.ArrayList; -import java.util.List; - -/** - * A simple collection of the clipping operations to apply to a read along with its read - */ -public class ReadClipper { - GATKSAMRecord read; - boolean wasClipped; - List ops = null; - - /** - * We didn't do any clipping work on this read, just leave everything as a default - * - * @param read - */ - public ReadClipper(final GATKSAMRecord read) { - this.read = read; - this.wasClipped = false; - } - - /** - * Add another clipping operation to apply to this read - * - * @param op - */ - public void addOp(ClippingOp op) { - if (ops == null) ops = new ArrayList(); - ops.add(op); - } - - public List getOps() { - return ops; - } - - public boolean wasClipped() { - return wasClipped; - } - - public GATKSAMRecord getRead() { - return read; - } - - public GATKSAMRecord hardClipByReferenceCoordinatesLeftTail(int refStop) { - return hardClipByReferenceCoordinates(-1, refStop); - } - - public GATKSAMRecord hardClipByReferenceCoordinatesRightTail(int refStart) { - return hardClipByReferenceCoordinates(refStart, -1); - } - - private int numDeletions(GATKSAMRecord read) { - int result = 0; - for (CigarElement e: read.getCigar().getCigarElements()) { - if ( e.getOperator() == CigarOperator.DELETION || e.getOperator() == CigarOperator.D ) - result =+ e.getLength(); - } - return result; - } - - protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) { - int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL); - int stop = (refStop < 0) ? 
read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL); - - if (start < 0 || stop > read.getReadLength() - 1) - throw new ReviewedStingException("Trying to clip before the start or after the end of a read"); - - if ( start > stop ) - throw new ReviewedStingException("START > STOP -- this should never happen -- call Mauricio!"); - - this.addOp(new ClippingOp(start, stop)); - GATKSAMRecord clippedRead = clipRead(ClippingRepresentation.HARDCLIP_BASES); - this.ops = null; - return clippedRead; - } - - public GATKSAMRecord hardClipByReadCoordinates(int start, int stop) { - this.addOp(new ClippingOp(start, stop)); - return clipRead(ClippingRepresentation.HARDCLIP_BASES); - } - - @Requires("left <= right") - public GATKSAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) { - if (left == right) - return new GATKSAMRecord(read.getHeader()); - GATKSAMRecord leftTailRead = hardClipByReferenceCoordinates(right, -1); - - // after clipping one tail, it is possible that the consequent hard clipping of adjacent deletions - // make the left cut index no longer part of the read. In that case, clip the read entirely. - if (left > leftTailRead.getAlignmentEnd()) - return new GATKSAMRecord(read.getHeader()); - - ReadClipper clipper = new ReadClipper(leftTailRead); - return clipper.hardClipByReferenceCoordinatesLeftTail(left); - } - - public GATKSAMRecord hardClipLowQualEnds(byte lowQual) { - byte [] quals = read.getBaseQualities(); - int leftClipIndex = 0; - int rightClipIndex = read.getReadLength() - 1; - - // check how far we can clip both sides - while (rightClipIndex >= 0 && quals[rightClipIndex] <= lowQual) rightClipIndex--; - while (leftClipIndex < read.getReadLength() && quals[leftClipIndex] <= lowQual) leftClipIndex++; - - // if the entire read should be clipped, then return an empty read. (--todo: maybe null is better? testing this for now) - if (leftClipIndex > rightClipIndex) - return (new GATKSAMRecord(read.getHeader())); - - if (rightClipIndex < read.getReadLength() - 1) { - this.addOp(new ClippingOp(rightClipIndex + 1, read.getReadLength() - 1)); - } - if (leftClipIndex > 0 ) { - this.addOp(new ClippingOp(0, leftClipIndex - 1)); - } - return this.clipRead(ClippingRepresentation.HARDCLIP_BASES); - } - - public GATKSAMRecord hardClipSoftClippedBases () { - int readIndex = 0; - int cutLeft = -1; // first position to hard clip (inclusive) - int cutRight = -1; // first position to hard clip (inclusive) - boolean rightTail = false; // trigger to stop clipping the left tail and start cutting the right tail - - for (CigarElement cigarElement : read.getCigar().getCigarElements()) { - if (cigarElement.getOperator() == CigarOperator.SOFT_CLIP) { - if (rightTail) { - cutRight = readIndex; - } - else { - cutLeft = readIndex + cigarElement.getLength() - 1; - } - } - else if (cigarElement.getOperator() != CigarOperator.HARD_CLIP) - rightTail = true; - - if (cigarElement.getOperator().consumesReadBases()) - readIndex += cigarElement.getLength(); - } - - // It is extremely important that we cut the end first otherwise the read coordinates change. - if (cutRight >= 0) - this.addOp(new ClippingOp(cutRight, read.getReadLength() - 1)); - if (cutLeft >= 0) - this.addOp(new ClippingOp(0, cutLeft)); - - return clipRead(ClippingRepresentation.HARDCLIP_BASES); - } - - - - /** - * Return a new read corresponding to this.read that's been clipped according to ops, if any are present. 
- * - * @param algorithm - * @return - */ - public GATKSAMRecord clipRead(ClippingRepresentation algorithm) { - if (ops == null) - return getRead(); - else { - try { - GATKSAMRecord clippedRead = (GATKSAMRecord) read.clone(); - for (ClippingOp op : getOps()) { - clippedRead = op.apply(algorithm, clippedRead); - } - wasClipped = true; - return clippedRead; - } catch (CloneNotSupportedException e) { - throw new RuntimeException(e); // this should never happen - } - } - } - - public GATKSAMRecord hardClipLeadingInsertions() { - for(CigarElement cigarElement : read.getCigar().getCigarElements()) { - if (cigarElement.getOperator() != CigarOperator.HARD_CLIP && cigarElement.getOperator() != CigarOperator.SOFT_CLIP && - cigarElement.getOperator() != CigarOperator.INSERTION && cigarElement.getOperator() != CigarOperator.DELETION) - break; - - else if (cigarElement.getOperator() == CigarOperator.INSERTION) { - this.addOp(new ClippingOp(0, cigarElement.getLength() - 1)); - } - - else if (cigarElement.getOperator() == CigarOperator.DELETION) { - throw new ReviewedStingException("No read should start with a deletion. Aligner bug?"); - } - } - return clipRead(ClippingRepresentation.HARDCLIP_BASES); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java index 413848543d..e4768fd5b9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java @@ -249,4 +249,6 @@ else if (readerType == BeagleReaderType.GENOTYPES) { return bglFeature; } + public boolean canDecode(final String potentialInput) { return false; } + } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java index a80e05d599..8bdb24b6c5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java @@ -24,8 +24,8 @@ package org.broadinstitute.sting.utils.codecs.hapmap; +import org.broad.tribble.AbstractFeatureCodec; import org.broad.tribble.Feature; -import org.broad.tribble.FeatureCodec; import org.broad.tribble.annotation.Strand; import org.broad.tribble.readers.LineReader; @@ -71,7 +71,7 @@ * @author Mark DePristo * @since 2010 */ -public class RawHapMapCodec implements FeatureCodec { +public class RawHapMapCodec extends AbstractFeatureCodec { // the minimum number of features in the HapMap file line private static final int minimumFeatureCount = 11; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java index f142fa5aaf..efcd3ecf00 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java @@ -138,4 +138,7 @@ public Object readHeader(LineReader reader) { public Class getFeatureType() { return RefSeqFeature.class; } + + public boolean canDecode(final String potentialInput) { return false; } + } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java index c04ca85926..a86d4781fd 100644 --- 
a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java @@ -6,8 +6,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.util.ArrayList; -import java.util.List; +import java.util.*; /** * the ref seq feature @@ -111,6 +110,34 @@ public String getOverlapString(GenomeLoc position) { return overlapString.toString(); } + ArrayList exonInRefOrderCache = null; + + public Integer getSortedOverlapInteger(GenomeLoc position) { + int exonNo = -1; + ArrayList exonsInReferenceOrder = exonInRefOrderCache != null ? exonInRefOrderCache : new ArrayList(exons); + if ( exonInRefOrderCache == null ) { + Collections.sort(exonsInReferenceOrder); + } + exonInRefOrderCache = exonsInReferenceOrder; + for ( GenomeLoc exon : exonsInReferenceOrder ) { + if ( exon.overlapsP(position) ) { + return ++exonNo; + } + ++exonNo; + } + + return -1; + } + + public GenomeLoc getSortedExonLoc(int offset) { + ArrayList exonsInReferenceOrder = exonInRefOrderCache != null ? exonInRefOrderCache : new ArrayList(exons); + if ( exonInRefOrderCache == null ) { + Collections.sort(exonsInReferenceOrder); + } + exonInRefOrderCache = exonsInReferenceOrder; + return exonsInReferenceOrder.get(offset); + } + /** Returns true if the specified interval 'that' overlaps with the full genomic interval of this transcript */ public boolean overlapsP (GenomeLoc that) { return getLocation().overlapsP(that); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java index f4633b2ce7..d9f16c3535 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java @@ -25,8 +25,8 @@ package org.broadinstitute.sting.utils.codecs.sampileup; +import org.broad.tribble.AbstractFeatureCodec; import org.broad.tribble.Feature; -import org.broad.tribble.FeatureCodec; import org.broad.tribble.exception.CodecLineParsingException; import org.broad.tribble.readers.LineReader; import org.broad.tribble.util.ParsingUtils; @@ -76,7 +76,7 @@ * @author Matt Hanna * @since 2009 */ -public class SAMPileupCodec implements FeatureCodec { +public class SAMPileupCodec extends AbstractFeatureCodec { // the number of tokens we expect to parse from a pileup line private static final int expectedTokenCount = 10; private static final char fldDelim = '\t'; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java index d4bdb5aa9b..0f2b94e630 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java @@ -27,8 +27,8 @@ import net.sf.samtools.Cigar; import net.sf.samtools.TextCigarCodec; import net.sf.samtools.util.StringUtil; +import org.broad.tribble.AbstractFeatureCodec; import org.broad.tribble.Feature; -import org.broad.tribble.FeatureCodec; import org.broad.tribble.exception.CodecLineParsingException; import org.broad.tribble.readers.LineReader; import org.broad.tribble.util.ParsingUtils; @@ -52,7 +52,7 @@ * @author Matt Hanna * @since 2009 */ -public class SAMReadCodec implements FeatureCodec { +public class 
SAMReadCodec extends AbstractFeatureCodec { /* SL-XBC:1:10:628:923#0 16 Escherichia_coli_K12 1 37 76M = 1 0 AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA B@>87<;A@?@957:>>@AA@B>@A9AB@B>@A@@@@@A;=AAB@BBBBBCBBBB@>A>:ABB@BAABCB=CA@CB */ // the number of tokens we expect to parse from a read line diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java index 4082a5597f..aa6d7d3457 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java @@ -88,7 +88,6 @@ public Object readHeader(LineReader reader) { try { boolean isFirst = true; while ((line = reader.readLine()) != null) { - System.out.println(line); if ( isFirst && ! line.startsWith(headerDelimiter) && ! line.startsWith(commentDelimiter)) { throw new UserException.MalformedFile("TableCodec file does not have a header"); } @@ -107,4 +106,7 @@ public Object readHeader(LineReader reader) { } return header; } + + public boolean canDecode(final String potentialInput) { return false; } + } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 0e0cb14bfe..e44c10f1f2 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -8,19 +8,16 @@ import org.broad.tribble.readers.LineReader; import org.broad.tribble.util.BlockCompressedInputStream; import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.gatk.refdata.SelfScopingFeatureCodec; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.*; import java.util.*; import java.util.zip.GZIPInputStream; -public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, VCFParser, SelfScopingFeatureCodec { +public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { protected final static Logger log = Logger.getLogger(VCFCodec.class); protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th column @@ -62,6 +59,29 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, protected Map stringCache = new HashMap(); + /** + * Creates a LazyParser for a LazyGenotypesContext to use to decode + * our genotypes only when necessary. We do this instead of eagarly + * decoding the genotypes just to turn around and reencode in the frequent + * case where we don't actually want to manipulate the genotypes + */ + class LazyVCFGenotypesParser implements LazyGenotypesContext.LazyParser { + final List alleles; + final String contig; + final int start; + + LazyVCFGenotypesParser(final List alleles, final String contig, final int start) { + this.alleles = alleles; + this.contig = contig; + this.start = start; + } + + @Override + public LazyGenotypesContext.LazyData parse(final Object data) { + //System.out.printf("Loading genotypes... 
%s:%d%n", contig, start); + return createGenotypeMap((String) data, alleles, contig, start); + } + } /** * @param reader the line reader to take header lines from @@ -71,13 +91,14 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, /** * create a genotype map + * * @param str the string * @param alleles the list of alleles * @param chr chrom * @param pos position * @return a mapping of sample name to genotype object */ - public abstract Map createGenotypeMap(String str, List alleles, String chr, int pos); + public abstract LazyGenotypesContext.LazyData createGenotypeMap(String str, List alleles, String chr, int pos); /** @@ -98,7 +119,7 @@ protected Object createHeader(List headerStrings, String line) { headerStrings.add(line); Set metaData = new TreeSet(); - Set auxTags = new LinkedHashSet(); + Set sampleNames = new LinkedHashSet(); // iterate over all the passed in strings for ( String str : headerStrings ) { if ( !str.startsWith(VCFHeader.METADATA_INDICATOR) ) { @@ -126,9 +147,9 @@ protected Object createHeader(List headerStrings, String line) { } while ( arrayIndex < strings.length ) - auxTags.add(strings[arrayIndex++]); + sampleNames.add(strings[arrayIndex++]); - if ( sawFormatTag && auxTags.size() == 0 ) + if ( sawFormatTag && sampleNames.size() == 0 ) throw new UserException.MalformedVCFHeader("The FORMAT field was provided but there is no genotype/sample data"); } else { @@ -152,7 +173,8 @@ protected Object createHeader(List headerStrings, String line) { } } - header = new VCFHeader(metaData, auxTags); + header = new VCFHeader(metaData, sampleNames); + header.buildVCFReaderMaps(new ArrayList(sampleNames)); return header; } @@ -162,7 +184,6 @@ protected Object createHeader(List headerStrings, String line) { * @return a feature, (not guaranteed complete) that has the correct start and stop */ public Feature decodeLoc(String line) { - lineNo++; // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null; @@ -253,29 +274,35 @@ protected static void generateException(String message, int lineNo) { * @return a variant context object */ private VariantContext parseVCFLine(String[] parts) { + VariantContextBuilder builder = new VariantContextBuilder(); + builder.source(getName()); + // increment the line count + // TODO -- because of the way the engine utilizes Tribble, we can parse a line multiple times (especially when + // TODO -- the first record is far along the contig) and the line counter can get out of sync lineNo++; // parse out the required fields - String contig = getCachedString(parts[0]); + final String chr = getCachedString(parts[0]); + builder.chr(chr); int pos = Integer.valueOf(parts[1]); - String id = null; + builder.start(pos); + if ( parts[2].length() == 0 ) generateException("The VCF specification requires a valid ID field"); else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) ) - id = VCFConstants.EMPTY_ID_FIELD; + builder.noID(); else - id = new String(parts[2]); + builder.id(parts[2]); + String ref = getCachedString(parts[3].toUpperCase()); String alts = getCachedString(parts[4].toUpperCase()); - Double qual = parseQual(parts[5]); - String filter = getCachedString(parts[6]); - String info = new String(parts[7]); + builder.log10PError(parseQual(parts[5])); + builder.filters(parseFilters(getCachedString(parts[6]))); + builder.attributes(parseInfo(parts[7])); // get our alleles, filters, and setup an attribute map List 
alleles = parseAlleles(ref, alts, lineNo); - Set filters = parseFilters(filter); - Map attributes = parseInfo(info, id); // find out our current location, and clip the alleles down to their minimum length int loc = pos; @@ -287,23 +314,30 @@ else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) ) loc = clipAlleles(pos, ref, alleles, newAlleles, lineNo); alleles = newAlleles; } + builder.stop(loc); + builder.alleles(alleles); // do we have genotyping data if (parts.length > NUM_STANDARD_FIELDS) { - attributes.put(VariantContext.UNPARSED_GENOTYPE_MAP_KEY, new String(parts[8])); - attributes.put(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY, this); + final LazyGenotypesContext.LazyParser lazyParser = new LazyVCFGenotypesParser(alleles, chr, pos); + final int nGenotypes = header.getGenotypeSamples().size(); + LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, parts[8], nGenotypes); + + // did we resort the sample names? If so, we need to load the genotype data + if ( !header.samplesWereAlreadySorted() ) + lazy.decode(); + + builder.genotypesNoValidation(lazy); } VariantContext vc = null; try { - vc = new VariantContext(name, contig, pos, loc, alleles, qual, filters, attributes, ref.getBytes()[0]); + builder.referenceBaseForIndel(ref.getBytes()[0]); + vc = builder.make(); } catch (Exception e) { generateException(e.getMessage()); } - // did we resort the sample names? If so, we need to load the genotype data - if ( !header.samplesWereAlreadySorted() ) - vc.getGenotypes(); return vc; } @@ -350,10 +384,9 @@ protected String getCachedString(String str) { /** * parse out the info fields * @param infoField the fields - * @param id the indentifier * @return a mapping of keys to objects */ - private Map parseInfo(String infoField, String id) { + private Map parseInfo(String infoField) { Map attributes = new HashMap(); if ( infoField.length() == 0 ) @@ -392,8 +425,6 @@ private Map parseInfo(String infoField, String id) { } } - if ( ! id.equals(VCFConstants.EMPTY_ID_FIELD) ) - attributes.put(VariantContext.ID_KEY, id); return attributes; } @@ -445,16 +476,16 @@ protected static List parseGenotypeAlleles(String GT, List allel protected static Double parseQual(String qualString) { // if we're the VCF 4 missing char, return immediately if ( qualString.equals(VCFConstants.MISSING_VALUE_v4)) - return VariantContext.NO_NEG_LOG_10PERROR; + return VariantContext.NO_LOG10_PERROR; Double val = Double.valueOf(qualString); // check to see if they encoded the missing qual score in VCF 3 style, with either the -1 or -1.0. check for val < 0 to save some CPU cycles if ((val < 0) && (Math.abs(val - VCFConstants.MISSING_QUALITY_v3_DOUBLE) < VCFConstants.VCF_ENCODING_EPSILON)) - return VariantContext.NO_NEG_LOG_10PERROR; + return VariantContext.NO_LOG10_PERROR; // scale and return the value - return val / 10.0; + return val / -10.0; } /** @@ -564,6 +595,11 @@ protected static int computeReverseClipping(List unclippedAlleles, Strin if ( a.isSymbolic() ) continue; + // we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong + // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine). 
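Stepping back to the parseQual change above: it is the crux of the NO_NEG_LOG_10PERROR to NO_LOG10_PERROR migration, since the VCF QUAL column is a phred-scaled error probability and the stored value is now log10(P_error) = QUAL / -10 rather than QUAL / 10. A small sketch of the round trip (plain doubles; the variable names are invented):

    double qual = 50.0;                           // VCF QUAL column value
    double log10PError = qual / -10.0;            // -5.0, what parseQual now returns
    double phredScaledQual = -10.0 * log10PError; // 50.0 again, matching what getPhredScaledQual should report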
+ if ( a.length() - clipping == 0 ) + return clipping - 1; + if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) stillClipping = false; else if ( ref.length() == clipping ) @@ -608,7 +644,7 @@ protected static int clipAlleles(int position, String ref, List unclippe return position+Math.max(refLength - 1,0); } - public final static boolean canDecodeFile(final File potentialInput, final String MAGIC_HEADER_LINE) { + public final static boolean canDecodeFile(final String potentialInput, final String MAGIC_HEADER_LINE) { try { return isVCFStream(new FileInputStream(potentialInput), MAGIC_HEADER_LINE) || isVCFStream(new GZIPInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE) || diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java index 0da7a100fd..ac1da7110c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java @@ -25,18 +25,10 @@ package org.broadinstitute.sting.utils.codecs.vcf; import net.sf.samtools.SAMSequenceDictionary; -import org.broad.tribble.Tribble; import org.broad.tribble.TribbleException; -import org.broad.tribble.index.DynamicIndexCreator; -import org.broad.tribble.index.Index; -import org.broad.tribble.index.IndexFactory; -import org.broad.tribble.util.LittleEndianOutputStream; import org.broad.tribble.util.ParsingUtils; -import org.broad.tribble.util.PositionalStream; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.*; import java.lang.reflect.Array; @@ -164,10 +156,10 @@ public void add(VariantContext vc) { throw new IllegalStateException("The VCF Header must be written before records can be added: " + getStreamName()); if ( doNotWriteGenotypes ) - vc = VariantContext.modifyGenotypes(vc, null); + vc = new VariantContextBuilder(vc).noGenotypes().make(); try { - vc = VariantContext.createVariantContextWithPaddedAlleles(vc, false); + vc = VariantContextUtils.createVariantContextWithPaddedAlleles(vc, false); super.add(vc); Map alleleMap = new HashMap(vc.getAlleles().size()); @@ -182,7 +174,7 @@ public void add(VariantContext vc) { mWriter.write(VCFConstants.FIELD_SEPARATOR); // ID - String ID = vc.hasID() ? 
vc.getID() : VCFConstants.EMPTY_ID_FIELD; + String ID = vc.getID(); mWriter.write(ID); mWriter.write(VCFConstants.FIELD_SEPARATOR); @@ -212,7 +204,7 @@ public void add(VariantContext vc) { mWriter.write(VCFConstants.FIELD_SEPARATOR); // QUAL - if ( !vc.hasNegLog10PError() ) + if ( !vc.hasLog10PError() ) mWriter.write(VCFConstants.MISSING_VALUE_v4); else mWriter.write(getQualValue(vc.getPhredScaledQual())); @@ -227,9 +219,6 @@ public void add(VariantContext vc) { Map infoFields = new TreeMap(); for ( Map.Entry field : vc.getAttributes().entrySet() ) { String key = field.getKey(); - if ( key.equals(VariantContext.ID_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY) ) - continue; - String outputValue = formatVCFField(field.getValue()); if ( outputValue != null ) infoFields.put(key, outputValue); @@ -237,9 +226,10 @@ public void add(VariantContext vc) { writeInfoString(infoFields); // FORMAT - if ( vc.hasAttribute(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) ) { + final GenotypesContext gc = vc.getGenotypes(); + if ( gc instanceof LazyGenotypesContext && ((LazyGenotypesContext)gc).getUnparsedGenotypeData() != null) { mWriter.write(VCFConstants.FIELD_SEPARATOR); - mWriter.write(vc.getAttributeAsString(VariantContext.UNPARSED_GENOTYPE_MAP_KEY, "")); + mWriter.write(((LazyGenotypesContext)gc).getUnparsedGenotypeData().toString()); } else { List genotypeAttributeKeys = new ArrayList(); if ( vc.hasGenotypes() ) { @@ -361,7 +351,7 @@ private void addGenotypeData(VariantContext vc, Map alleleMap, L // some exceptions if ( key.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) { - if ( Math.abs(g.getNegLog10PError() - Genotype.NO_NEG_LOG_10PERROR) < 1e-6) + if ( ! g.hasLog10PError() ) val = VCFConstants.MISSING_VALUE_v4; else { val = getQualValue(Math.min(g.getPhredScaledQual(), VCFConstants.MAX_GENOTYPE_QUAL)); @@ -451,11 +441,11 @@ private static List calcVCFGenotypeKeys(VariantContext vc) { boolean sawGoodGT = false; boolean sawGoodQual = false; boolean sawGenotypeFilter = false; - for ( Genotype g : vc.getGenotypes().values() ) { + for ( final Genotype g : vc.getGenotypes() ) { keys.addAll(g.getAttributes().keySet()); if ( g.isAvailable() ) sawGoodGT = true; - if ( g.hasNegLog10PError() ) + if ( g.hasLog10PError() ) sawGoodQual = true; if (g.isFiltered() && g.isCalled()) sawGenotypeFilter = true; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java index e5b1a2de58..b3329c708b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java @@ -3,12 +3,8 @@ import org.broad.tribble.TribbleException; import org.broad.tribble.readers.LineReader; import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; -import java.io.File; -import java.io.FileReader; import java.io.IOException; import java.util.*; @@ -112,19 +108,22 @@ protected Set parseFilters(String filterString) { /** * create a genotype map + * * @param str the string * @param alleles the list of alleles * @param chr chrom * @param pos position * @return a mapping of sample name to genotype object */ - public Map createGenotypeMap(String str, 
List alleles, String chr, int pos) { + public LazyGenotypesContext.LazyData createGenotypeMap(String str, List alleles, String chr, int pos) { if (genotypeParts == null) genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS]; int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR); + if ( nParts != genotypeParts.length ) + generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo); - Map genotypes = new LinkedHashMap(nParts); + ArrayList genotypes = new ArrayList(nParts); // get the format keys int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); @@ -139,9 +138,9 @@ public Map createGenotypeMap(String str, List alleles, for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) { int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); - double GTQual = VariantContext.NO_NEG_LOG_10PERROR; + double GTQual = VariantContext.NO_LOG10_PERROR; Set genotypeFilters = null; - Map gtAttributes = null; + Map gtAttributes = null; String sampleName = sampleNameIterator.next(); // check to see if the value list is longer than the key list, which is a problem @@ -150,7 +149,7 @@ public Map createGenotypeMap(String str, List alleles, int genotypeAlleleLocation = -1; if (nGTKeys >= 1) { - gtAttributes = new HashMap(nGTKeys - 1); + gtAttributes = new HashMap(nGTKeys - 1); for (int i = 0; i < nGTKeys; i++) { final String gtKey = new String(genotypeKeyArray[i]); @@ -180,7 +179,7 @@ public Map createGenotypeMap(String str, List alleles, // add it to the list try { - genotypes.put(sampleName, new Genotype(sampleName, + genotypes.add(new Genotype(sampleName, parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap), GTQual, genotypeFilters, @@ -191,11 +190,11 @@ public Map createGenotypeMap(String str, List alleles, } } - return genotypes; + return new LazyGenotypesContext.LazyData(genotypes, header.sampleNamesInOrder, header.sampleNameToOffset); } @Override - public boolean canDecode(final File potentialInput) { + public boolean canDecode(final String potentialInput) { return canDecodeFile(potentialInput, VCF3_MAGIC_HEADER); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index 42ea05355b..453155be7e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -3,12 +3,8 @@ import org.broad.tribble.TribbleException; import org.broad.tribble.readers.LineReader; import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; -import java.io.File; -import java.io.FileReader; import java.io.IOException; import java.util.*; @@ -141,17 +137,20 @@ public static Set parseFilters(final Map> /** * create a genotype map + * * @param str the string * @param alleles the list of alleles * @return a mapping of sample name to genotype object */ - public Map createGenotypeMap(String str, List alleles, String chr, int pos) { + public 
LazyGenotypesContext.LazyData createGenotypeMap(String str, List alleles, String chr, int pos) { if (genotypeParts == null) genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS]; int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR); + if ( nParts != genotypeParts.length ) + generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo); - Map genotypes = new LinkedHashMap(nParts); + ArrayList genotypes = new ArrayList(nParts); // get the format keys int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); @@ -166,9 +165,9 @@ public Map createGenotypeMap(String str, List alleles, for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) { int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); - double GTQual = VariantContext.NO_NEG_LOG_10PERROR; + double GTQual = VariantContext.NO_LOG10_PERROR; Set genotypeFilters = null; - Map gtAttributes = null; + Map gtAttributes = null; String sampleName = sampleNameIterator.next(); // check to see if the value list is longer than the key list, which is a problem @@ -177,7 +176,7 @@ public Map createGenotypeMap(String str, List alleles, int genotypeAlleleLocation = -1; if (nGTKeys >= 1) { - gtAttributes = new HashMap(nGTKeys - 1); + gtAttributes = new HashMap(nGTKeys - 1); for (int i = 0; i < nGTKeys; i++) { final String gtKey = new String(genotypeKeyArray[i]); @@ -209,23 +208,17 @@ public Map createGenotypeMap(String str, List alleles, // add it to the list try { - genotypes.put(sampleName, - new Genotype(sampleName, - GTalleles, - GTQual, - genotypeFilters, - gtAttributes, - phased)); + genotypes.add(new Genotype(sampleName, GTalleles, GTQual, genotypeFilters, gtAttributes, phased)); } catch (TribbleException e) { throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos); } } - return genotypes; + return new LazyGenotypesContext.LazyData(genotypes, header.sampleNamesInOrder, header.sampleNameToOffset); } @Override - public boolean canDecode(final File potentialInput) { + public boolean canDecode(final String potentialInput) { return canDecodeFile(potentialInput, VCF4_MAGIC_HEADER); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java index 66e11bc1e2..5c5df15ab8 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java @@ -2,6 +2,7 @@ import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.utils.variantcontext.Genotype; import java.util.*; @@ -38,6 +39,10 @@ public enum HEADER_FIELDS { // were the input samples sorted originally (or are we sorting them)? 
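Aside: the createGenotypeMap rewrites in VCF3Codec and VCFCodec above stop returning an eagerly-built sample-to-Genotype map and instead return a LazyGenotypesContext.LazyData bundle (genotypes in VCF column order plus the header's sample-name bookkeeping), so a record's FORMAT fields are only parsed when somebody actually asks for its genotypes; a pass-through writer can emit getUnparsedGenotypeData() untouched. A minimal sketch of this lazy-decoding pattern, with simplified stand-in types rather than the real GATK classes:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Simplified stand-in for the LazyGenotypesContext idea: hold the raw genotype text
// and only run the parser on first access. Hypothetical types, not the GATK classes.
public class LazyGenotypes {
    interface Parser {
        List<String> parse(String unparsed);
    }

    private final Parser parser;
    private String unparsedData;     // non-null until decoded
    private List<String> genotypes;  // filled in lazily

    public LazyGenotypes(Parser parser, String unparsedData) {
        this.parser = parser;
        this.unparsedData = unparsedData;
    }

    // A writer that streams records back out can take the raw text without
    // paying the parsing cost (the role of getUnparsedGenotypeData() above).
    public String getUnparsedData() {
        return unparsedData;
    }

    public List<String> get() {
        if (genotypes == null) {     // decode on first access only
            genotypes = parser.parse(unparsedData);
            unparsedData = null;
        }
        return genotypes;
    }

    public static void main(String[] args) {
        LazyGenotypes lazy = new LazyGenotypes(new Parser() {
            public List<String> parse(String unparsed) {
                System.out.println("decoding genotypes now");
                return new ArrayList<String>(Arrays.asList(unparsed.split("\t")));
            }
        }, "0/1:35\t1/1:99");
        System.out.println(lazy.getUnparsedData()); // prints the raw string, no decoding
        System.out.println(lazy.get());             // triggers "decoding genotypes now"
    }
}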
private boolean samplesWereAlreadySorted = true; + // cache for efficient conversion of VCF -> VariantContext + protected ArrayList sampleNamesInOrder = null; + protected HashMap sampleNameToOffset = null; + /** * create a VCF header, given a list of meta data and auxiliary tags @@ -69,6 +74,27 @@ public VCFHeader(Set metaData, Set genotypeSampleNames) { samplesWereAlreadySorted = ParsingUtils.isSorted(genotypeSampleNames); } + /** + * Tell this VCF header to use pre-calculated sample name ordering and the + * sample name -> offset map. This assumes that all VariantContext created + * using this header (i.e., read by the VCFCodec) will have genotypes + * occurring in the same order. + * + */ + + protected void buildVCFReaderMaps(List genotypeSampleNamesInAppearenceOrder) { + sampleNamesInOrder = new ArrayList(genotypeSampleNamesInAppearenceOrder.size()); + sampleNameToOffset = new HashMap(genotypeSampleNamesInAppearenceOrder.size()); + + int i = 0; + for ( final String name : genotypeSampleNamesInAppearenceOrder ) { + sampleNamesInOrder.add(name); + sampleNameToOffset.put(name, i++); + } + Collections.sort(sampleNamesInOrder); + } + + /** * Adds a header line to the header metadata. * diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFParser.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFParser.java deleted file mode 100755 index 1dba351e2a..0000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFParser.java +++ /dev/null @@ -1,25 +0,0 @@ -package org.broadinstitute.sting.utils.codecs.vcf; - -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; - -import java.util.List; -import java.util.Map; - - -/** - * All VCF codecs need to implement this interface so that we can perform lazy loading. 
- */ -public interface VCFParser { - - /** - * create a genotype map - * @param str the string - * @param alleles the list of alleles - * @param chr chrom - * @param pos position - * @return a mapping of sample name to genotype object - */ - public Map createGenotypeMap(String str, List alleles, String chr, int pos); - -} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java index 2d8421507a..5bd6a9b32a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java @@ -26,6 +26,8 @@ package org.broadinstitute.sting.utils.codecs.vcf; import org.apache.log4j.Logger; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -41,6 +43,18 @@ public class VCFUtils { */ private VCFUtils() { } + public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, List> rodBindings) { + // Collect the eval rod names + final Set names = new TreeSet(); + for ( final RodBinding evalRod : rodBindings ) + names.add(evalRod.getName()); + return getVCFHeadersFromRods(toolkit, names); + } + + public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit) { + return getVCFHeadersFromRods(toolkit, (Collection)null); + } + public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, Collection rodNames) { Map data = new HashMap(); diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index a208d2dc0f..a2816b58f8 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -100,6 +100,12 @@ public BadTmpDir(String message) { } } + public static class TooManyOpenFiles extends UserException { + public TooManyOpenFiles() { + super(String.format("There was a failure because there are too many files open concurrently; your system's open file handle limit is too small. See the unix ulimit command to adjust this limit")); + } + } + public static class ErrorWritingBamFile extends UserException { public ErrorWritingBamFile(String message) { super(String.format("An error occurred when trying to write the BAM file. Usually this happens when there is not enough space in the directory to which the data is being written (generally the temp directory) or when your system's open file handle limit is too small. To tell Java to use a bigger/better file system use -Djava.io.tmpdir=X on the command line. 
The exact error was %s", message)); @@ -178,11 +184,11 @@ public MalformedBAM(String source, String message) { public static class MalformedVCF extends UserException { public MalformedVCF(String message, String line) { - super(String.format("The provided VCF file is malformed at line %s: %s", line, message)); + super(String.format("The provided VCF file is malformed at approximately line %s: %s", line, message)); } public MalformedVCF(String message, int lineNo) { - super(String.format("The provided VCF file is malformed at line number %d: %s", lineNo, message)); + super(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message)); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java index ef0d9ca42b..b4ad81c02b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java @@ -25,10 +25,9 @@ package org.broadinstitute.sting.utils.gcf; import org.broadinstitute.sting.utils.codecs.vcf.StandardVCFWriter; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.*; import java.util.*; @@ -70,7 +69,7 @@ public GCF(final GCFHeaderBuilder GCFHeaderBuilder, final VariantContext vc, boo alleleOffsets[i+1] = GCFHeaderBuilder.encodeAllele(vc.getAlternateAllele(i)); } - qual = (float)vc.getNegLog10PError(); //qualToByte(vc.getPhredScaledQual()); + qual = (float)vc.getLog10PError(); //qualToByte(vc.getPhredScaledQual()); info = infoFieldString(vc, GCFHeaderBuilder); filterOffset = GCFHeaderBuilder.encodeString(StandardVCFWriter.getFilterString(vc)); @@ -140,26 +139,26 @@ public int write(DataOutputStream outputStream) throws IOException { public VariantContext decode(final String source, final GCFHeader header) { final String contig = header.getString(chromOffset); alleleMap = header.getAlleles(alleleOffsets); - double negLog10PError = qual; // QualityUtils.qualToErrorProb(qual); - Set filters = header.getFilters(filterOffset); - Map attributes = new HashMap(); - attributes.put("INFO", info); - Byte refPadByte = refPad == 0 ? null : refPad; - Map genotypes = decodeGenotypes(header); - - return new VariantContext(source, contig, start, stop, alleleMap, genotypes, negLog10PError, filters, attributes, refPadByte); + + VariantContextBuilder builder = new VariantContextBuilder(source, contig, start, stop, alleleMap); + builder.genotypes(decodeGenotypes(header)); + builder.log10PError(qual); + builder.filters(header.getFilters(filterOffset)); + builder.attribute("INFO", info); + builder.referenceBaseForIndel(refPad == 0 ? 
null : refPad); + return builder.make(); } - private Map decodeGenotypes(final GCFHeader header) { + private GenotypesContext decodeGenotypes(final GCFHeader header) { if ( genotypes.isEmpty() ) return VariantContext.NO_GENOTYPES; else { - Map map = new TreeMap(); + GenotypesContext map = GenotypesContext.create(genotypes.size()); for ( int i = 0; i < genotypes.size(); i++ ) { final String sampleName = header.getSample(i); final Genotype g = genotypes.get(i).decode(sampleName, header, this, alleleMap); - map.put(sampleName, g); + map.add(g); } return map; @@ -172,7 +171,7 @@ private List encodeGenotypes(final GCFHeaderBuilder GCFHeaderBuilde List genotypes = new ArrayList(nGenotypes); for ( int i = 0; i < nGenotypes; i++ ) genotypes.add(null); - for ( Genotype g : vc.getGenotypes().values() ) { + for ( Genotype g : vc.getGenotypes() ) { int i = GCFHeaderBuilder.encodeSample(g.getSampleName()); genotypes.set(i, new GCFGenotype(GCFHeaderBuilder, alleleMap, g)); } @@ -192,8 +191,6 @@ private final String infoFieldString(VariantContext vc, final GCFHeaderBuilder G boolean first = true; for ( Map.Entry field : vc.getAttributes().entrySet() ) { String key = field.getKey(); - if ( key.equals(VariantContext.ID_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY) ) - continue; int stringIndex = GCFHeaderBuilder.encodeString(key); String outputValue = StandardVCFWriter.formatVCFField(field.getValue()); if ( outputValue != null ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java index dd1fb091cb..f8fdd9291b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java @@ -84,14 +84,14 @@ protected int sizeInBytes() { public Genotype decode(final String sampleName, final GCFHeader header, GCF GCF, List alleleIndex) { final List alleles = decodeAlleles(gt, alleleIndex); - final double negLog10PError = gq / 10.0; + final double log10PError = gq / -10.0; final Set filters = Collections.emptySet(); final Map attributes = new HashMap(); attributes.put("DP", dp); attributes.put("AD", ad); attributes.put("PL", pl); - return new Genotype(sampleName, alleles, negLog10PError, filters, attributes, false); + return new Genotype(sampleName, alleles, log10PError, filters, attributes, false); } private static int encodeAlleles(List gtList, List allAlleles) { diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index f0e164c875..f8655f74a5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -56,29 +56,31 @@ public static List parseIntervalArguments(GenomeLocParser parser, Lis public static List parseIntervalArguments(GenomeLocParser parser, String arg) { List rawIntervals = new ArrayList(); // running list of raw GenomeLocs - // separate argument on semicolon first - for (String fileOrInterval : arg.split(";")) { - // if any argument is 'unmapped', "parse" it to a null entry. A null in this case means 'all the intervals with no alignment data'. 
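Aside: the GCFGenotype.decode change above (log10PError = gq / -10.0, where the old code computed negLog10PError = gq / 10.0) is one instance of this patch's sign-convention flip from negLog10PError to log10PError. GQ is phred-scaled, GQ = -10 * log10(P(error)), so the log10 error probability itself is negative. A small self-contained example of the conversions, illustrative rather than GATK code:

// Worked example of the phred <-> log10 error-probability conversions implied above.
public class QualConversion {
    // phred scale: Q = -10 * log10(P(error)), so log10(P(error)) = Q / -10
    static double phredToLog10PError(double phredQual) {
        return phredQual / -10.0;
    }

    static double log10PErrorToPhred(double log10PError) {
        return -10.0 * log10PError;
    }

    static double log10PErrorToProb(double log10PError) {
        return Math.pow(10.0, log10PError);
    }

    public static void main(String[] args) {
        double gq = 30.0;                                    // a typical genotype quality
        double log10PError = phredToLog10PError(gq);         // new convention
        System.out.println(log10PError);                     // -3.0 (old negLog10PError was +3.0)
        System.out.println(log10PErrorToProb(log10PError));  // 0.001 = P(genotype is wrong)
        System.out.println(log10PErrorToPhred(log10PError)); // 30.0, back to phred scale
    }
}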
- if (isUnmapped(fileOrInterval)) - rawIntervals.add(GenomeLoc.UNMAPPED); - // if it's a file, add items to raw interval list - else if (isIntervalFile(fileOrInterval)) { - try { - rawIntervals.addAll(intervalFileToList(parser, fileOrInterval)); - } - catch ( UserException.MalformedGenomeLoc e ) { - throw e; - } - catch ( Exception e ) { - throw new UserException.MalformedFile(fileOrInterval, "Interval file could not be parsed in any supported format.", e); - } - } + if ( arg.indexOf(';') != -1 ) { + throw new UserException.BadArgumentValue("-L " + arg, "The legacy -L \"interval1;interval2\" syntax " + + "is no longer supported. Please use one -L argument for each " + + "interval or an interval file instead."); + } - // otherwise treat as an interval -> parse and add to raw interval list - else { - rawIntervals.add(parser.parseGenomeLoc(fileOrInterval)); + // if any argument is 'unmapped', "parse" it to a null entry. A null in this case means 'all the intervals with no alignment data'. + if (isUnmapped(arg)) + rawIntervals.add(GenomeLoc.UNMAPPED); + // if it's a file, add items to raw interval list + else if (isIntervalFile(arg)) { + try { + rawIntervals.addAll(intervalFileToList(parser, arg)); + } + catch ( UserException.MalformedGenomeLoc e ) { + throw e; + } + catch ( Exception e ) { + throw new UserException.MalformedFile(arg, "Interval file could not be parsed in any supported format.", e); } } + // otherwise treat as an interval -> parse and add to raw interval list + else { + rawIntervals.add(parser.parseGenomeLoc(arg)); + } return rawIntervals; } @@ -233,8 +235,12 @@ public static GenomeLocSortedSet sortAndMergeIntervals(GenomeLocParser parser, L * * Returns a null string if there are no differences, otherwise returns a string describing the difference * (useful for UnitTests). Assumes both lists are sorted. + * + * @param masterArg sorted master genome locs + * @param testArg sorted test genome locs + * @return null string if there are no differences, otherwise a string describing the difference */ - public static final String equateIntervals(List masterArg, List testArg) { + public static String equateIntervals(List masterArg, List testArg) { LinkedList master = new LinkedList(masterArg); LinkedList test = new LinkedList(testArg); @@ -317,23 +323,6 @@ public static Map getContigSizes(File reference) { return lengths; } - /** - * Counts the number of interval files an interval list can be split into using scatterIntervalArguments. - * @param locs The genome locs. - * @return The maximum number of parts the intervals can be split into. - */ - public static int countContigIntervals(List locs) { - int maxFiles = 0; - String contig = null; - for (GenomeLoc loc: locs) { - if (contig == null || !contig.equals(loc.getContig())) { - maxFiles++; - contig = loc.getContig(); - } - } - return maxFiles; - } - - /** * Splits an interval list into multiple files. * @param fileHeader The sam file header. 
@@ -373,7 +362,6 @@ public static void scatterContigIntervals(SAMFileHeader fileHeader, List> splitIntervalsToSubLists(List locs, List splits) { - int locIndex = 1; int start = 0; List> sublists = new ArrayList>(splits.size()); for (Integer stop: splits) { @@ -465,7 +453,7 @@ public static List> splitLocusIntervals(List locs, in @Requires({"remaining != null", "!remaining.isEmpty()", "idealSplitSize > 0"}) @Ensures({"result != null"}) - final static SplitLocusRecursive splitLocusIntervals1(LinkedList remaining, long idealSplitSize) { + static SplitLocusRecursive splitLocusIntervals1(LinkedList remaining, long idealSplitSize) { final List split = new ArrayList(); long size = 0; @@ -579,10 +567,101 @@ public static List mergeIntervalLocations(final List raw, } } - public static final long intervalSize(final List locs) { + public static long intervalSize(final List locs) { long size = 0; for ( final GenomeLoc loc : locs ) size += loc.size(); return size; } + + public static void writeFlankingIntervals(File reference, File inputIntervals, File flankingIntervals, int basePairs) { + ReferenceDataSource referenceDataSource = new ReferenceDataSource(reference); + GenomeLocParser parser = new GenomeLocParser(referenceDataSource.getReference()); + List originalList = intervalFileToList(parser, inputIntervals.getAbsolutePath()); + + if (originalList.isEmpty()) + throw new UserException.MalformedFile(inputIntervals, "File contains no intervals"); + + List flankingList = getFlankingIntervals(parser, originalList, basePairs); + + if (flankingList.isEmpty()) + throw new UserException.MalformedFile(inputIntervals, "Unable to produce any flanks for the intervals"); + + SAMFileHeader samFileHeader = new SAMFileHeader(); + samFileHeader.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()); + IntervalList intervalList = new IntervalList(samFileHeader); + int i = 0; + for (GenomeLoc loc: flankingList) + intervalList.add(toInterval(loc, ++i)); + intervalList.write(flankingIntervals); + } + + /** + * Returns a list of intervals between the passed in locs. Does not extend UNMAPPED locs. 
+ * @param parser A genome loc parser for creating the new intervals + * @param locs Original genome locs + * @param basePairs Number of base pairs on each side of loc + * @return The list of intervals between the locs + */ + public static List getFlankingIntervals(final GenomeLocParser parser, final List locs, final int basePairs) { + List sorted = sortAndMergeIntervals(parser, locs, IntervalMergingRule.ALL).toList(); + + if (sorted.size() == 0) + return Collections.emptyList(); + + LinkedHashMap> locsByContig = splitByContig(sorted); + List expanded = new ArrayList(); + for (String contig: locsByContig.keySet()) { + List contigLocs = locsByContig.get(contig); + int contigLocsSize = contigLocs.size(); + + GenomeLoc startLoc, stopLoc; + + // Create loc at start of the list + startLoc = parser.createGenomeLocAtStart(contigLocs.get(0), basePairs); + if (startLoc != null) + expanded.add(startLoc); + + // Create locs between each loc[i] and loc[i+1] + for (int i = 0; i < contigLocsSize - 1; i++) { + stopLoc = parser.createGenomeLocAtStop(contigLocs.get(i), basePairs); + startLoc = parser.createGenomeLocAtStart(contigLocs.get(i + 1), basePairs); + if (stopLoc.getStop() + 1 >= startLoc.getStart()) { + // NOTE: This is different than GenomeLoc.merge() + // merge() returns a loc which covers the entire range of stop and start, + // possibly returning positions inside loc(i) or loc(i+1) + // We want to make sure that the start of the stopLoc is used, and the stop of the startLoc + GenomeLoc merged = parser.createGenomeLoc( + stopLoc.getContig(), stopLoc.getStart(), startLoc.getStop()); + expanded.add(merged); + } else { + expanded.add(stopLoc); + expanded.add(startLoc); + } + } + + // Create loc at the end of the list + stopLoc = parser.createGenomeLocAtStop(contigLocs.get(contigLocsSize - 1), basePairs); + if (stopLoc != null) + expanded.add(stopLoc); + } + return expanded; + } + + private static LinkedHashMap> splitByContig(List sorted) { + LinkedHashMap> splits = new LinkedHashMap>(); + GenomeLoc last = null; + List contigLocs = null; + for (GenomeLoc loc: sorted) { + if (GenomeLoc.isUnmapped(loc)) + continue; + if (last == null || !last.onSameContig(loc)) { + contigLocs = new ArrayList(); + splits.put(loc.getContig(), contigLocs); + } + contigLocs.add(loc); + last = loc; + } + return splits; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java b/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java index 94c2d4c0bd..b3fdb93d30 100644 --- a/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java @@ -79,7 +79,7 @@ public static File tempDir(String prefix, String suffix, File tempDirParent) { tempDirParent = FileUtils.getTempDirectory(); if (!tempDirParent.exists() && !tempDirParent.mkdirs()) throw new UserException.BadTmpDir("Could not create temp directory: " + tempDirParent); - File temp = File.createTempFile(prefix + "-", suffix, tempDirParent); + File temp = File.createTempFile(prefix, suffix, tempDirParent); if (!temp.delete()) throw new UserException.BadTmpDir("Could not delete sub file: " + temp.getAbsolutePath()); if (!temp.mkdir()) @@ -362,4 +362,27 @@ public static void writeResource(Resource resource, File file) { org.apache.commons.io.IOUtils.closeQuietly(outputStream); } } + + /** + * Returns a LineIterator over the file at the given path, throwing a UserException if the file cannot be read. 
+ * @param path File path + * @return LineIterator + */ + public static LineIterator lineIterator(String path) { + return lineIterator(new File(path)); + } + + /** + * Returns a LineIterator over the file, throwing a UserException if the file cannot be read. + * @param file File + * @return LineIterator + */ + public static LineIterator lineIterator(File file) { + try { + return FileUtils.lineIterator(file); + } catch (IOException e) { + throw new UserException.CouldNotReadInputFile(file, e); + } + + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index daf6606ef7..2d13d6e59d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -95,11 +95,12 @@ else if ( read.getAlignmentStart() > pileupElement.read.getAlignmentStart() ) // -------------------------------------------------------------------------- public boolean isReducedRead() { - return ((GATKSAMRecord)read).isReducedRead(); + return read.isReducedRead(); } public int getRepresentativeCount() { - return isReducedRead() ? ((GATKSAMRecord)read).getReducedCount(offset) : 1; + // TODO -- if we ever decide to reduce the representation of deletions then this will need to be fixed + return (!isDeletion() && isReducedRead()) ? read.getReducedCount(offset) : 1; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index 26fabade27..cedd56bdfb 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -200,6 +200,48 @@ public static GATKSAMRecord createArtificialRead( SAMFileHeader header, String n return rec; } + /** + * Create an artificial read based on the parameters + * + * @param header the SAM header to associate the read with + * @param name the name of the read + * @param refIndex the reference index, i.e. 
what chromosome to associate it with + * @param alignmentStart where to start the alignment + * @param bases the sequence of the read + * @param qual the qualities of the read + * @param cigar the cigar string of the read + * + * @return the artificial read + */ + public static GATKSAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual, String cigar ) { + GATKSAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases, qual); + rec.setCigarString(cigar); + return rec; + } + + /** + * Create an artificial read with the following default parameters : + * header: + * numberOfChromosomes = 1 + * startingChromosome = 1 + * chromosomeSize = 1000000 + * read: + * name = "default_read" + * refIndex = 0 + * alignmentStart = 1 + * + * @param bases the sequence of the read + * @param qual the qualities of the read + * @param cigar the cigar string of the read + * + * @return the artificial read + */ + public static GATKSAMRecord createArtificialRead( byte[] bases, byte[] qual, String cigar ) { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 1, bases, qual, cigar); + } + + public final static List createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) { GATKSAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen); GATKSAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index ede75817a0..96713edc26 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.utils.sam; +import com.google.java.contract.Ensures; import net.sf.samtools.*; import org.broadinstitute.sting.utils.NGSPlatform; @@ -43,7 +44,8 @@ * */ public class GATKSAMRecord extends BAMRecord { - public static final String REDUCED_READ_QUALITY_TAG = "RR"; + public static final String REDUCED_READ_CONSENSUS_TAG = "RR"; + // the SAMRecord data we're caching private String mReadString = null; private GATKSAMReadGroupRecord mReadGroup = null; @@ -83,8 +85,13 @@ public GATKSAMRecord(final SAMRecord read) { read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getInferredInsertSize(), - new byte[]{}); - super.clearAttributes(); + null); + SAMReadGroupRecord samRG = read.getReadGroup(); + clearAttributes(); + if (samRG != null) { + GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(samRG); + setReadGroup(rg); + } } public GATKSAMRecord(final SAMFileHeader header, @@ -131,6 +138,21 @@ public GATKSAMReadGroupRecord getReadGroup() { return mReadGroup; } + @Override + public int hashCode() { + return super.hashCode(); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + + if (!(o instanceof GATKSAMRecord)) return false; + + // note that we do not consider the GATKSAMRecord internal state at all + return super.equals(o); + } + /** * Efficient caching accessor that returns the GATK NGSPlatform of this read * @return @@ -142,17 +164,16 @@ public NGSPlatform getNGSPlatform() { public void setReadGroup( final GATKSAMReadGroupRecord readGroup ) { 
mReadGroup = readGroup; retrievedReadGroup = true; + setAttribute("RG", mReadGroup.getId()); // todo -- this should be standardized, but we don't have access to SAMTagUtils! } - // - // - // Reduced read functions - // - // + /////////////////////////////////////////////////////////////////////////////// + // *** ReduceReads functions ***// + /////////////////////////////////////////////////////////////////////////////// public byte[] getReducedReadCounts() { if ( ! retrievedReduceReadCounts ) { - reducedReadCounts = getByteArrayAttribute(REDUCED_READ_QUALITY_TAG); + reducedReadCounts = getByteArrayAttribute(REDUCED_READ_CONSENSUS_TAG); retrievedReduceReadCounts = true; } @@ -164,9 +185,17 @@ public boolean isReducedRead() { } public final byte getReducedCount(final int i) { - return getReducedReadCounts()[i]; + byte firstCount = getReducedReadCounts()[0]; + byte offsetCount = getReducedReadCounts()[i]; + return (i==0) ? firstCount : (byte) Math.min(firstCount + offsetCount, Byte.MAX_VALUE); } + + /////////////////////////////////////////////////////////////////////////////// + // *** GATKSAMRecord specific methods ***// + /////////////////////////////////////////////////////////////////////////////// + + /** * Checks whether an attribute has been set for the given key. * @@ -220,18 +249,113 @@ public Object getTemporaryAttribute(Object key) { return null; } - @Override - public int hashCode() { - return super.hashCode(); + /** + * Checks whether the read has any bases. + * + * Empty reads can be dangerous as they may be missing the cigar string, the read name and + * other attributes. + * + * @return true if the read has no bases + */ + public boolean isEmpty() { + return super.getReadBases() == null || super.getReadLength() == 0; } - @Override - public boolean equals(Object o) { - if (this == o) return true; + /** + * Clears all attributes except ReadGroup of the read. + */ + public void simplify () { + GATKSAMReadGroupRecord rg = getReadGroup(); + this.clearAttributes(); + setReadGroup(rg); + } - if (!(o instanceof GATKSAMRecord)) return false; + /** + * Calculates the reference coordinate for the beginning of the read taking into account soft clips but not hard clips. + * + * Note: getUnclippedStart() adds soft and hard clips; this function only adds soft clips. + * + * @return the unclipped start of the read taking soft clips (but not hard clips) into account + */ + @Ensures({"result >= getUnclippedStart()", "result <= getUnclippedEnd() || ReadUtils.readIsEntirelyInsertion(this)"}) + public int getSoftStart() { + int start = this.getUnclippedStart(); + for (CigarElement cigarElement : this.getCigar().getCigarElements()) { + if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) + start += cigarElement.getLength(); + else + break; + } + return start; + } - // note that we do not consider the GATKSAMRecord internal state at all - return super.equals(o); + /** + * Calculates the reference coordinate for the end of the read taking into account soft clips but not hard clips. + * + * Note: getUnclippedEnd() adds soft and hard clips; this function only adds soft clips. 
+ * + * @return the unclipped end of the read taking soft clips (but not hard clips) into account + */ + @Ensures({"result >= getUnclippedStart()", "result <= getUnclippedEnd() || ReadUtils.readIsEntirelyInsertion(this)"}) + public int getSoftEnd() { + int stop = this.getUnclippedStart(); + + if (ReadUtils.readIsEntirelyInsertion(this)) + return stop; + + int shift = 0; + CigarOperator lastOperator = null; + for (CigarElement cigarElement : this.getCigar().getCigarElements()) { + stop += shift; + lastOperator = cigarElement.getOperator(); + if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP || cigarElement.getOperator() == CigarOperator.HARD_CLIP) + shift = cigarElement.getLength(); + else + shift = 0; + } + return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; } + + /** + * Creates an empty GATKSAMRecord with the read's header, read group and mate + * information, but empty (not-null) fields: + * - Cigar String + * - Read Bases + * - Base Qualities + * + * Use this method if you want to create a new empty GATKSAMRecord based on + * another GATKSAMRecord + * + * @param read + * @return + */ + public static GATKSAMRecord emptyRead(GATKSAMRecord read) { + GATKSAMRecord emptyRead = new GATKSAMRecord(read.getHeader(), + read.getReferenceIndex(), + 0, + (short) 0, + (short) 0, + 0, + 0, + read.getFlags(), + 0, + read.getMateReferenceIndex(), + read.getMateAlignmentStart(), + read.getInferredInsertSize(), + null); + + emptyRead.setCigarString(""); + emptyRead.setReadBases(new byte[0]); + emptyRead.setBaseQualities(new byte[0]); + + SAMReadGroupRecord samRG = read.getReadGroup(); + emptyRead.clearAttributes(); + if (samRG != null) { + GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(samRG); + emptyRead.setReadGroup(rg); + } + + return emptyRead; + } + } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index e125b8c80e..f2e54713f3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -43,574 +43,212 @@ * @version 0.1 */ public class ReadUtils { - private ReadUtils() { } - - // ---------------------------------------------------------------------------------------------------- - // - // Reduced read utilities - // - // ---------------------------------------------------------------------------------------------------- - - // ---------------------------------------------------------------------------------------------------- - // - // General utilities - // - // ---------------------------------------------------------------------------------------------------- - public static SAMFileHeader copySAMFileHeader(SAMFileHeader toCopy) { - SAMFileHeader copy = new SAMFileHeader(); - - copy.setSortOrder(toCopy.getSortOrder()); - copy.setGroupOrder(toCopy.getGroupOrder()); - copy.setProgramRecords(toCopy.getProgramRecords()); - copy.setReadGroups(toCopy.getReadGroups()); - copy.setSequenceDictionary(toCopy.getSequenceDictionary()); - - for (Map.Entry e : toCopy.getAttributes()) - copy.setAttribute(e.getKey(), e.getValue()); - - return copy; + private ReadUtils() { } - public static SAMFileWriter createSAMFileWriterWithCompression(SAMFileHeader header, boolean presorted, String file, int compression) { - if (file.endsWith(".bam")) - return new SAMFileWriterFactory().makeBAMWriter(header, presorted, new File(file), 
compression); - return new SAMFileWriterFactory().makeSAMOrBAMWriter(header, presorted, new File(file)); - } + private static int DEFAULT_ADAPTOR_SIZE = 100; - public static boolean isPlatformRead(SAMRecord read, String name) { - SAMReadGroupRecord readGroup = read.getReadGroup(); - if (readGroup != null) { - Object readPlatformAttr = readGroup.getAttribute("PL"); - if (readPlatformAttr != null) - return readPlatformAttr.toString().toUpperCase().contains(name); - } - return false; + /** + * A marker to tell which end of the read has been clipped + */ + public enum ClippingTail { + LEFT_TAIL, + RIGHT_TAIL } - // --------------------------------------------------------------------------------------------------------- - // - // utilities for detecting overlapping reads - // - // --------------------------------------------------------------------------------------------------------- - /** - * Detects read pairs where the reads are so long relative to the over fragment size that they are - * reading into each other's adaptors. - * - * Normally, fragments are sufficiently far apart that reads aren't reading into each other. - * - * |--------------------> first read - * <--------------------| second read - * - * Sometimes, mostly due to lab errors or constraints, fragment library are made too short relative to the - * length of the reads. For example, it's possible to have 76bp PE reads with 125 bp inserts, so that ~25 bp of each - * read overlaps with its mate. - * - * |--------OOOOOOOOOOOO> first read - * first read - * + * Note: This is not being used right now, but can be useful in the future */ + private static final Map readFlagNames = new HashMap(); - public enum OverlapType { NOT_OVERLAPPING, IN_ADAPTOR} + static { + readFlagNames.put(0x1, "Paired"); + readFlagNames.put(0x2, "Proper"); + readFlagNames.put(0x4, "Unmapped"); + readFlagNames.put(0x8, "MateUnmapped"); + readFlagNames.put(0x10, "Forward"); + //readFlagNames.put(0x20, "MateForward"); + readFlagNames.put(0x40, "FirstOfPair"); + readFlagNames.put(0x80, "SecondOfPair"); + readFlagNames.put(0x100, "NotPrimary"); + readFlagNames.put(0x200, "NON-PF"); + readFlagNames.put(0x400, "Duplicate"); + } /** * This enum represents all the different ways in which a read can overlap an interval. - * + *
* NO_OVERLAP_CONTIG: * read and interval are in different contigs. - * + *
* NO_OVERLAP_LEFT: * the read does not overlap the interval. - * - * |----------------| (interval) - * <----------------> (read) - * + *
+ * |----------------| (interval) + * <----------------> (read) + *
* NO_OVERLAP_RIGHT: * the read does not overlap the interval. - * - * |----------------| (interval) - * <----------------> (read) - * + *
+ * |----------------| (interval) + * <----------------> (read) + *
* OVERLAP_LEFT: * the read starts before the beginning of the interval but ends inside of it - * - * |----------------| (interval) - * <----------------> (read) - * + *
+ * |----------------| (interval) + * <----------------> (read) + *
* OVERLAP_RIGHT: * the read starts inside the interval but ends outside of it - * - * |----------------| (interval) - * <----------------> (read) - * + *
+ * |----------------| (interval) + * <----------------> (read) + *
* OVERLAP_LEFT_AND_RIGHT: * the read starts before the interval and ends after the interval - * - * |-----------| (interval) - * <-------------------> (read) - * + *
+ * |-----------| (interval) + * <-------------------> (read) + *
* OVERLAP_CONTAINED: * the read starts and ends inside the interval - * - * |----------------| (interval) - * <--------> (read) + *
+ * |----------------| (interval) + * <--------> (read) */ - public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED} - - /** - * God, there's a huge information asymmetry in SAM format: - * - * s1 e1 - * |-----------------------> [record in hand] - * s2 - * <-----------------------| - * - * s1, e1, and s2 are all in the record. From isize we can can compute e2 as s1 + isize + 1 - * - * s2 - * |-----------------------> - * s1 e1 - * <-----------------------| [record in hand] - * - * Here we cannot calculate e2 since the record carries s2 and e1 + isize is s2 now! - * - * This makes the following code a little nasty, since we can only detect if a base is in the adaptor, but not - * if it overlaps the read. - * - * @param read - * @param basePos - * @param adaptorLength - * @return - */ - public static OverlapType readPairBaseOverlapType(final SAMRecord read, long basePos, final int adaptorLength) { - OverlapType state = OverlapType.NOT_OVERLAPPING; - - Pair adaptorBoundaries = getAdaptorBoundaries(read, adaptorLength); - - if ( adaptorBoundaries != null ) { // we're not an unmapped pair -- cannot filter out - - boolean inAdapator = basePos >= adaptorBoundaries.first && basePos <= adaptorBoundaries.second; - - if ( inAdapator ) { - state = OverlapType.IN_ADAPTOR; - //System.out.printf("baseOverlapState: %50s negStrand=%b base=%d start=%d stop=%d, adaptorStart=%d adaptorEnd=%d isize=%d => %s%n", - // read.getReadName(), read.getReadNegativeStrandFlag(), basePos, read.getAlignmentStart(), read.getAlignmentEnd(), adaptorBoundaries.first, adaptorBoundaries.second, read.getInferredInsertSize(), state); - } - } - - return state; - } - - private static Pair getAdaptorBoundaries(SAMRecord read, int adaptorLength) { - int isize = read.getInferredInsertSize(); - if ( isize == 0 ) - return null; // don't worry about unmapped pairs - - int adaptorStart, adaptorEnd; - - if ( read.getReadNegativeStrandFlag() ) { - // we are on the negative strand, so our mate is on the positive strand - int mateStart = read.getMateAlignmentStart(); - adaptorStart = mateStart - adaptorLength - 1; - adaptorEnd = mateStart - 1; - } else { - // we are on the positive strand, so our mate is on the negative strand - int mateEnd = read.getAlignmentStart() + isize - 1; - adaptorStart = mateEnd + 1; - adaptorEnd = mateEnd + adaptorLength; - } - - return new Pair(adaptorStart, adaptorEnd); + public enum ReadAndIntervalOverlap { + NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED } /** - * - * @param read original SAM record - * @param adaptorLength length of adaptor sequence - * @return a new read with adaptor sequence hard-clipped out or null if read is fully clipped + * Creates a SAMFileWriter with the given compression level if you request a bam file. Creates a regular + * SAMFileWriter without compression otherwise. + * + * @param header + * @param presorted + * @param file + * @param compression + * @return a SAMFileWriter with the compression level if it is a bam. 
*/ - public static GATKSAMRecord hardClipAdaptorSequence(final GATKSAMRecord read, int adaptorLength) { - - Pair adaptorBoundaries = getAdaptorBoundaries(read, adaptorLength); - GATKSAMRecord result = (GATKSAMRecord)read; - - if ( adaptorBoundaries != null ) { - if ( read.getReadNegativeStrandFlag() && adaptorBoundaries.second >= read.getAlignmentStart() && adaptorBoundaries.first < read.getAlignmentEnd() ) - result = hardClipStartOfRead(read, adaptorBoundaries.second); - else if ( !read.getReadNegativeStrandFlag() && adaptorBoundaries.first <= read.getAlignmentEnd() ) - result = hardClipEndOfRead(read, adaptorBoundaries.first); - } - - return result; - } - - // return true if the read needs to be completely clipped - private static GATKSAMRecord hardClipStartOfRead(GATKSAMRecord oldRec, int stopPosition) { - - if ( stopPosition >= oldRec.getAlignmentEnd() ) { - // BAM representation issue -- we can't clip away all bases in a read, just leave it alone and let the filter deal with it - //System.out.printf("Entire read needs to be clipped: %50s %n", read.getReadName()); - return null; - } - - GATKSAMRecord read; - try { - read = (GATKSAMRecord)oldRec.clone(); - } catch (Exception e) { - return null; - } - - //System.out.printf("Clipping start of read: %50s start=%d adaptorEnd=%d isize=%d %n", - // read.getReadName(), read.getAlignmentStart(), stopPosition, read.getInferredInsertSize()); - - Cigar oldCigar = read.getCigar(); - LinkedList newCigarElements = new LinkedList(); - int currentPos = read.getAlignmentStart(); - int basesToClip = 0; - int basesAlreadyClipped = 0; - - for ( CigarElement ce : oldCigar.getCigarElements() ) { - - if ( currentPos > stopPosition) { - newCigarElements.add(ce); - continue; - } - - int elementLength = ce.getLength(); - switch ( ce.getOperator() ) { - case M: - for (int i = 0; i < elementLength; i++, currentPos++, basesToClip++) { - if ( currentPos > stopPosition ) { - newCigarElements.add(new CigarElement(elementLength - i, CigarOperator.M)); - break; - } - } - break; - case I: - case S: - basesToClip += elementLength; - break; - case D: - case N: - currentPos += elementLength; - break; - case H: - basesAlreadyClipped += elementLength; - case P: - break; - default: throw new ReviewedStingException("The " + ce.getOperator() + " cigar element is not currently supported"); - } - - } - - // copy over the unclipped bases - final byte[] bases = read.getReadBases(); - final byte[] quals = read.getBaseQualities(); - int newLength = bases.length - basesToClip; - byte[] newBases = new byte[newLength]; - byte[] newQuals = new byte[newLength]; - System.arraycopy(bases, basesToClip, newBases, 0, newLength); - System.arraycopy(quals, basesToClip, newQuals, 0, newLength); - read.setReadBases(newBases); - read.setBaseQualities(newQuals); - - // now add a CIGAR element for the clipped bases - newCigarElements.addFirst(new CigarElement(basesToClip + basesAlreadyClipped, CigarOperator.H)); - Cigar newCigar = new Cigar(newCigarElements); - read.setCigar(newCigar); - - // adjust the start accordingly - read.setAlignmentStart(stopPosition + 1); - - return read; - } - - private static GATKSAMRecord hardClipEndOfRead(GATKSAMRecord oldRec, int startPosition) { - - if ( startPosition <= oldRec.getAlignmentStart() ) { - // BAM representation issue -- we can't clip away all bases in a read, just leave it alone and let the filter deal with it - //System.out.printf("Entire read needs to be clipped: %50s %n", read.getReadName()); - return null; - } - - GATKSAMRecord read; - try { - read = 
(GATKSAMRecord)oldRec.clone(); - } catch (Exception e) { - return null; - } - - //System.out.printf("Clipping end of read: %50s adaptorStart=%d end=%d isize=%d %n", - // read.getReadName(), startPosition, read.getAlignmentEnd(), read.getInferredInsertSize()); - - Cigar oldCigar = read.getCigar(); - LinkedList newCigarElements = new LinkedList(); - int currentPos = read.getAlignmentStart(); - int basesToKeep = 0; - int basesAlreadyClipped = 0; - - for ( CigarElement ce : oldCigar.getCigarElements() ) { - - int elementLength = ce.getLength(); - - if ( currentPos >= startPosition ) { - if ( ce.getOperator() == CigarOperator.H ) - basesAlreadyClipped += elementLength; - continue; - } - - switch ( ce.getOperator() ) { - case M: - for (int i = 0; i < elementLength; i++, currentPos++, basesToKeep++) { - if ( currentPos == startPosition ) { - newCigarElements.add(new CigarElement(i, CigarOperator.M)); - break; - } - } - - if ( currentPos != startPosition ) - newCigarElements.add(ce); - break; - case I: - case S: - newCigarElements.add(ce); - basesToKeep += elementLength; - break; - case D: - case N: - newCigarElements.add(ce); - currentPos += elementLength; - break; - case H: - case P: - newCigarElements.add(ce); - break; - default: throw new ReviewedStingException("The " + ce.getOperator() + " cigar element is not currently supported"); - } - - } - - // copy over the unclipped bases - final byte[] bases = read.getReadBases(); - final byte[] quals = read.getBaseQualities(); - byte[] newBases = new byte[basesToKeep]; - byte[] newQuals = new byte[basesToKeep]; - System.arraycopy(bases, 0, newBases, 0, basesToKeep); - System.arraycopy(quals, 0, newQuals, 0, basesToKeep); - read.setReadBases(newBases); - read.setBaseQualities(newQuals); - - // now add a CIGAR element for the clipped bases - newCigarElements.add(new CigarElement((bases.length - basesToKeep) + basesAlreadyClipped, CigarOperator.H)); - Cigar newCigar = new Cigar(newCigarElements); - read.setCigar(newCigar); - - // adjust the stop accordingly - // read.setAlignmentEnd(startPosition - 1); - - return read; + public static SAMFileWriter createSAMFileWriterWithCompression(SAMFileHeader header, boolean presorted, String file, int compression) { + if (file.endsWith(".bam")) + return new SAMFileWriterFactory().makeBAMWriter(header, presorted, new File(file), compression); + return new SAMFileWriterFactory().makeSAMOrBAMWriter(header, presorted, new File(file)); } /** - * Hard clips away (i.e.g, removes from the read) bases that were previously soft clipped. - * - * @param read - * @return + * is this base inside the adaptor of the read? + *
+ * There are two cases to treat here: + *
+ * 1) Read is in the negative strand => Adaptor boundary is on the left tail + * 2) Read is in the positive strand => Adaptor boundary is on the right tail + *
+ * Note: We return false to all reads that are UNMAPPED or have an unusually big insert size (probably due to mismapping or a bigger event) + * + * @param read the read to test + * @param basePos base position in REFERENCE coordinates (not read coordinates) + * @return whether or not the base is in the adaptor */ - @Requires("read != null") - @Ensures("result != null") - public static GATKSAMRecord hardClipSoftClippedBases(GATKSAMRecord read) { - List cigarElts = read.getCigar().getCigarElements(); - - if ( cigarElts.size() == 1 ) // can't be soft clipped, just return - return read; - - int keepStart = 0, keepEnd = read.getReadLength() - 1; - List newCigarElements = new LinkedList(); - - for ( int i = 0; i < cigarElts.size(); i++ ) { - CigarElement ce = cigarElts.get(i); - int l = ce.getLength(); - switch ( ce.getOperator() ) { - case S: - if ( i == 0 ) - keepStart = l; - else - keepEnd = read.getReadLength() - l - 1; - newCigarElements.add(new CigarElement(l, CigarOperator.HARD_CLIP)); - break; - - default: - newCigarElements.add(ce); - break; - } - } - - // Merges tandem cigar elements like 5H10H or 2S5S to 15H or 7S - // this will happen if you soft clip a read that has been hard clipped before - // like: 5H20S => 5H20H - List mergedCigarElements = new LinkedList(); - Iterator cigarElementIterator = newCigarElements.iterator(); - CigarOperator currentOperator = null; - int currentOperatorLength = 0; - while (cigarElementIterator.hasNext()) { - CigarElement cigarElement = cigarElementIterator.next(); - if (currentOperator != cigarElement.getOperator()) { - if (currentOperator != null) - mergedCigarElements.add(new CigarElement(currentOperatorLength, currentOperator)); - currentOperator = cigarElement.getOperator(); - currentOperatorLength = cigarElement.getLength(); - } - else - currentOperatorLength += cigarElement.getLength(); - } - mergedCigarElements.add(new CigarElement(currentOperatorLength, currentOperator)); + public static boolean isBaseInsideAdaptor(final GATKSAMRecord read, long basePos) { + Integer adaptorBoundary = getAdaptorBoundary(read); + if (adaptorBoundary == null || read.getInferredInsertSize() > DEFAULT_ADAPTOR_SIZE) + return false; - return hardClipBases(read, keepStart, keepEnd, mergedCigarElements); + return read.getReadNegativeStrandFlag() ? basePos <= adaptorBoundary : basePos >= adaptorBoundary; } /** - * Hard clips out the bases in read, keeping the bases from keepStart to keepEnd, inclusive. Note these - * are offsets, so they are 0 based - * - * @param read - * @param keepStart - * @param keepEnd - * @param newCigarElements - * @return + * Finds the adaptor boundary around the read and returns the first base inside the adaptor that is closest to + * the read boundary. If the read is in the positive strand, this is the first base after the end of the + * fragment (Picard calls it the 'insert'); if the read is in the negative strand, this is the first base before the + * beginning of the fragment. + *
+ * There are two cases we need to treat here: + *
+ * 1) Our read is in the reverse strand : + *
+ * <----------------------| * + * |---------------------> + *
+ * in these cases, the adaptor boundary is at the mate start (minus one) + *
+ * 2) Our read is in the forward strand : + *
+ * |----------------------> * + * <----------------------| + *
+ * in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one) + * + * @param read the read being tested for the adaptor boundary + * @return the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read. NULL if the read is unmapped or the mate is mapped to another contig. */ - @Requires({ - "read != null", - "keepStart >= 0", - "keepEnd < read.getReadLength()", - "read.getReadUnmappedFlag() || newCigarElements != null"}) - @Ensures("result != null") - public static GATKSAMRecord hardClipBases(GATKSAMRecord read, int keepStart, int keepEnd, List newCigarElements) { - int newLength = keepEnd - keepStart + 1; - if ( newLength != read.getReadLength() ) { - try { - read = (GATKSAMRecord)read.clone(); - // copy over the unclipped bases - final byte[] bases = read.getReadBases(); - final byte[] quals = read.getBaseQualities(); - byte[] newBases = new byte[newLength]; - byte[] newQuals = new byte[newLength]; - System.arraycopy(bases, keepStart, newBases, 0, newLength); - System.arraycopy(quals, keepStart, newQuals, 0, newLength); - read.setReadBases(newBases); - read.setBaseQualities(newQuals); - - // now add a CIGAR element for the clipped bases, if the read isn't unmapped - if ( ! read.getReadUnmappedFlag() ) { - Cigar newCigar = new Cigar(newCigarElements); - read.setCigar(newCigar); - } - } catch ( CloneNotSupportedException e ) { - throw new ReviewedStingException("WTF, where did clone go?", e); - } - } - - return read; - } - - public static GATKSAMRecord replaceSoftClipsWithMatches(GATKSAMRecord read) { - List newCigarElements = new ArrayList(); - - for ( CigarElement ce : read.getCigar().getCigarElements() ) { - if ( ce.getOperator() == CigarOperator.SOFT_CLIP ) - newCigarElements.add(new CigarElement(ce.getLength(), CigarOperator.MATCH_OR_MISMATCH)); - else - newCigarElements.add(ce); - } + public static Integer getAdaptorBoundary(final SAMRecord read) { + final int insertSize = Math.abs(read.getInferredInsertSize()); // the inferred insert size can be negative if the mate is mapped before the read (so we take the absolute value) - if ( newCigarElements.size() > 1 ) { // - CigarElement first = newCigarElements.get(0); - CigarElement second = newCigarElements.get(1); - if ( first.getOperator() == CigarOperator.MATCH_OR_MISMATCH && second.getOperator() == CigarOperator.MATCH_OR_MISMATCH ) { - newCigarElements.set(0, new CigarElement(first.getLength() + second.getLength(), CigarOperator.MATCH_OR_MISMATCH)); - newCigarElements.remove(1); - } - } + if (insertSize == 0 || read.getReadUnmappedFlag()) // no adaptors in reads with mates in another + return null; // chromosome or unmapped pairs - if ( newCigarElements.size() > 1 ) { // - CigarElement penult = newCigarElements.get(newCigarElements.size()-2); - CigarElement last = newCigarElements.get(newCigarElements.size()-1); - if ( penult.getOperator() == CigarOperator.MATCH_OR_MISMATCH && penult.getOperator() == CigarOperator.MATCH_OR_MISMATCH ) { - newCigarElements.set(newCigarElements.size()-2, new CigarElement(penult.getLength() + last.getLength(), CigarOperator.MATCH_OR_MISMATCH)); - newCigarElements.remove(newCigarElements.size()-1); - } - } + int adaptorBoundary; // the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read) + if (read.getReadNegativeStrandFlag()) + adaptorBoundary = read.getMateAlignmentStart() - 1; // case 1 (see header) + else + adaptorBoundary = 
read.getAlignmentStart() + insertSize + 1; // case 2 (see header) - read.setCigar(new Cigar(newCigarElements)); - return read; + return adaptorBoundary; } - - private static int DEFAULT_ADAPTOR_SIZE = 100; - /** + * is the read a 454 read ? * - * @param read original SAM record - * @return a new read with adaptor sequence hard-clipped out or null if read is fully clipped + * @param read the read to test + * @return checks the read group tag PL for the default 454 tag */ - public static GATKSAMRecord hardClipAdaptorSequence(final GATKSAMRecord read) { - return hardClipAdaptorSequence(read, DEFAULT_ADAPTOR_SIZE); - } - - public static OverlapType readPairBaseOverlapType(final SAMRecord read, long basePos) { - return readPairBaseOverlapType(read, basePos, DEFAULT_ADAPTOR_SIZE); - } - public static boolean is454Read(SAMRecord read) { return isPlatformRead(read, "454"); } + /** + * is the read a SOLiD read ? + * + * @param read the read to test + * @return checks the read group tag PL for the default SOLiD tag + */ public static boolean isSOLiDRead(SAMRecord read) { return isPlatformRead(read, "SOLID"); } + /** + * is the read a SLX read ? + * + * @param read the read to test + * @return checks the read group tag PL for the default SLX tag + */ public static boolean isSLXRead(SAMRecord read) { return isPlatformRead(read, "ILLUMINA"); } - private static final Map readFlagNames - = new HashMap(); - - static { - readFlagNames.put(0x1, "Paired"); - readFlagNames.put(0x2, "Proper"); - readFlagNames.put(0x4, "Unmapped"); - readFlagNames.put(0x8, "MateUnmapped"); - readFlagNames.put(0x10, "Forward"); - //readFlagNames.put(0x20, "MateForward"); - readFlagNames.put(0x40, "FirstOfPair"); - readFlagNames.put(0x80, "SecondOfPair"); - readFlagNames.put(0x100, "NotPrimary"); - readFlagNames.put(0x200, "NON-PF"); - readFlagNames.put(0x400, "Duplicate"); - } - - public static String readFlagsAsString(GATKSAMRecord read) { - String flags = ""; - for (int flag : readFlagNames.keySet()) { - if ((read.getFlags() & flag) != 0) { - flags += readFlagNames.get(flag) + " "; - } + /** + * checks if the read has a platform tag in the readgroup equal to 'name' ? + * + * @param read the read to test + * @param name the platform name to test + * @return whether or not name == PL tag in the read group of read + */ + public static boolean isPlatformRead(SAMRecord read, String name) { + SAMReadGroupRecord readGroup = read.getReadGroup(); + if (readGroup != null) { + Object readPlatformAttr = readGroup.getAttribute("PL"); + if (readPlatformAttr != null) + return readPlatformAttr.toString().toUpperCase().contains(name); } - return flags; + return false; } + /** * Returns the collections of reads sorted in coordinate order, according to the order defined * in the reads themselves @@ -618,23 +256,39 @@ public static String readFlagsAsString(GATKSAMRecord read) { * @param reads * @return */ - public final static List coordinateSortReads(List reads) { + public final static List sortReadsByCoordinate(List reads) { final SAMRecordComparator comparer = new SAMRecordCoordinateComparator(); Collections.sort(reads, comparer); return reads; } + /** + * If a read starts in INSERTION, returns the first element length. + *

+ * Warning: If the read has Hard or Soft clips before the insertion this function will return 0. + * + * @param read + * @return the length of the first insertion, or 0 if there is none (see warning). + */ public final static int getFirstInsertionOffset(SAMRecord read) { CigarElement e = read.getCigar().getCigarElement(0); - if ( e.getOperator() == CigarOperator.I ) + if (e.getOperator() == CigarOperator.I) return e.getLength(); else return 0; } + /** + * If a read ends in INSERTION, returns the last element length. + *
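The same clipping caveat applies to getLastInsertionOffset below. If a caller needs a clip-tolerant answer, a variant is easy to sketch (this helper is an assumption, not part of the patch):

// Hypothetical variant of getFirstInsertionOffset that skips leading clips before
// deciding whether the read starts in an insertion.
public static int getFirstInsertionOffsetIgnoringClips(final SAMRecord read) {
    for (final CigarElement e : read.getCigar().getCigarElements()) {
        if (e.getOperator() == CigarOperator.HARD_CLIP || e.getOperator() == CigarOperator.SOFT_CLIP)
            continue;                                   // clips precede any aligned or inserted bases
        return e.getOperator() == CigarOperator.I ? e.getLength() : 0;
    }
    return 0;                                           // CIGAR was nothing but clips
}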

+ * Warning: If the read has Hard or Soft clips after the insertion this function will return 0. + * + * @param read + * @return the length of the last insertion, or 0 if there is none (see warning). + */ public final static int getLastInsertionOffset(SAMRecord read) { - CigarElement e = read.getCigar().getCigarElement(read.getCigarLength()-1); - if ( e.getOperator() == CigarOperator.I ) + CigarElement e = read.getCigar().getCigarElement(read.getCigarLength() - 1); + if (e.getOperator() == CigarOperator.I) return e.getLength(); else return 0; @@ -643,92 +297,48 @@ public final static int getLastInsertionOffset(SAMRecord read) { /** * Determines what is the position of the read in relation to the interval. * Note: This function uses the UNCLIPPED ENDS of the reads for the comparison. - * @param read the read + * + * @param read the read * @param interval the interval * @return the overlap type as described by ReadAndIntervalOverlap enum (see above) */ public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(GATKSAMRecord read, GenomeLoc interval) { - int sStart = getRefCoordSoftUnclippedStart(read); - int sStop = getRefCoordSoftUnclippedEnd(read); + int sStart = read.getSoftStart(); + int sStop = read.getSoftEnd(); int uStart = read.getUnclippedStart(); int uStop = read.getUnclippedEnd(); - if ( !read.getReferenceName().equals(interval.getContig()) ) + if (!read.getReferenceName().equals(interval.getContig())) return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG; - else if ( uStop < interval.getStart() ) + else if (uStop < interval.getStart()) return ReadAndIntervalOverlap.NO_OVERLAP_LEFT; - else if ( uStart > interval.getStop() ) + else if (uStart > interval.getStop()) return ReadAndIntervalOverlap.NO_OVERLAP_RIGHT; - else if ( sStop < interval.getStart() ) + else if (sStop < interval.getStart()) return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_LEFT; - else if ( sStart > interval.getStop() ) + else if (sStart > interval.getStop()) return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_RIGHT; - else if ( (sStart >= interval.getStart()) && - (sStop <= interval.getStop()) ) + else if ((sStart >= interval.getStart()) && + (sStop <= interval.getStop())) return ReadAndIntervalOverlap.OVERLAP_CONTAINED; - else if ( (sStart < interval.getStart()) && - (sStop > interval.getStop()) ) + else if ((sStart < interval.getStart()) && + (sStop > interval.getStop())) return ReadAndIntervalOverlap.OVERLAP_LEFT_AND_RIGHT; - else if ( (sStart < interval.getStart()) ) + else if ((sStart < interval.getStart())) return ReadAndIntervalOverlap.OVERLAP_LEFT; else return ReadAndIntervalOverlap.OVERLAP_RIGHT; } - @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"}) - public static int getRefCoordSoftUnclippedStart(GATKSAMRecord read) { - int start = read.getUnclippedStart(); - for (CigarElement cigarElement : read.getCigar().getCigarElements()) { - if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) - start += cigarElement.getLength(); - else - break; - } - return start; - } - - @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"}) - public static int getRefCoordSoftUnclippedEnd(GATKSAMRecord read) { - int stop = read.getUnclippedStart(); - - if (readIsEntirelyInsertion(read)) - return stop; - - int shift = 0; - CigarOperator lastOperator = null; - for (CigarElement cigarElement : read.getCigar().getCigarElements()) { - stop += shift; - lastOperator = 
cigarElement.getOperator(); - if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP || cigarElement.getOperator() == CigarOperator.HARD_CLIP) - shift = cigarElement.getLength(); - else - shift = 0; - } - return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; - } - - private static boolean readIsEntirelyInsertion(GATKSAMRecord read) { - for (CigarElement cigarElement : read.getCigar().getCigarElements()) { - if (cigarElement.getOperator() != CigarOperator.INSERTION) - return false; - } - return true; - } - - public enum ClippingTail { - LEFT_TAIL, - RIGHT_TAIL - } - /** * Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) in case it falls in * a deletion following the typical clipping needs. If clipping the left tail (beginning of the read) returns @@ -754,12 +364,12 @@ public static int getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, in /** * Returns the read coordinate corresponding to the requested reference coordinate. - * + *

* WARNING: if the requested reference coordinate happens to fall inside a deletion in the read, this function * will return the last read base before the deletion. This function returns a * Pair(int readCoord, boolean fallsInsideDeletion) so you can choose which readCoordinate to use when faced with * a deletion. - * + *
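A sketch of the choice this warning forces on callers, assuming the returned Pair carries (readCoordinate, fallsInsideDeletion) as described; variable names are illustrative:

// Sketch: resolve a reference coordinate to a read coordinate, handling the deletion case.
final Pair<Integer, Boolean> result = getReadCoordinateForReferenceCoordinate(read, refCoord);
int readCoord = result.getFirst();
if (result.getSecond())   // refCoord fell inside a deletion; getFirst() is the last base before it
    readCoord++;          // e.g. step to the first base after the deletion when clipping a left tail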

* SUGGESTION: Use getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int, ClippingTail) instead to get a * pre-processed result according to normal clipping needs. Or you can use this function and tailor the * behavior to your needs. @@ -768,14 +378,14 @@ public static int getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, in * @param refCoord * @return the read coordinate corresponding to the requested reference coordinate. (see warning!) */ - @Requires({"refCoord >= getRefCoordSoftUnclippedStart(read)", "refCoord <= getRefCoordSoftUnclippedEnd(read)"}) + @Requires({"refCoord >= read.getSoftStart()", "refCoord <= read.getSoftEnd()"}) @Ensures({"result.getFirst() >= 0", "result.getFirst() < read.getReadLength()"}) public static Pair getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord) { int readBases = 0; int refBases = 0; boolean fallsInsideDeletion = false; - int goal = refCoord - getRefCoordSoftUnclippedStart(read); // The goal is to move this many reference bases + int goal = refCoord - read.getSoftStart(); // The goal is to move this many reference bases boolean goalReached = refBases == goal; Iterator cigarElementIterator = read.getCigar().getCigarElements().iterator(); @@ -798,7 +408,7 @@ public static Pair getReadCoordinateForReferenceCoordinate(GAT if (goalReached) { // Is this base's reference position within this cigar element? Or did we use it all? - boolean endsWithinCigar = shift < cigarElement.getLength(); + boolean endsWithinCigar = shift < cigarElement.getLength(); // If it isn't, we need to check the next one. There should *ALWAYS* be a next one // since we checked if the goal coordinate is within the read length, so this is just a sanity check. @@ -811,7 +421,7 @@ public static Pair getReadCoordinateForReferenceCoordinate(GAT if (endsWithinCigar) fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION; - // if we end outside the current cigar element, we need to check if the next element is an insertion or deletion. + // if we end outside the current cigar element, we need to check if the next element is an insertion or deletion. else { nextCigarElement = cigarElementIterator.next(); @@ -832,74 +442,30 @@ public static Pair getReadCoordinateForReferenceCoordinate(GAT if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases()) readBases += shift; - // If we reached our goal inside a deletion, but the deletion is the next cigar element then we need - // to add the shift of the current cigar element but go back to it's last element to return the last - // base before the deletion (see warning in function contracts) + // If we reached our goal inside a deletion, but the deletion is the next cigar element then we need + // to add the shift of the current cigar element but go back to it's last element to return the last + // base before the deletion (see warning in function contracts) else if (fallsInsideDeletion && !endsWithinCigar) readBases += shift - 1; - // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion + // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion else if (fallsInsideDeletion && endsWithinCigar) readBases--; - } - } - - if (!goalReached) - throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. 
Too many deletions?"); - - - return new Pair(readBases, fallsInsideDeletion); - } - - public static GATKSAMRecord unclipSoftClippedBases(GATKSAMRecord read) { - int newReadStart = read.getAlignmentStart(); - int newReadEnd = read.getAlignmentEnd(); - List newCigarElements = new ArrayList(read.getCigar().getCigarElements().size()); - int heldOver = -1; - boolean sSeen = false; - for ( CigarElement e : read.getCigar().getCigarElements() ) { - if ( e.getOperator().equals(CigarOperator.S) ) { - newCigarElements.add(new CigarElement(e.getLength(),CigarOperator.M)); - if ( sSeen ) { - newReadEnd += e.getLength(); - sSeen = true; - } else { - newReadStart -= e.getLength(); - } - } else { - newCigarElements.add(e); } } - // merge duplicate operators together - int idx = 0; - List finalCigarElements = new ArrayList(read.getCigar().getCigarElements().size()); - while ( idx < newCigarElements.size() -1 ) { - if ( newCigarElements.get(idx).getOperator().equals(newCigarElements.get(idx+1).getOperator()) ) { - int combSize = newCigarElements.get(idx).getLength(); - int offset = 0; - while ( idx + offset < newCigarElements.size()-1 && newCigarElements.get(idx+offset).getOperator().equals(newCigarElements.get(idx+1+offset).getOperator()) ) { - combSize += newCigarElements.get(idx+offset+1).getLength(); - offset++; - } - finalCigarElements.add(new CigarElement(combSize,newCigarElements.get(idx).getOperator())); - idx = idx + offset -1; - } else { - finalCigarElements.add(newCigarElements.get(idx)); - } - idx++; - } - read.setCigar(new Cigar(finalCigarElements)); - read.setAlignmentStart(newReadStart); + if (!goalReached) + throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); + - return read; + return new Pair(readBases, fallsInsideDeletion); } /** * Compares two SAMRecords only the basis on alignment start. Note that * comparisons are performed ONLY on the basis of alignment start; any * two SAM records with the same alignment start will be considered equal. - * + *

* Unmapped alignments will all be considered equal. */ @@ -910,4 +476,30 @@ public static int compareSAMRecords(GATKSAMRecord read1, GATKSAMRecord read2) { return comp.compare(read1, read2); } + /** + * Is a base inside a read? + * + * @param read the read to evaluate + * @param referenceCoordinate the reference coordinate of the base to test + * @return true if it is inside the read, false otherwise. + */ + public static boolean isInsideRead(final GATKSAMRecord read, final int referenceCoordinate) { + return referenceCoordinate >= read.getAlignmentStart() && referenceCoordinate <= read.getAlignmentEnd(); + } + + /** + * Is this read all insertion? + * + * @param read + * @return whether or not the only element in the cigar string is an Insertion + */ + public static boolean readIsEntirelyInsertion(GATKSAMRecord read) { + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + if (cigarElement.getOperator() != CigarOperator.INSERTION) + return false; + } + return true; + } + + } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/CommonInfo.java similarity index 78% rename from public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java rename to public/java/src/org/broadinstitute/sting/utils/variantcontext/CommonInfo.java index bf16cd1cf4..c0c9f36ce6 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/CommonInfo.java @@ -11,33 +11,24 @@ * * @author depristo */ -public final class InferredGeneticContext { - public static final double NO_NEG_LOG_10PERROR = -1.0; +final class CommonInfo { + public static final double NO_LOG10_PERROR = 1.0; - private static Set NO_FILTERS = Collections.unmodifiableSet(new HashSet()); + private static Set NO_FILTERS = Collections.emptySet(); private static Map NO_ATTRIBUTES = Collections.unmodifiableMap(new HashMap()); - private double negLog10PError = NO_NEG_LOG_10PERROR; + private double log10PError = NO_LOG10_PERROR; private String name = null; - private Set filters = NO_FILTERS; + private Set filters = null; private Map attributes = NO_ATTRIBUTES; -// public InferredGeneticContext(String name) { -// this.name = name; -// } -// -// public InferredGeneticContext(String name, double negLog10PError) { -// this(name); -// setNegLog10PError(negLog10PError); -// } - - public InferredGeneticContext(String name, double negLog10PError, Set filters, Map attributes) { + public CommonInfo(String name, double log10PError, Set filters, Map attributes) { this.name = name; - setNegLog10PError(negLog10PError); - if ( filters != null ) - setFilters(filters); - if ( attributes != null ) - setAttributes(attributes); + setLog10PError(log10PError); + this.filters = filters; + if ( attributes != null && ! attributes.isEmpty() ) { + this.attributes = attributes; + } } /** @@ -64,12 +55,20 @@ public void setName(String name) { // // --------------------------------------------------------------------------------------------------------- + public Set getFiltersMaybeNull() { + return filters; + } + public Set getFilters() { - return Collections.unmodifiableSet(filters); + return filters == null ? NO_FILTERS : Collections.unmodifiableSet(filters); + } + + public boolean filtersWereApplied() { + return filters != null; } public boolean isFiltered() { - return filters.size() > 0; + return filters == null ? 
false : filters.size() > 0; } public boolean isNotFiltered() { @@ -77,8 +76,8 @@ public boolean isNotFiltered() { } public void addFilter(String filter) { - if ( filters == NO_FILTERS ) // immutable -> mutable - filters = new HashSet(filters); + if ( filters == null ) // immutable -> mutable + filters = new HashSet(); if ( filter == null ) throw new IllegalArgumentException("BUG: Attempting to add null filter " + this); if ( getFilters().contains(filter) ) throw new IllegalArgumentException("BUG: Attempting to add duplicate filter " + filter + " at " + this); @@ -91,37 +90,30 @@ public void addFilters(Collection filters) { addFilter(f); } - public void clearFilters() { - filters = new HashSet(); - } - - public void setFilters(Collection filters) { - clearFilters(); - addFilters(filters); - } - // --------------------------------------------------------------------------------------------------------- // // Working with log error rates // // --------------------------------------------------------------------------------------------------------- - public boolean hasNegLog10PError() { - return getNegLog10PError() != NO_NEG_LOG_10PERROR; + public boolean hasLog10PError() { + return getLog10PError() != NO_LOG10_PERROR; } /** * @return the -1 * log10-based error estimate */ - public double getNegLog10PError() { return negLog10PError; } - public double getPhredScaledQual() { return getNegLog10PError() * 10; } - - public void setNegLog10PError(double negLog10PError) { - if ( negLog10PError < 0 && negLog10PError != NO_NEG_LOG_10PERROR ) throw new IllegalArgumentException("BUG: negLog10PError cannot be < than 0 : " + negLog10PError); - if ( Double.isInfinite(negLog10PError) ) throw new IllegalArgumentException("BUG: negLog10PError should not be Infinity"); - if ( Double.isNaN(negLog10PError) ) throw new IllegalArgumentException("BUG: negLog10PError should not be NaN"); + public double getLog10PError() { return log10PError; } + public double getPhredScaledQual() { return getLog10PError() * -10; } - this.negLog10PError = negLog10PError; + public void setLog10PError(double log10PError) { + if ( log10PError > 0 && log10PError != NO_LOG10_PERROR) + throw new IllegalArgumentException("BUG: log10PError cannot be > 0 : " + log10PError); + if ( Double.isInfinite(log10PError) ) // validate the incoming value, not the old field + throw new IllegalArgumentException("BUG: log10PError should not be Infinity"); + if ( Double.isNaN(log10PError) ) + throw new IllegalArgumentException("BUG: log10PError should not be NaN"); + this.log10PError = log10PError; } // --------------------------------------------------------------------------------------------------------- @@ -157,7 +149,7 @@ public void putAttribute(String key, Object value, boolean allowOverwrites) { if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable attributes = new HashMap(); - + attributes.put(key, value); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java index e2e44e2b9b..1691129c94 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java @@ -12,30 +12,28 @@ * * @author Mark DePristo */ -public class Genotype { +public class Genotype implements Comparable { public final static String PHASED_ALLELE_SEPARATOR = "|"; public final static String UNPHASED_ALLELE_SEPARATOR = "/"; - protected InferredGeneticContext commonInfo; - public final static double 
NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR; + protected CommonInfo commonInfo; + public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR; protected List alleles = null; // new ArrayList(); protected Type type = null; protected boolean isPhased = false; - protected boolean filtersWereAppliedToContext; - public Genotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean isPhased) { - this(sampleName, alleles, negLog10PError, filters, attributes, isPhased, null); + public Genotype(String sampleName, List alleles, double log10PError, Set filters, Map attributes, boolean isPhased) { + this(sampleName, alleles, log10PError, filters, attributes, isPhased, null); } - public Genotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean isPhased, double[] log10Likelihoods) { + public Genotype(String sampleName, List alleles, double log10PError, Set filters, Map attributes, boolean isPhased, double[] log10Likelihoods) { if ( alleles != null ) this.alleles = Collections.unmodifiableList(alleles); - commonInfo = new InferredGeneticContext(sampleName, negLog10PError, filters, attributes); + commonInfo = new CommonInfo(sampleName, log10PError, filters, attributes); if ( log10Likelihoods != null ) commonInfo.putAttribute(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(log10Likelihoods)); - filtersWereAppliedToContext = filters != null; this.isPhased = isPhased; validate(); } @@ -44,22 +42,27 @@ public Genotype(String sampleName, List alleles, double negLog10PError, * Creates a new Genotype for sampleName with genotype according to alleles. * @param sampleName * @param alleles - * @param negLog10PError the confidence in these alleles + * @param log10PError the confidence in these alleles * @param log10Likelihoods a log10 likelihoods for each of the genotype combinations possible for alleles, in the standard VCF ordering, or null if not known */ - public Genotype(String sampleName, List alleles, double negLog10PError, double[] log10Likelihoods) { - this(sampleName, alleles, negLog10PError, null, null, false, log10Likelihoods); + public Genotype(String sampleName, List alleles, double log10PError, double[] log10Likelihoods) { + this(sampleName, alleles, log10PError, null, null, false, log10Likelihoods); } - public Genotype(String sampleName, List alleles, double negLog10PError) { - this(sampleName, alleles, negLog10PError, null, null, false); + public Genotype(String sampleName, List alleles, double log10PError) { + this(sampleName, alleles, log10PError, null, null, false); } public Genotype(String sampleName, List alleles) { - this(sampleName, alleles, NO_NEG_LOG_10PERROR, null, null, false); + this(sampleName, alleles, NO_LOG10_PERROR, null, null, false); + } + + public Genotype(String sampleName, Genotype parent) { + this(sampleName, parent.getAlleles(), parent.getLog10PError(), parent.getFilters(), parent.getAttributes(), parent.isPhased()); } + // --------------------------------------------------------------------------------------------------------- // // Partial-cloning routines (because Genotype is immutable). @@ -67,15 +70,15 @@ public Genotype(String sampleName, List alleles) { // --------------------------------------------------------------------------------------------------------- public static Genotype modifyName(Genotype g, String name) { - return new Genotype(name, g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? 
g.getFilters() : null, g.getAttributes(), g.isPhased()); + return new Genotype(name, g.getAlleles(), g.getLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, g.getAttributes(), g.isPhased()); } public static Genotype modifyAttributes(Genotype g, Map attributes) { - return new Genotype(g.getSampleName(), g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attributes, g.isPhased()); + return new Genotype(g.getSampleName(), g.getAlleles(), g.getLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attributes, g.isPhased()); } public static Genotype modifyAlleles(Genotype g, List alleles) { - return new Genotype(g.getSampleName(), alleles, g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, g.getAttributes(), g.isPhased()); + return new Genotype(g.getSampleName(), alleles, g.getLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, g.getAttributes(), g.isPhased()); } /** @@ -328,11 +331,12 @@ private static , V> String sortedString(Map c) { // --------------------------------------------------------------------------------------------------------- public String getSampleName() { return commonInfo.getName(); } public Set getFilters() { return commonInfo.getFilters(); } + public Set getFiltersMaybeNull() { return commonInfo.getFiltersMaybeNull(); } public boolean isFiltered() { return commonInfo.isFiltered(); } public boolean isNotFiltered() { return commonInfo.isNotFiltered(); } - public boolean filtersWereApplied() { return filtersWereAppliedToContext; } - public boolean hasNegLog10PError() { return commonInfo.hasNegLog10PError(); } - public double getNegLog10PError() { return commonInfo.getNegLog10PError(); } + public boolean filtersWereApplied() { return commonInfo.filtersWereApplied(); } + public boolean hasLog10PError() { return commonInfo.hasLog10PError(); } + public double getLog10PError() { return commonInfo.getLog10PError(); } public double getPhredScaledQual() { return commonInfo.getPhredScaledQual(); } public Map getAttributes() { return commonInfo.getAttributes(); } @@ -347,4 +351,14 @@ public Object getAttribute(String key, Object defaultValue) { public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); } public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); } public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); } + + /** + * comparable genotypes -> compareTo on the sample names + * @param genotype + * @return + */ + @Override + public int compareTo(final Genotype genotype) { + return getSampleName().compareTo(genotype.getSampleName()); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index dba16cf86b..a5e4e5774a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -25,7 +25,13 @@ package org.broadinstitute.sting.utils.variantcontext; import org.broad.tribble.TribbleException; +import org.broadinstitute.sting.gatk.io.DirectOutputTracker; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import 
org.jgrapht.util.MathUtil; + +import java.util.EnumMap; +import java.util.Map; public class GenotypeLikelihoods { public static final boolean CAP_PLS = false; @@ -94,6 +100,53 @@ public String getAsString() { return likelihoodsAsString_PLs; } + //Return genotype likelihoods as an EnumMap with Genotypes as keys and likelihoods as values + //Returns null in case of missing likelihoods + public EnumMap getAsMap(boolean normalizeFromLog10){ + //Make sure that the log10likelihoods are set + double[] likelihoods = normalizeFromLog10 ? MathUtils.normalizeFromLog10(getAsVector()) : getAsVector(); + if(likelihoods == null) + return null; + EnumMap likelihoodsMap = new EnumMap(Genotype.Type.class); + likelihoodsMap.put(Genotype.Type.HOM_REF,likelihoods[Genotype.Type.HOM_REF.ordinal()-1]); + likelihoodsMap.put(Genotype.Type.HET,likelihoods[Genotype.Type.HET.ordinal()-1]); + likelihoodsMap.put(Genotype.Type.HOM_VAR, likelihoods[Genotype.Type.HOM_VAR.ordinal() - 1]); + return likelihoodsMap; + } + + //Return the neg log10 Genotype Quality (GQ) for the given genotype + //Returns Double.NEGATIVE_INFINITY in case of missing genotype + public double getLog10GQ(Genotype.Type genotype){ + return getQualFromLikelihoods(genotype.ordinal() - 1 /* NO_CALL IS FIRST */, getAsVector()); + } + + public static double getQualFromLikelihoods(int iOfChoosenGenotype, double[] likelihoods){ + if(likelihoods == null) + return Double.NEGATIVE_INFINITY; + + double qual = Double.NEGATIVE_INFINITY; + for (int i=0; i < likelihoods.length; i++) { + if (i==iOfChoosenGenotype) + continue; + if (likelihoods[i] >= qual) + qual = likelihoods[i]; + } + + // qual contains now max(likelihoods[k]) for all k != bestGTguess + qual = likelihoods[iOfChoosenGenotype] - qual; + + if (qual < 0) { + // QUAL can be negative if the chosen genotype is not the most likely one individually. + // In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen one + double[] normalized = MathUtils.normalizeFromLog10(likelihoods); + double chosenGenotype = normalized[iOfChoosenGenotype]; + return Math.log10(1.0 - chosenGenotype); + } else { + // invert the size, as this is the probability of making an error + return -1 * qual; + } + } + private final static double[] parsePLsIntoLikelihoods(String likelihoodsAsString_PLs) { if ( !likelihoodsAsString_PLs.equals(VCFConstants.MISSING_VALUE_v4) ) { String[] strings = likelihoodsAsString_PLs.split(","); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java new file mode 100644 index 0000000000..85f7cc078a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java @@ -0,0 +1,724 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.variantcontext; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; + +import java.util.*; + +/** + * Represents an ordered collection of Genotype objects + */ +public class GenotypesContext implements List { + /** + * static constant value for an empty GenotypesContext. Useful since so many VariantContexts have no genotypes + */ + public final static GenotypesContext NO_GENOTYPES = + new GenotypesContext(new ArrayList(0), new HashMap(0), Collections.emptyList()).immutable(); + + /** + *sampleNamesInOrder a list of sample names, one for each genotype in genotypes, sorted in alphabetical order + */ + List sampleNamesInOrder = null; + + /** + * a map optimized for efficient lookup. Each genotype in genotypes must have its + * sample name in sampleNameToOffset, with a corresponding integer value that indicates the offset of that + * genotype in the vector of genotypes + */ + Map sampleNameToOffset = null; + + /** + * An ArrayList of genotypes contained in this context + * + * WARNING: TO ENABLE THE LAZY VERSION OF THIS CLASS, NO METHODS SHOULD DIRECTLY + * ACCESS THIS VARIABLE. USE getGenotypes() INSTEAD. + * + */ + ArrayList notToBeDirectlyAccessedGenotypes; + + /** Are we allowing users to modify the list? */ + boolean immutable = false; + + // --------------------------------------------------------------------------- + // + // private constructors -- you have to use static create methods to make these classes + // + // --------------------------------------------------------------------------- + + /** + * Create an empty GenotypeContext + */ + protected GenotypesContext() { + this(10); + } + + /** + * Create an empty GenotypeContext, with initial capacity for n elements + */ + @Requires("n >= 0") + protected GenotypesContext(final int n) { + this(new ArrayList(n)); + } + + /** + * Create an GenotypeContext containing genotypes + */ + @Requires({"genotypes != null", "noDups(genotypes)"}) + protected GenotypesContext(final ArrayList genotypes) { + this.notToBeDirectlyAccessedGenotypes = genotypes; + this.sampleNameToOffset = null; + } + + /** + * Create a fully resolved GenotypeContext containing genotypes, sample lookup table, + * and sorted sample names + * + * @param genotypes our genotypes in arbitrary + * @param sampleNameToOffset map optimized for efficient lookup. Each genotype in genotypes must have its + * sample name in sampleNameToOffset, with a corresponding integer value that indicates the offset of that + * genotype in the vector of genotypes + * @param sampleNamesInOrder a list of sample names, one for each genotype in genotypes, sorted in alphabetical + * order. 
+ */ + @Requires({"genotypes != null", "noDups(genotypes)", + "sampleNameToOffset != null", + "sampleNamesInOrder != null", + "genotypes.size() == sampleNameToOffset.size()", + "genotypes.size() == sampleNamesInOrder.size()"}) + protected GenotypesContext(final ArrayList genotypes, + final Map sampleNameToOffset, + final List sampleNamesInOrder) { + this.notToBeDirectlyAccessedGenotypes = genotypes; + this.sampleNameToOffset = sampleNameToOffset; + this.sampleNamesInOrder = sampleNamesInOrder; + } + + // --------------------------------------------------------------------------- + // + // public static factory methods + // + // --------------------------------------------------------------------------- + + /** + * Basic creation routine + * @return an empty, mutable GenotypeContext + */ + @Ensures({"result != null"}) + public static final GenotypesContext create() { + return new GenotypesContext(); + } + + /** + * Basic creation routine + * @return an empty, mutable GenotypeContext with initial capacity for nGenotypes + */ + @Requires("nGenotypes >= 0") + @Ensures({"result != null"}) + public static final GenotypesContext create(final int nGenotypes) { + return new GenotypesContext(nGenotypes); + } + + /** + * Create a fully resolved GenotypeContext containing genotypes, sample lookup table, + * and sorted sample names + * + * @param genotypes our genotypes in arbitrary + * @param sampleNameToOffset map optimized for efficient lookup. Each genotype in genotypes must have its + * sample name in sampleNameToOffset, with a corresponding integer value that indicates the offset of that + * genotype in the vector of genotypes + * @param sampleNamesInOrder a list of sample names, one for each genotype in genotypes, sorted in alphabetical + * order. + * @return an mutable GenotypeContext containing genotypes with already present lookup data + */ + @Requires({"genotypes != null", + "sampleNameToOffset != null", + "sampleNamesInOrder != null", + "sameSamples(genotypes, sampleNamesInOrder)", + "sameSamples(genotypes, sampleNameToOffset.keySet())"}) + @Ensures({"result != null"}) + public static final GenotypesContext create(final ArrayList genotypes, + final Map sampleNameToOffset, + final List sampleNamesInOrder) { + return new GenotypesContext(genotypes, sampleNameToOffset, sampleNamesInOrder); + } + + /** + * Create a fully resolved GenotypeContext containing genotypes + * + * @param genotypes our genotypes in arbitrary + * @return an mutable GenotypeContext containing genotypes + */ + @Requires({"genotypes != null"}) + @Ensures({"result != null"}) + public static final GenotypesContext create(final ArrayList genotypes) { + return genotypes == null ? NO_GENOTYPES : new GenotypesContext(genotypes); + } + + /** + * Create a fully resolved GenotypeContext containing genotypes + * + * @param genotypes our genotypes in arbitrary + * @return an mutable GenotypeContext containing genotypes + */ + @Requires({"genotypes != null"}) + @Ensures({"result != null"}) + public static final GenotypesContext create(final Genotype... 
genotypes) { + return create(new ArrayList(Arrays.asList(genotypes))); + } + + /** + * Create a freshly allocated GenotypeContext containing the genotypes in toCopy + * + * @param toCopy the GenotypesContext to copy + * @return an mutable GenotypeContext containing genotypes + */ + @Requires({"toCopy != null"}) + @Ensures({"result != null"}) + public static final GenotypesContext copy(final GenotypesContext toCopy) { + return create(new ArrayList(toCopy.getGenotypes())); + } + + /** + * Create a GenotypesContext containing the genotypes in iteration order contained + * in toCopy + * + * @param toCopy the collection of genotypes + * @return an mutable GenotypeContext containing genotypes + */ + @Ensures({"result != null"}) + public static final GenotypesContext copy(final Collection toCopy) { + return toCopy == null ? NO_GENOTYPES : create(new ArrayList(toCopy)); + } + + // --------------------------------------------------------------------------- + // + // Mutability methods + // + // --------------------------------------------------------------------------- + + public final GenotypesContext immutable() { + immutable = true; + return this; + } + + public boolean isMutable() { + return ! immutable; + } + + public final void checkImmutability() { + if ( immutable ) + throw new IllegalAccessError("GenotypeMap is currently immutable, but a mutator method was invoked on it"); + } + + // --------------------------------------------------------------------------- + // + // caches + // + // --------------------------------------------------------------------------- + + @Ensures({"sampleNameToOffset == null"}) + protected void invalidateSampleNameMap() { + sampleNameToOffset = null; + } + + @Ensures({"sampleNamesInOrder == null"}) + protected void invalidateSampleOrdering() { + sampleNamesInOrder = null; + } + + @Ensures({"sampleNamesInOrder != null", + "sameSamples(notToBeDirectlyAccessedGenotypes, sampleNamesInOrder)"}) + protected void ensureSampleOrdering() { + if ( sampleNamesInOrder == null ) { + sampleNamesInOrder = new ArrayList(size()); + + for ( int i = 0; i < size(); i++ ) { + sampleNamesInOrder.add(getGenotypes().get(i).getSampleName()); + } + Collections.sort(sampleNamesInOrder); + } + } + + @Ensures({"sampleNameToOffset != null", + "sameSamples(notToBeDirectlyAccessedGenotypes, sampleNameToOffset.keySet())"}) + protected void ensureSampleNameMap() { + if ( sampleNameToOffset == null ) { + sampleNameToOffset = new HashMap(size()); + + for ( int i = 0; i < size(); i++ ) { + sampleNameToOffset.put(getGenotypes().get(i).getSampleName(), i); + } + } + } + + // for testing purposes + protected void ensureAll() { + ensureSampleNameMap(); + ensureSampleOrdering(); + } + + // --------------------------------------------------------------------------- + // + // Map methods + // + // --------------------------------------------------------------------------- + + protected ArrayList getGenotypes() { + return notToBeDirectlyAccessedGenotypes; + } + + @Override + public void clear() { + checkImmutability(); + invalidateSampleNameMap(); + invalidateSampleOrdering(); + getGenotypes().clear(); + } + + @Override + public int size() { + return getGenotypes().size(); + } + + @Override + public boolean isEmpty() { + return getGenotypes().isEmpty(); + } + + /** + * Adds a single genotype to this context. + * + * There are many constraints on this input, and important + * impacts on the performance of other functions provided by this + * context. 
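Before the detailed constraints, a usage sketch of the add-then-query pattern they govern (the sample name and the alleles variable are illustrative, not part of the patch):

// Sketch: add() keeps the name -> offset map current, so lookups right after are cheap.
final GenotypesContext gc = GenotypesContext.create();
gc.add(new Genotype("NA12878", alleles));                      // alleles: an assumed List of Allele
if (gc.containsSample("NA12878"))                              // served from the name map, no rebuild
    System.out.println(gc.get("NA12878"));
for (final String sample : gc.getSampleNamesOrderedByName())   // one O(n log n) sort happens here
    System.out.println(gc.get(sample));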
+ * + * First, the sample name of genotype must be unique within this + * context. However, this is not enforced in the code itself, though + * you will violate the contract on this context if you add duplicate + * samples and are running with CoFoJa enabled. + * + * Second, adding genotype also updates the sample name -> index map, + * so add() followed by containsSample() and related functions is an efficient + * series of operations. + * + * Third, adding the genotype invalidates the sorted list of sample names, so + * add() followed by any of the SampleNamesInOrder operations is inefficient, as + * each SampleNamesInOrder call must rebuild the sorted list of sample names at + * an O(n log n) cost. + * + * @param genotype + * @return + */ + @Override + @Requires({"genotype != null", "get(genotype.getSampleName()) == null"}) + @Ensures("noDups(getGenotypes())") + public boolean add(final Genotype genotype) { + checkImmutability(); + invalidateSampleOrdering(); + + if ( sampleNameToOffset != null ) { + // update the name map by adding entries + sampleNameToOffset.put(genotype.getSampleName(), size()); + } + + return getGenotypes().add(genotype); + } + + @Override + @Requires("! contains(genotype)") + @Ensures("noDups(getGenotypes())") + public void add(final int i, final Genotype genotype) { + throw new UnsupportedOperationException(); + } + + /** + * Adds all of the genotypes to this context + * + * See {@link #add(Genotype)} for important information about this function's + * constraints and performance costs + * + * @param genotypes + * @return + */ + @Override + @Requires("! containsAny(genotypes)") + @Ensures("noDups(getGenotypes())") + public boolean addAll(final Collection genotypes) { + checkImmutability(); + invalidateSampleOrdering(); + + if ( sampleNameToOffset != null ) { + // update the name map by adding entries + int pos = size(); + for ( final Genotype g : genotypes ) { + sampleNameToOffset.put(g.getSampleName(), pos++); + } + } + + return getGenotypes().addAll(genotypes); + } + + @Override + public boolean addAll(final int i, final Collection genotypes) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean contains(final Object o) { + return getGenotypes().contains(o); + } + + @Override + public boolean containsAll(final Collection objects) { + return getGenotypes().containsAll(objects); + } + + private boolean containsAny(final Collection genotypes) { + for ( final Genotype g : genotypes ) { + if ( contains(g) ) return true; + } + return false; + } + + @Override + public Genotype get(final int i) { + return getGenotypes().get(i); + } + + /** + * Gets the sample associated with this sampleName, or null if none is found + * + * @param sampleName + * @return + */ + public Genotype get(final String sampleName) { + Integer offset = getSampleI(sampleName); + return offset == null ? 
null : getGenotypes().get(offset); + } + + private Integer getSampleI(final String sampleName) { + ensureSampleNameMap(); + return sampleNameToOffset.get(sampleName); + } + + @Override + public int indexOf(final Object o) { + return getGenotypes().indexOf(o); + } + + @Override + public Iterator iterator() { + return getGenotypes().iterator(); + } + + @Override + public int lastIndexOf(final Object o) { + return getGenotypes().lastIndexOf(o); + } + + @Override + public ListIterator listIterator() { + // todo -- must be immutable + throw new UnsupportedOperationException(); +// return genotypes.listIterator(); + } + + @Override + public ListIterator listIterator(final int i) { + // todo -- must be immutable + throw new UnsupportedOperationException(); +// return genotypes.listIterator(i); + } + + /** + * Note that remove requires us to invalidate our sample -> index + * cache. The loop: + * + * GenotypesContext gc = ... + * for ( sample in samples ) + * if ( gc.containsSample(sample) ) + * gc.remove(sample) + * + * is extremely inefficient, as each call to remove invalidates the cache + * and containsSample requires us to rebuild it, an O(n) operation. + * + * If you must remove many samples from the GC, use either removeAll or retainAll + * to avoid this O(n * m) operation. + * + * @param i + * @return + */ + @Override + public Genotype remove(final int i) { + checkImmutability(); + invalidateSampleNameMap(); + invalidateSampleOrdering(); + return getGenotypes().remove(i); + } + + /** + * See {@link #remove(int)} for an important warning + * @param o + * @return + */ + @Override + public boolean remove(final Object o) { + checkImmutability(); + invalidateSampleNameMap(); + invalidateSampleOrdering(); + return getGenotypes().remove(o); + } + + @Override + public boolean removeAll(final Collection objects) { + checkImmutability(); + invalidateSampleNameMap(); + invalidateSampleOrdering(); + return getGenotypes().removeAll(objects); + } + + @Override + public boolean retainAll(final Collection objects) { + checkImmutability(); + invalidateSampleNameMap(); + invalidateSampleOrdering(); + return getGenotypes().retainAll(objects); + } + + @Override + @Ensures("noDups(getGenotypes())") + public Genotype set(final int i, final Genotype genotype) { + checkImmutability(); + final Genotype prev = getGenotypes().set(i, genotype); + + invalidateSampleOrdering(); + if ( sampleNameToOffset != null ) { + // update the name map by removing the old entry and replacing it with the new one + sampleNameToOffset.remove(prev.getSampleName()); + sampleNameToOffset.put(genotype.getSampleName(), i); + } + + return prev; + } + + /** + * Replaces the genotype in this context -- note for efficiency + * reasons we do not add the genotype if it's not present. The + * return value will be null, indicating this happened. + * + * Note this operation preserves the map cache Sample -> Offset but + * invalidates the sorted list of samples. Using replace within a loop + * containing any of the SampleNameInOrder operations requires an O(n log n) + * resorting after each replace operation. 
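A sketch of the bulk pattern these notes recommend, paying the cache invalidation once via retainAll (gc and wantedSamples are assumed to be in scope):

// Sketch: keep only the wanted samples with a single cache invalidation.
final List<Genotype> keep = new ArrayList<Genotype>();
for (final String sample : wantedSamples)
    if (gc.containsSample(sample))    // cache stays valid while we only read
        keep.add(gc.get(sample));
gc.retainAll(keep);                   // one invalidation, instead of one per remove()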
+ * + * @param genotype a non null genotype to bind in this context + * @return null if genotype was not added, otherwise returns the previous genotype + */ + @Requires("genotype != null") + public Genotype replace(final Genotype genotype) { + checkImmutability(); + Integer offset = getSampleI(genotype.getSampleName()); + if ( offset == null ) + return null; + else + return set(offset, genotype); + } + + @Override + public List subList(final int i, final int i1) { + return getGenotypes().subList(i, i1); + } + + @Override + public Object[] toArray() { + return getGenotypes().toArray(); + } + + @Override + public T[] toArray(final T[] ts) { + return getGenotypes().toArray(ts); + } + + /** + * Iterate over the Genotypes in this context in the order specified by sampleNamesInOrder + * + * @param sampleNamesInOrder a Iterable of String, containing exactly one entry for each Genotype sample name in + * this context + * @return a Iterable over the genotypes in this context. + */ + @Requires("sampleNamesInOrder != null") + public Iterable iterateInSampleNameOrder(final Iterable sampleNamesInOrder) { + return new Iterable() { + @Override + public Iterator iterator() { + return new InOrderIterator(sampleNamesInOrder.iterator()); + } + }; + } + + /** + * Iterate over the Genotypes in this context in their sample name order (A, B, C) + * regardless of the underlying order in the vector of genotypes + * @return a Iterable over the genotypes in this context. + */ + public Iterable iterateInSampleNameOrder() { + return iterateInSampleNameOrder(getSampleNamesOrderedByName()); + } + + private final class InOrderIterator implements Iterator { + final Iterator sampleNamesInOrder; + + private InOrderIterator(final Iterator sampleNamesInOrder) { + this.sampleNamesInOrder = sampleNamesInOrder; + } + + @Override + public boolean hasNext() { + return sampleNamesInOrder.hasNext(); + } + + @Override + public Genotype next() { + return get(sampleNamesInOrder.next()); + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + } + + /** + * @return The set of sample names for all genotypes in this context, in arbitrary order + */ + @Ensures("result != null") + public Set getSampleNames() { + ensureSampleNameMap(); + return sampleNameToOffset.keySet(); + } + + /** + * @return The set of sample names for all genotypes in this context, in their natural ordering (A, B, C) + */ + @Ensures("result != null") + public List getSampleNamesOrderedByName() { + ensureSampleOrdering(); + return sampleNamesInOrder; + } + + @Requires("sample != null") + public boolean containsSample(final String sample) { + ensureSampleNameMap(); + return sampleNameToOffset.containsKey(sample); + } + + @Requires("samples != null") + public boolean containsSamples(final Collection samples) { + return getSampleNames().containsAll(samples); + } + + /** + * Return a freshly allocated subcontext of this context containing only the samples + * listed in samples. Note that samples can contain names not in this context, they + * will just be ignored. 
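A usage sketch for the subsetting described above (the trio sample names are illustrative):

// Sketch: subset a context to a trio; names absent from the context are simply ignored.
final Set<String> trio = new HashSet<String>(Arrays.asList("NA12878", "NA12891", "NA12892"));
final GenotypesContext trioGC = gc.subsetToSamples(trio);
assert trioGC.size() <= trio.size();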
+ * + * @param samples + * @return + */ + @Requires("samples != null") + @Ensures("result != null") + public GenotypesContext subsetToSamples( final Set samples ) { + final int nSamples = samples.size(); + + if ( nSamples == 0 ) + return NO_GENOTYPES; + else { // nGenotypes < nSamples + final GenotypesContext subset = create(samples.size()); + for ( final String sample : samples ) { + final Genotype g = get(sample); + if ( g != null ) + subset.add(g); + } + return subset; + } + } + + @Override + public String toString() { + final List gS = new ArrayList(); + for ( final Genotype g : this.iterateInSampleNameOrder() ) + gS.add(g.toString()); + return "[" + join(",", gS) + "]"; + } + + // copied from Utils + private static String join(final String separator, final Collection objects) { + if (objects.isEmpty()) { // fast path for empty collection + return ""; + } else { + final Iterator iter = objects.iterator(); + final T first = iter.next(); + + if ( ! iter.hasNext() ) // fast path for singleton collections + return first.toString(); + else { // full path for 2+ collection that actually need a join + final StringBuilder ret = new StringBuilder(first.toString()); + while(iter.hasNext()) { + ret.append(separator); + ret.append(iter.next().toString()); + } + return ret.toString(); + } + } + } + + protected final static boolean noDups(Collection genotypes) { + Set names = new HashSet(genotypes.size()); + for ( final Genotype g : genotypes ) { + if ( names.contains(g.getSampleName()) ) + return false; + names.add(g.getSampleName()); + } + + return true; + } + + protected final static boolean sameSamples(List genotypes, Collection sampleNamesInOrder) { + Set names = new HashSet(sampleNamesInOrder); + if ( names.size() != sampleNamesInOrder.size() ) + return false; + if ( genotypes.size() != names.size() ) + return false; + + for ( final Genotype g : genotypes ) + if ( ! names.contains(g.getSampleName()) ) + return false; + + return true; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/LazyGenotypesContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/LazyGenotypesContext.java new file mode 100644 index 0000000000..ce0422352d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/LazyGenotypesContext.java @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.variantcontext; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * Lazy-loading GenotypesContext. A lazy-loading context has access to the + * VCFParser and an unparsed string of genotype data. If the user attempts to manipulate + * the genotypes contained in this context, we decode the data and become a full-blown + * GenotypesContext. However, if the user never does this we are spared a lot of expense + * decoding the genotypes unnecessarily. + */ +public class LazyGenotypesContext extends GenotypesContext { + /** The LazyParser we'll use to decode unparsedGenotypeData if necessary */ + final LazyParser parser; + + Object unparsedGenotypeData; + + /** + * the number of genotypes contained in the unparsedGenotypeData + * (already known to the parser). Useful for isEmpty and size() optimizations + */ + final int nUnparsedGenotypes; + + /** + * True if we've already decoded the values in unparsedGenotypeData + */ + boolean loaded = false; + + private final static ArrayList EMPTY = new ArrayList(0); + + /** + * Simple lazy parser interface. Provide an object implementing this + * interface to LazyGenotypesContext, and its parse() method will be called + * when the use of the lazy context requires the underlying genotypes data + * be parsed into Genotype objects. The data argument is the data provided + * to the LazyGenotypesContext holding encoded genotypes data + */ + public interface LazyParser { + @Requires("data != null") + @Ensures("result != null") + public LazyData parse(Object data); + } + + /** + * Returns the data used in the full GenotypesContext constructor + * + * {@link GenotypesContext#GenotypesContext(java.util.ArrayList, java.util.Map, java.util.List)} + */ + public static class LazyData { + final ArrayList genotypes; + final Map sampleNameToOffset; + final List sampleNamesInOrder; + + @Requires({"genotypes != null", "sampleNamesInOrder != null", "sampleNameToOffset != null", + "sameSamples(genotypes, sampleNamesInOrder)", + "sameSamples(genotypes, sampleNameToOffset.keySet())"}) + public LazyData(final ArrayList genotypes, + final List sampleNamesInOrder, + final Map sampleNameToOffset) { + this.genotypes = genotypes; + this.sampleNamesInOrder = sampleNamesInOrder; + this.sampleNameToOffset = sampleNameToOffset; + } + } + + /** + * Creates a new lazy-loading genotypes context using the LazyParser to create + * genotypes data on demand. + * + * @param parser the parser to be used to load on-demand genotypes data + * @param unparsedGenotypeData the encoded genotypes data that we will decode if necessary + * @param nUnparsedGenotypes the number of genotypes that will be produced if / when we actually decode the genotypes data + */ + @Requires({"parser != null", "unparsedGenotypeData != null", "nUnparsedGenotypes >= 0"}) + public LazyGenotypesContext(final LazyParser parser, final Object unparsedGenotypeData, final int nUnparsedGenotypes) { + super(EMPTY); + this.parser = parser; + this.unparsedGenotypeData = unparsedGenotypeData; + this.nUnparsedGenotypes = nUnparsedGenotypes; + } + + /** + * Overrides the genotypes accessor. If we haven't already, decode the genotypes data + * and store the decoded results in the appropriate variables. Otherwise we just + * return the decoded result directly. 
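A sketch of what a LazyParser implementation looks like from the caller's side (decodeGenotypes, rawGenotypeField, and nSamples are assumptions for illustration; in this patch the VCF codec is what actually wires this up):

// Sketch: defer genotype decoding until the context is actually queried.
final LazyGenotypesContext.LazyParser parser = new LazyGenotypesContext.LazyParser() {
    public LazyGenotypesContext.LazyData parse(final Object data) {
        return decodeGenotypes((String) data);   // hypothetical decoder producing LazyData
    }
};
final GenotypesContext gc = new LazyGenotypesContext(parser, rawGenotypeField, nSamples);
int n = gc.size();                               // answered from nSamples, no decoding yet
Genotype g = gc.get("NA12878");                  // first real access triggers parse() once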
Note some care needs to be taken here as + * the value in notToBeDirectlyAccessedGenotypes may diverge from what would be produced + * by decode, if after the first decode the genotypes themselves are replaced + * @return + */ + @Override + @Ensures("result != null") + protected ArrayList getGenotypes() { + decode(); + return notToBeDirectlyAccessedGenotypes; + } + + /** + * Force us to decode the genotypes, if not already done + */ + public void decode() { + if ( ! loaded ) { + //System.out.printf("Loading genotypes... %s:%d%n", contig, start); + LazyData parsed = parser.parse(unparsedGenotypeData); + notToBeDirectlyAccessedGenotypes = parsed.genotypes; + sampleNamesInOrder = parsed.sampleNamesInOrder; + sampleNameToOffset = parsed.sampleNameToOffset; + loaded = true; + unparsedGenotypeData = null; // don't hold the unparsed data any longer + + // warning -- this path allows us to create a VariantContext that doesn't run validateGenotypes() + // That said, it's not such an important routine -- it's just checking that the genotypes + // are well formed w.r.t. the alleles list, but this will be enforced within the VCFCodec + } + } + + /** + * Overrides the ensure* functionality. If the data hasn't been loaded + * yet and we want to build the cache, just decode it and we're done. If we've + * already decoded the data, though, go through the super class + */ + @Override + protected synchronized void ensureSampleNameMap() { + if ( ! loaded ) { + decode(); // will load up all of the necessary data + } else { + super.ensureSampleNameMap(); + } + } + + @Override + protected synchronized void ensureSampleOrdering() { + if ( ! loaded ) { + decode(); // will load up all of the necessary data + } else { + super.ensureSampleOrdering(); + } + } + + @Override + protected void invalidateSampleNameMap() { + // if the cache is invalidated, and we haven't loaded our data yet, do so + if ( ! loaded ) decode(); + super.invalidateSampleNameMap(); + } + + @Override + protected void invalidateSampleOrdering() { + // if the cache is invalidated, and we haven't loaded our data yet, do so + if ( ! loaded ) decode(); + super.invalidateSampleOrdering(); + } + + @Override + public boolean isEmpty() { + // optimization -- we know the number of samples in the unparsed data, so use it here to + // avoid parsing just to know if the genotypes context is empty + return loaded ? super.isEmpty() : nUnparsedGenotypes == 0; + } + + @Override + public int size() { + // optimization -- we know the number of samples in the unparsed data, so use it here to + // avoid parsing just to know the size of the context + return loaded ? super.size() : nUnparsedGenotypes; + } + + public Object getUnparsedGenotypeData() { + return unparsedGenotypeData; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableGenotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableGenotype.java deleted file mode 100755 index 14419a2a0d..0000000000 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableGenotype.java +++ /dev/null @@ -1,68 +0,0 @@ -package org.broadinstitute.sting.utils.variantcontext; - -import java.util.*; - -/** - * This class emcompasses all the basic information about a genotype. It is immutable. 
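With MutableGenotype removed, in-place edits become copy-style operations on the immutable Genotype, using the copy constructor and modify* statics added earlier in this patch (g, newAttrs, and newAlleles are assumed to be in scope):

// Sketch: the immutable replacements for the deleted mutators.
final Genotype renamed = new Genotype("otherSample", g);              // copy under a new sample name
final Genotype reannotated = Genotype.modifyAttributes(g, newAttrs);  // swap the attribute map
final Genotype realleled = Genotype.modifyAlleles(g, newAlleles);     // swap the allele list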
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableGenotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableGenotype.java
deleted file mode 100755
index 14419a2a0d..0000000000
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableGenotype.java
+++ /dev/null
@@ -1,68 +0,0 @@
-package org.broadinstitute.sting.utils.variantcontext;
-
-import java.util.*;
-
-/**
- * This class emcompasses all the basic information about a genotype. It is immutable.
- *
- * @author Mark DePristo
- */
-public class MutableGenotype extends Genotype {
-    public MutableGenotype(Genotype parent) {
-        super(parent.getSampleName(), parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.isPhased());
-    }
-
-    public MutableGenotype(String sampleName, Genotype parent) {
-        super(sampleName, parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.isPhased());
-    }
-
-
-    public MutableGenotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean genotypesArePhased) {
-        super(sampleName, alleles, negLog10PError, filters, attributes, genotypesArePhased);
-    }
-
-    public MutableGenotype(String sampleName, List alleles, double negLog10PError) {
-        super(sampleName, alleles, negLog10PError);
-    }
-
-    public MutableGenotype(String sampleName, List alleles) {
-        super(sampleName, alleles);
-    }
-
-    public Genotype unmodifiableGenotype() {
-        return new Genotype(getSampleName(), getAlleles(), getNegLog10PError(), getFilters(), getAttributes(), isPhased());
-    }
-
-
-    /**
-     *
-     * @param alleles list of alleles
-     */
-    public void setAlleles(List alleles) {
-        this.alleles = new ArrayList(alleles);
-        validate();
-    }
-
-    public void setPhase(boolean isPhased) {
-        super.isPhased = isPhased;
-    }
-
-    // ---------------------------------------------------------------------------------------------------------
-    //
-    // InferredGeneticContext mutation operators
-    //
-    // ---------------------------------------------------------------------------------------------------------
-    public void setName(String name) { commonInfo.setName(name); }
-    public void addFilter(String filter) { commonInfo.addFilter(filter); }
-    public void addFilters(Collection filters) { commonInfo.addFilters(filters); }
-    public void clearFilters() { commonInfo.clearFilters(); }
-    public void setFilters(Collection filters) { commonInfo.setFilters(filters); }
-    public void setAttributes(Map map) { commonInfo.setAttributes(map); }
-    public void clearAttributes() { commonInfo.clearAttributes(); }
-    public void putAttribute(String key, Object value) { commonInfo.putAttribute(key, value); }
-    public void removeAttribute(String key) { commonInfo.removeAttribute(key); }
-    public void putAttributes(Map map) { commonInfo.putAttributes(map); }
-    public void setNegLog10PError(double negLog10PError) { commonInfo.setNegLog10PError(negLog10PError); }
-    public void putAttribute(String key, Object value, boolean allowOverwrites) { commonInfo.putAttribute(key, value, allowOverwrites); }
-
-}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableVariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableVariantContext.java
deleted file mode 100755
index a752f4a1b1..0000000000
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableVariantContext.java
+++ /dev/null
@@ -1,213 +0,0 @@
-package org.broadinstitute.sting.utils.variantcontext;
-
-
-import java.util.Collection;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeMap;
-
-/**
- * Mutable version of VariantContext
- *
- * @author depristo
- */
-public class MutableVariantContext extends VariantContext {
-    // ---------------------------------------------------------------------------------------------------------
-    //
-    // constructors
-    //
-    //
--------------------------------------------------------------------------------------------------------- - - public MutableVariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes, double negLog10PError, Set filters, Map attributes) { - super(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes); - } - - public MutableVariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes) { - super(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes); - } - - public MutableVariantContext(String source, String contig, long start, long stop, Collection alleles) { - super(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); - } - - public MutableVariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes) { - super(source, contig, start, stop, alleles, genotypes, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); - } - - public MutableVariantContext(VariantContext parent) { - super(parent.getSource(), parent.contig, parent.start, parent.stop, parent.getAlleles(), parent.getGenotypes(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.getReferenceBaseForIndel()); - } - - /** - * Sets the alleles segregating in this context to the collect of alleles. Each of which must be unique according - * to equals() in Allele. Validate() should be called when you are done modifying the context. - * - * @param alleles - */ - public void setAlleles(Collection alleles) { - this.alleles.clear(); - for ( Allele a : alleles ) - addAllele(a); - } - - /** - * Adds allele to the segregating allele list in this context to the collection of alleles. The new - * allele must be be unique according to equals() in Allele. - * Validate() should be called when you are done modifying the context. - * - * @param allele - */ - public void addAllele(Allele allele) { - final boolean allowDuplicates = false; // used to be a parameter - - type = null; - - for ( Allele a : alleles ) { - if ( a.basesMatch(allele) && ! allowDuplicates ) - throw new IllegalArgumentException("Duplicate allele added to VariantContext" + this); - } - - // we are a novel allele - alleles.add(allele); - } - - public void clearGenotypes() { - genotypes = new TreeMap(); - } - - /** - * Adds this single genotype to the context, not allowing duplicate genotypes to be added - * @param genotype - */ - public void addGenotypes(Genotype genotype) { - putGenotype(genotype.getSampleName(), genotype, false); - } - - /** - * Adds these genotypes to the context, not allowing duplicate genotypes to be added - * @param genotypes - */ - public void addGenotypes(Collection genotypes) { - for ( Genotype g : genotypes ) { - addGenotype(g); - } - } - - /** - * Adds these genotype to the context, not allowing duplicate genotypes to be added. - * @param genotypes - */ - public void addGenotypes(Map genotypes) { - - for ( Map.Entry elt : genotypes.entrySet() ) { - addGenotype(elt.getValue()); - } - } - - /** - * Adds these genotypes to the context. - * - * @param genotypes - */ - public void putGenotypes(Map genotypes) { - for ( Map.Entry g : genotypes.entrySet() ) - putGenotype(g.getKey(), g.getValue()); - } - - /** - * Adds these genotypes to the context. 
- * - * @param genotypes - */ - public void putGenotypes(Collection genotypes) { - for ( Genotype g : genotypes ) - putGenotype(g); - } - - /** - * Adds this genotype to the context, throwing an error if it's already bound. - * - * @param genotype - */ - public void addGenotype(Genotype genotype) { - addGenotype(genotype.getSampleName(), genotype); - } - - /** - * Adds this genotype to the context, throwing an error if it's already bound. - * - * @param genotype - */ - public void addGenotype(String sampleName, Genotype genotype) { - putGenotype(sampleName, genotype, false); - } - - /** - * Adds this genotype to the context. - * - * @param genotype - */ - public void putGenotype(Genotype genotype) { - putGenotype(genotype.getSampleName(), genotype); - } - - /** - * Adds this genotype to the context. - * - * @param genotype - */ - public void putGenotype(String sampleName, Genotype genotype) { - putGenotype(sampleName, genotype, true); - } - - private void putGenotype(String sampleName, Genotype genotype, boolean allowOverwrites) { - if ( hasGenotype(sampleName) && ! allowOverwrites ) - throw new IllegalStateException("Attempting to overwrite sample->genotype binding: " + sampleName + " this=" + this); - - if ( ! sampleName.equals(genotype.getSampleName()) ) - throw new IllegalStateException("Sample name doesn't equal genotype.getSample(): " + sampleName + " genotype=" + genotype); - - this.genotypes.put(sampleName, genotype); - } - - /** - * Removes the binding from sampleName to genotype. If this doesn't exist, throws an IllegalArgumentException - * @param sampleName - */ - public void removeGenotype(String sampleName) { - if ( ! this.genotypes.containsKey(sampleName) ) - throw new IllegalArgumentException("Sample name isn't contained in genotypes " + sampleName + " genotypes =" + genotypes); - - this.genotypes.remove(sampleName); - } - - /** - * Removes genotype from the context. 
If this doesn't exist, throws an IllegalArgumentException - * @param genotype - */ - public void removeGenotype(Genotype genotype) { - removeGenotype(genotype.getSampleName()); - } - - // todo -- add replace genotype routine - - // --------------------------------------------------------------------------------------------------------- - // - // InferredGeneticContext mutation operators - // - // --------------------------------------------------------------------------------------------------------- - - public void setSource(String source) { commonInfo.setName(source); } - public void addFilter(String filter) { commonInfo.addFilter(filter); } - public void addFilters(Collection filters) { commonInfo.addFilters(filters); } - public void clearFilters() { commonInfo.clearFilters(); } - public void setFilters(Collection filters) { commonInfo.setFilters(filters); } - public void setAttributes(Map map) { commonInfo.setAttributes(map); } - public void clearAttributes() { commonInfo.clearAttributes(); } - public void putAttribute(String key, Object value) { commonInfo.putAttribute(key, value); } - public void removeAttribute(String key) { commonInfo.removeAttribute(key); } - public void putAttributes(Map map) { commonInfo.putAttributes(map); } - public void setNegLog10PError(double negLog10PError) { commonInfo.setNegLog10PError(negLog10PError); } - public void putAttribute(String key, Object value, boolean allowOverwrites) { commonInfo.putAttribute(key, value, allowOverwrites); } - public void setID(String id) { putAttribute(ID_KEY, id, true); } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index f52a7087b0..247e412ddb 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -4,7 +4,6 @@ import org.broad.tribble.TribbleException; import org.broad.tribble.util.ParsingUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.codecs.vcf.VCFParser; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.*; @@ -130,17 +129,17 @@ * *

  * vc.hasGenotypes()
- * vc.isMonomorphic()
- * vc.isPolymorphic()
+ * vc.isMonomorphicInSamples()
+ * vc.isPolymorphicInSamples()
  * vc.getSamples().size()
  *
  * vc.getGenotypes()
  * vc.getGenotypes().get("g1")
  * vc.hasGenotype("g1")
  *
- * vc.getChromosomeCount()
- * vc.getChromosomeCount(Aref)
- * vc.getChromosomeCount(T)
+ * vc.getCalledChrCount()
+ * vc.getCalledChrCount(Aref)
+ * vc.getCalledChrCount(T)
  * 
* * === NO_CALL alleles === @@ -162,20 +161,21 @@ * @author depristo */ public class VariantContext implements Feature { // to enable tribble intergration - protected InferredGeneticContext commonInfo = null; - public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR; - public final static String UNPARSED_GENOTYPE_MAP_KEY = "_UNPARSED_GENOTYPE_MAP_"; - public final static String UNPARSED_GENOTYPE_PARSER_KEY = "_UNPARSED_GENOTYPE_PARSER_"; - public final static String ID_KEY = "ID"; + protected CommonInfo commonInfo = null; + public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR; + + @Deprecated // ID is no longer stored in the attributes map + private final static String ID_KEY = "ID"; private final Byte REFERENCE_BASE_FOR_INDEL; public final static Set PASSES_FILTERS = Collections.unmodifiableSet(new LinkedHashSet()); /** The location of this VariantContext */ - protected String contig; - protected long start; - protected long stop; + final protected String contig; + final protected long start; + final protected long stop; + private final String ID; /** The type (cached for performance reasons) of this context */ protected Type type = null; @@ -184,12 +184,12 @@ public class VariantContext implements Feature { // to enable tribble intergrati final protected List alleles; /** A mapping from sampleName -> genotype objects for all genotypes associated with this context */ - protected Map genotypes = null; + protected GenotypesContext genotypes = null; /** Counts for each of the possible Genotype types in this context */ protected int[] genotypeCounts = null; - public final static Map NO_GENOTYPES = Collections.unmodifiableMap(new HashMap()); + public final static GenotypesContext NO_GENOTYPES = GenotypesContext.NO_GENOTYPES; // a fast cached access point to the ref / alt alleles for biallelic case private Allele REF = null; @@ -197,124 +197,41 @@ public class VariantContext implements Feature { // to enable tribble intergrati // set to the alt allele when biallelic, otherwise == null private Allele ALT = null; - // were filters applied? - private boolean filtersWereAppliedToContext; + /* cached monomorphic value: null -> not yet computed, False, True */ + private Boolean monomorphic = null; // --------------------------------------------------------------------------------------------------------- // - // constructors + // validation mode // // --------------------------------------------------------------------------------------------------------- - - /** - * the complete constructor. Makes a complete VariantContext from its arguments - * This is the only constructor that is able to create indels! DO NOT USE THE OTHER ONES. - * - * @param source source - * @param contig the contig - * @param start the start base (one based) - * @param stop the stop reference base (one based) - * @param alleles alleles - * @param genotypes genotypes map - * @param negLog10PError qual - * @param filters filters: use null for unfiltered and empty set for passes filters - * @param attributes attributes - * @param referenceBaseForIndel padded reference base - */ - public VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes, Byte referenceBaseForIndel) { - this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, referenceBaseForIndel, false); - } - - /** - * the complete constructor. 
Makes a complete VariantContext from its arguments - * - * @param source source - * @param contig the contig - * @param start the start base (one based) - * @param stop the stop reference base (one based) - * @param alleles alleles - * @param genotypes genotypes map - * @param negLog10PError qual - * @param filters filters: use null for unfiltered and empty set for passes filters - * @param attributes attributes - */ - public VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes) { - this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, null, false); - } - - /** - * Makes a VariantContext from its arguments without parsing the genotypes. - * Note that this constructor assumes that if there is genotype data, then it's been put into - * the attributes with the UNPARSED_GENOTYPE_MAP_KEY and that the codec has been added with the - * UNPARSED_GENOTYPE_PARSER_KEY. It doesn't validate that this is the case because it's possible - * that there is no genotype data. - * - * @param source source - * @param contig the contig - * @param start the start base (one based) - * @param stop the stop reference base (one based) - * @param alleles alleles - * @param negLog10PError qual - * @param filters filters: use null for unfiltered and empty set for passes filters - * @param attributes attributes - * @param referenceBaseForIndel padded reference base - */ - public VariantContext(String source, String contig, long start, long stop, Collection alleles, double negLog10PError, Set filters, Map attributes, Byte referenceBaseForIndel) { - this(source, contig, start, stop, alleles, NO_GENOTYPES, negLog10PError, filters, attributes, referenceBaseForIndel, true); + public enum Validation { + REF_PADDING, + ALLELES, + GENOTYPES } - /** - * Create a new VariantContext - * - * @param source source - * @param contig the contig - * @param start the start base (one based) - * @param stop the stop reference base (one based) - * @param alleles alleles - * @param genotypes genotypes set - * @param negLog10PError qual - * @param filters filters: use null for unfiltered and empty set for passes filters - * @param attributes attributes - */ - public VariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes, double negLog10PError, Set filters, Map attributes) { - this(source, contig, start, stop, alleles, genotypes != null ? 
genotypeCollectionToMap(new TreeMap(), genotypes) : null, negLog10PError, filters, attributes, null, false); - } - - /** - * Create a new variant context without genotypes and no Perror, no filters, and no attributes - * - * @param source source - * @param contig the contig - * @param start the start base (one based) - * @param stop the stop reference base (one based) - * @param alleles alleles - */ - public VariantContext(String source, String contig, long start, long stop, Collection alleles) { - this(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, null, false); - } + private final static EnumSet ALL_VALIDATION = EnumSet.allOf(Validation.class); + private final static EnumSet NO_VALIDATION = EnumSet.noneOf(Validation.class); - /** - * Create a new variant context with genotypes but without Perror, filters, and attributes - * - * @param source source - * @param contig the contig - * @param start the start base (one based) - * @param stop the stop reference base (one based) - * @param alleles alleles - * @param genotypes genotypes - */ - public VariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes) { - this(source, contig, start, stop, alleles, genotypes, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); - } + // --------------------------------------------------------------------------------------------------------- + // + // constructors: see VariantContextBuilder + // + // --------------------------------------------------------------------------------------------------------- /** * Copy constructor * * @param other the VariantContext to copy */ - public VariantContext(VariantContext other) { - this(other.getSource(), other.getChr(), other.getStart(), other.getEnd() , other.getAlleles(), other.getGenotypes(), other.getNegLog10PError(), other.filtersWereApplied() ? 
other.getFilters() : null, other.getAttributes(), other.REFERENCE_BASE_FOR_INDEL, false);
+    protected VariantContext(VariantContext other) {
+        this(other.getSource(), other.getID(), other.getChr(), other.getStart(), other.getEnd(),
+                other.getAlleles(), other.getGenotypes(), other.getLog10PError(),
+                other.getFiltersMaybeNull(),
+                other.getAttributes(), other.REFERENCE_BASE_FOR_INDEL,
+                NO_VALIDATION);
     }
 
     /**
@@ -326,40 +243,44 @@ public VariantContext(VariantContext other) {
      * @param stop  the stop reference base (one based)
      * @param alleles alleles
      * @param genotypes genotypes map
-     * @param negLog10PError qual
+     * @param log10PError qual
      * @param filters filters: use null for unfiltered and empty set for passes filters
      * @param attributes attributes
      * @param referenceBaseForIndel padded reference base
-     * @param genotypesAreUnparsed true if the genotypes have not yet been parsed
+     * @param validationToPerform set of validation steps to take
      */
-    private VariantContext(String source, String contig, long start, long stop,
-                           Collection alleles, Map genotypes,
-                           double negLog10PError, Set filters, Map attributes,
-                           Byte referenceBaseForIndel, boolean genotypesAreUnparsed) {
+    protected VariantContext(String source, String ID,
+                             String contig, long start, long stop,
+                             Collection alleles, GenotypesContext genotypes,
+                             double log10PError, Set filters, Map attributes,
+                             Byte referenceBaseForIndel,
+                             EnumSet validationToPerform ) {
         if ( contig == null ) { throw new IllegalArgumentException("Contig cannot be null"); }
         this.contig = contig;
         this.start = start;
         this.stop = stop;
 
-        if ( !genotypesAreUnparsed && attributes != null ) {
-            if ( attributes.containsKey(UNPARSED_GENOTYPE_MAP_KEY) )
-                attributes.remove(UNPARSED_GENOTYPE_MAP_KEY);
-            if ( attributes.containsKey(UNPARSED_GENOTYPE_PARSER_KEY) )
-                attributes.remove(UNPARSED_GENOTYPE_PARSER_KEY);
-        }
+        // share the EMPTY_ID_FIELD constant for efficiency; equals calls would generate an NPE if ID were inappropriately passed in as null
+        if ( ID == null || ID.equals("") ) throw new IllegalArgumentException("ID field cannot be null or the empty string");
+        this.ID = ID.equals(VCFConstants.EMPTY_ID_FIELD) ? VCFConstants.EMPTY_ID_FIELD : ID;
 
-        this.commonInfo = new InferredGeneticContext(source, negLog10PError, filters, attributes);
-        filtersWereAppliedToContext = filters != null;
+        this.commonInfo = new CommonInfo(source, log10PError, filters, attributes);
         REFERENCE_BASE_FOR_INDEL = referenceBaseForIndel;
 
+        // todo -- remove me when this check is no longer necessary
+        if ( this.commonInfo.hasAttribute(ID_KEY) )
+            throw new IllegalArgumentException("Trying to create a VariantContext with an ID key. Please use the provided constructor argument ID");
+
         if ( alleles == null ) { throw new IllegalArgumentException("Alleles cannot be null"); }
 
         // we need to make this a LinkedHashSet in case the user prefers a given ordering of alleles
         this.alleles = makeAlleles(alleles);
 
-        if ( genotypes == null ) { genotypes = NO_GENOTYPES; }
-        this.genotypes = Collections.unmodifiableMap(genotypes);
+        if ( genotypes == null || genotypes == NO_GENOTYPES ) {
+            this.genotypes = NO_GENOTYPES;
+        } else {
+            this.genotypes = genotypes.immutable();
+        }
 
         // cache the REF and ALT alleles
         int nAlleles = alleles.size();
@@ -371,39 +292,9 @@ private VariantContext(String source, String contig, long start, long stop,
             }
         }
 
-        validate();
-    }
-
-    // ---------------------------------------------------------------------------------------------------------
-    //
-    // Partial-cloning routines (because Variant Context is immutable).
-    // Note that we don't call vc.getGenotypes() because that triggers the lazy loading.
-    // Also note that we need to create a new attributes map because it's unmodifiable and the constructor may try to modify it.
-    //
-    // ---------------------------------------------------------------------------------------------------------
-
-    public static VariantContext modifyGenotypes(VariantContext vc, Map genotypes) {
-        return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), false);
-    }
-
-    public static VariantContext modifyLocation(VariantContext vc, String chr, int start, int end) {
-        return new VariantContext(vc.getSource(), chr, start, end, vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), true);
-    }
-
-    public static VariantContext modifyFilters(VariantContext vc, Set filters) {
-        return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd() , vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), filters, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), true);
-    }
-
-    public static VariantContext modifyAttributes(VariantContext vc, Map attributes) {
-        return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, attributes, vc.getReferenceBaseForIndel(), true);
-    }
-
-    public static VariantContext modifyReferencePadding(VariantContext vc, Byte b) {
-        return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, vc.getAttributes(), b, true);
-    }
-
-    public static VariantContext modifyPErrorFiltersAndAttributes(VariantContext vc, double negLog10PError, Set filters, Map attributes) {
-        return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, negLog10PError, filters, attributes, vc.getReferenceBaseForIndel(), true);
+        if ( !
validationToPerform.isEmpty() ) { + validate(validationToPerform); + } } // --------------------------------------------------------------------------------------------------------- @@ -412,55 +303,32 @@ public static VariantContext modifyPErrorFiltersAndAttributes(VariantContext vc, // // --------------------------------------------------------------------------------------------------------- - /** - * Returns a context identical to this (i.e., filter, qual are all the same) but containing only the Genotype - * genotype and alleles in genotype. This is the right way to test if a single genotype is actually - * variant or not. - * - * @param genotype genotype - * @return vc subcontext - */ - public VariantContext subContextFromGenotypes(Genotype genotype) { - return subContextFromGenotypes(Arrays.asList(genotype)); + public VariantContext subContextFromSamples(Set sampleNames, Collection alleles) { + VariantContextBuilder builder = new VariantContextBuilder(this); + return builder.genotypes(genotypes.subsetToSamples(sampleNames)).alleles(alleles).make(); } - - /** - * Returns a context identical to this (i.e., filter, qual are all the same) but containing only the Genotypes - * genotypes and alleles in these genotypes. This is the right way to test if a single genotype is actually - * variant or not. - * - * @param genotypes genotypes - * @return vc subcontext - */ - public VariantContext subContextFromGenotypes(Collection genotypes) { - return subContextFromGenotypes(genotypes, allelesOfGenotypes(genotypes)) ; + public VariantContext subContextFromSamples(Set sampleNames) { + VariantContextBuilder builder = new VariantContextBuilder(this); + GenotypesContext newGenotypes = genotypes.subsetToSamples(sampleNames); + return builder.genotypes(newGenotypes).alleles(allelesOfGenotypes(newGenotypes)).make(); } - /** - * Returns a context identical to this (i.e., filter, qual are all the same) but containing only the Genotypes - * genotypes. Also, the resulting variant context will contain the alleles provided, not only those found in genotypes - * - * @param genotypes genotypes - * @param alleles the set of allele segregating alleles at this site. Must include those in genotypes, but may be more - * @return vc subcontext - */ - public VariantContext subContextFromGenotypes(Collection genotypes, Collection alleles) { - return new VariantContext(getSource(), contig, start, stop, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap(), genotypes) : null, getNegLog10PError(), filtersWereApplied() ? 
getFilters() : null, getAttributes(), getReferenceBaseForIndel()); + public VariantContext subContextFromSample(String sampleName) { + return subContextFromSamples(Collections.singleton(sampleName)); } - /** * helper routine for subcontext * @param genotypes genotypes * @return allele set */ - private Set allelesOfGenotypes(Collection genotypes) { - Set alleles = new HashSet(); + private final Set allelesOfGenotypes(Collection genotypes) { + final Set alleles = new HashSet(); boolean addedref = false; - for ( Genotype g : genotypes ) { - for ( Allele a : g.getAlleles() ) { + for ( final Genotype g : genotypes ) { + for ( final Allele a : g.getAlleles() ) { addedref = addedref || a.isReference(); if ( a.isCalled() ) alleles.add(a); @@ -628,11 +496,15 @@ public boolean isMNP() { // --------------------------------------------------------------------------------------------------------- public boolean hasID() { - return commonInfo.hasAttribute(ID_KEY); + return getID() != VCFConstants.EMPTY_ID_FIELD; + } + + public boolean emptyID() { + return ! hasID(); } public String getID() { - return (String)commonInfo.getAttribute(ID_KEY); + return ID; } public boolean hasReferenceBaseForIndel() { @@ -650,12 +522,13 @@ public Byte getReferenceBaseForIndel() { // // --------------------------------------------------------------------------------------------------------- public String getSource() { return commonInfo.getName(); } + public Set getFiltersMaybeNull() { return commonInfo.getFiltersMaybeNull(); } public Set getFilters() { return commonInfo.getFilters(); } public boolean isFiltered() { return commonInfo.isFiltered(); } public boolean isNotFiltered() { return commonInfo.isNotFiltered(); } - public boolean filtersWereApplied() { return filtersWereAppliedToContext; } - public boolean hasNegLog10PError() { return commonInfo.hasNegLog10PError(); } - public double getNegLog10PError() { return commonInfo.getNegLog10PError(); } + public boolean filtersWereApplied() { return commonInfo.filtersWereApplied(); } + public boolean hasLog10PError() { return commonInfo.hasLog10PError(); } + public double getLog10PError() { return commonInfo.getLog10PError(); } public double getPhredScaledQual() { return commonInfo.getPhredScaledQual(); } public Map getAttributes() { return commonInfo.getAttributes(); } @@ -811,35 +684,10 @@ public boolean hasSameAlternateAllelesAs ( VariantContext other ) { // // --------------------------------------------------------------------------------------------------------- - private void loadGenotypes() { - if ( !hasAttribute(UNPARSED_GENOTYPE_MAP_KEY) ) { - if ( genotypes == null ) - genotypes = NO_GENOTYPES; - return; - } - - Object parserObj = getAttribute(UNPARSED_GENOTYPE_PARSER_KEY); - if ( parserObj == null || !(parserObj instanceof VCFParser) ) - throw new IllegalStateException("There is no VCF parser stored to unparse the genotype data"); - VCFParser parser = (VCFParser)parserObj; - - Object mapObj = getAttribute(UNPARSED_GENOTYPE_MAP_KEY); - if ( mapObj == null ) - throw new IllegalStateException("There is no mapping string stored to unparse the genotype data"); - - genotypes = parser.createGenotypeMap(mapObj.toString(), new ArrayList(alleles), getChr(), getStart()); - - commonInfo.removeAttribute(UNPARSED_GENOTYPE_MAP_KEY); - commonInfo.removeAttribute(UNPARSED_GENOTYPE_PARSER_KEY); - - validateGenotypes(); - } - /** * @return the number of samples in the context */ public int getNSamples() { - loadGenotypes(); return genotypes.size(); } @@ -847,31 +695,26 @@ public int 
getNSamples() { * @return true if the context has associated genotypes */ public boolean hasGenotypes() { - loadGenotypes(); - return genotypes.size() > 0; + return ! genotypes.isEmpty(); } public boolean hasGenotypes(Collection sampleNames) { - loadGenotypes(); - for ( String name : sampleNames ) { - if ( ! genotypes.containsKey(name) ) - return false; - } - return true; + return genotypes.containsSamples(sampleNames); } /** * @return set of all Genotypes associated with this context */ - public Map getGenotypes() { - loadGenotypes(); + public GenotypesContext getGenotypes() { return genotypes; } - public List getGenotypesSortedByName() { - loadGenotypes(); - Collection types = new TreeMap(genotypes).values(); - return new ArrayList(types); + public Iterable getGenotypesOrderedByName() { + return genotypes.iterateInSampleNameOrder(); + } + + public Iterable getGenotypesOrderedBy(Iterable sampleOrdering) { + return genotypes.iterateInSampleNameOrder(sampleOrdering); } /** @@ -882,37 +725,38 @@ public List getGenotypesSortedByName() { * @return * @throws IllegalArgumentException if sampleName isn't bound to a genotype */ - public Map getGenotypes(String sampleName) { - return getGenotypes(Arrays.asList(sampleName)); + public GenotypesContext getGenotypes(String sampleName) { + return getGenotypes(Collections.singleton(sampleName)); } /** * Returns a map from sampleName -> Genotype for each sampleName in sampleNames. Returns a map * for consistency with the multi-get function. * + * For testing convenience only + * * @param sampleNames a unique list of sample names * @return * @throws IllegalArgumentException if sampleName isn't bound to a genotype */ - public Map getGenotypes(Collection sampleNames) { - HashMap map = new HashMap(); - - for ( String name : sampleNames ) { - if ( map.containsKey(name) ) throw new IllegalArgumentException("Duplicate names detected in requested samples " + sampleNames); - final Genotype g = getGenotype(name); - if ( g != null ) { - map.put(name, g); - } - } + protected GenotypesContext getGenotypes(Collection sampleNames) { + return getGenotypes().subsetToSamples(new HashSet(sampleNames)); + } - return map; + public GenotypesContext getGenotypes(Set sampleNames) { + return getGenotypes().subsetToSamples(sampleNames); } + /** - * @return the set of all sample names in this context + * @return the set of all sample names in this context, not ordered */ public Set getSampleNames() { - return getGenotypes().keySet(); + return getGenotypes().getSampleNames(); + } + + public List getSampleNamesOrderedByName() { + return getGenotypes().getSampleNamesOrderedByName(); } /** @@ -925,24 +769,25 @@ public Genotype getGenotype(String sample) { } public boolean hasGenotype(String sample) { - return getGenotypes().containsKey(sample); + return getGenotypes().containsSample(sample); } public Genotype getGenotype(int ith) { - return getGenotypesSortedByName().get(ith); + return genotypes.get(ith); } /** - * Returns the number of chromosomes carrying any allele in the genotypes (i.e., excluding NO_CALLS + * Returns the number of chromosomes carrying any allele in the genotypes (i.e., excluding NO_CALLS) * * @return chromosome count */ - public int getChromosomeCount() { + public int getCalledChrCount() { int n = 0; - for ( Genotype g : getGenotypes().values() ) { - n += g.isNoCall() ? 0 : g.getPloidy(); + for ( final Genotype g : getGenotypes() ) { + for ( final Allele a : g.getAlleles() ) + n += a.isNoCall() ? 
0 : 1; } return n; @@ -954,10 +799,10 @@ public int getChromosomeCount() { * @param a allele * @return chromosome count */ - public int getChromosomeCount(Allele a) { + public int getCalledChrCount(Allele a) { int n = 0; - for ( Genotype g : getGenotypes().values() ) { + for ( final Genotype g : getGenotypes() ) { n += g.getAlleles(a).size(); } @@ -970,8 +815,10 @@ public int getChromosomeCount(Allele a) { * * @return true if it's monomorphic */ - public boolean isMonomorphic() { - return ! isVariant() || (hasGenotypes() && getHomRefCount() + getNoCallCount() == getNSamples()); + public boolean isMonomorphicInSamples() { + if ( monomorphic == null ) + monomorphic = ! isVariant() || (hasGenotypes() && getCalledChrCount(getReference()) == getCalledChrCount()); + return monomorphic; } /** @@ -980,25 +827,16 @@ public boolean isMonomorphic() { * * @return true if it's polymorphic */ - public boolean isPolymorphic() { - return ! isMonomorphic(); + public boolean isPolymorphicInSamples() { + return ! isMonomorphicInSamples(); } private void calculateGenotypeCounts() { if ( genotypeCounts == null ) { genotypeCounts = new int[Genotype.Type.values().length]; - for ( Genotype g : getGenotypes().values() ) { - if ( g.isNoCall() ) - genotypeCounts[Genotype.Type.NO_CALL.ordinal()]++; - else if ( g.isHomRef() ) - genotypeCounts[Genotype.Type.HOM_REF.ordinal()]++; - else if ( g.isHet() ) - genotypeCounts[Genotype.Type.HET.ordinal()]++; - else if ( g.isHomVar() ) - genotypeCounts[Genotype.Type.HOM_VAR.ordinal()]++; - else - genotypeCounts[Genotype.Type.MIXED.ordinal()]++; + for ( final Genotype g : getGenotypes() ) { + genotypeCounts[g.getType().ordinal()]++; } } } @@ -1094,7 +932,7 @@ public void validateReferenceBases(Allele reference, Byte paddedRefBase) { } public void validateRSIDs(Set rsIDs) { - if ( rsIDs != null && hasAttribute(VariantContext.ID_KEY) ) { + if ( rsIDs != null && hasID() ) { for ( String id : getID().split(VCFConstants.ID_FIELD_SEPARATOR) ) { if ( id.startsWith("rs") && !rsIDs.contains(id) ) throw new TribbleException.InternalCodecException(String.format("the rsID %s for the record at position %s:%d is not in dbSNP", id, getChr(), getStart())); @@ -1109,7 +947,7 @@ public void validateAlternateAlleles() { List reportedAlleles = getAlleles(); Set observedAlleles = new HashSet(); observedAlleles.add(getReference()); - for ( Genotype g : getGenotypes().values() ) { + for ( final Genotype g : getGenotypes() ) { if ( g.isCalled() ) observedAlleles.addAll(g.getAlleles()); } @@ -1125,19 +963,28 @@ public void validateAlternateAlleles() { } public void validateChromosomeCounts() { - Map observedAttrs = calculateChromosomeCounts(); - // AN if ( hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) { int reportedAN = Integer.valueOf(getAttribute(VCFConstants.ALLELE_NUMBER_KEY).toString()); - int observedAN = (Integer)observedAttrs.get(VCFConstants.ALLELE_NUMBER_KEY); + int observedAN = getCalledChrCount(); if ( reportedAN != observedAN ) throw new TribbleException.InternalCodecException(String.format("the Allele Number (AN) tag is incorrect for the record at position %s:%d, %d vs. 
%d", getChr(), getStart(), reportedAN, observedAN)); } // AC if ( hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) { - List observedACs = (List)observedAttrs.get(VCFConstants.ALLELE_COUNT_KEY); + ArrayList observedACs = new ArrayList(); + + // if there are alternate alleles, record the relevant tags + if ( getAlternateAlleles().size() > 0 ) { + for ( Allele allele : getAlternateAlleles() ) { + observedACs.add(getCalledChrCount(allele)); + } + } + else { // otherwise, set them to 0 + observedACs.add(0); + } + if ( getAttribute(VCFConstants.ALLELE_COUNT_KEY) instanceof List ) { Collections.sort(observedACs); List reportedACs = (List)getAttribute(VCFConstants.ALLELE_COUNT_KEY); @@ -1158,54 +1005,20 @@ public void validateChromosomeCounts() { } } - private Map calculateChromosomeCounts() { - Map attributes = new HashMap(); - - attributes.put(VCFConstants.ALLELE_NUMBER_KEY, getChromosomeCount()); - ArrayList alleleFreqs = new ArrayList(); - ArrayList alleleCounts = new ArrayList(); - - // if there are alternate alleles, record the relevant tags - if ( getAlternateAlleles().size() > 0 ) { - for ( Allele allele : getAlternateAlleles() ) { - alleleCounts.add(getChromosomeCount(allele)); - alleleFreqs.add((double)getChromosomeCount(allele) / (double)getChromosomeCount()); - } - } - // otherwise, set them to 0 - else { - alleleCounts.add(0); - alleleFreqs.add(0.0); - } - - attributes.put(VCFConstants.ALLELE_COUNT_KEY, alleleCounts); - attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFreqs); - return attributes; - } - // --------------------------------------------------------------------------------------------------------- // // validation: the normal validation routines are called automatically upon creation of the VC // // --------------------------------------------------------------------------------------------------------- - /** - * To be called by any modifying routines - */ - private boolean validate() { - return validate(true); - } - - private boolean validate(boolean throwException) { - try { - validateReferencePadding(); - validateAlleles(); - validateGenotypes(); - } catch ( IllegalArgumentException e ) { - if ( throwException ) - throw e; - else - return false; + private boolean validate(final EnumSet validationToPerform) { + for (final Validation val : validationToPerform ) { + switch (val) { + case ALLELES: validateAlleles(); break; + case REF_PADDING: validateReferencePadding(); break; + case GENOTYPES: validateGenotypes(); break; + default: throw new IllegalArgumentException("Unexpected validation mode " + val); + } } return true; @@ -1258,12 +1071,7 @@ private void validateAlleles() { private void validateGenotypes() { if ( this.genotypes == null ) throw new IllegalStateException("Genotypes is null"); - for ( Map.Entry elt : this.genotypes.entrySet() ) { - String name = elt.getKey(); - Genotype g = elt.getValue(); - - if ( ! name.equals(g.getSampleName()) ) throw new IllegalStateException("Bound sample name " + name + " does not equal the name of the genotype " + g.getSampleName()); - + for ( final Genotype g : this.genotypes ) { if ( g.isAvailable() ) { for ( Allele gAllele : g.getAlleles() ) { if ( ! hasAllele(gAllele) && gAllele.isCalled() ) @@ -1352,7 +1160,9 @@ private static Type typeOfBiallelicVariant(Allele ref, Allele allele) { public String toString() { return String.format("[VC %s @ %s of type=%s alleles=%s attr=%s GT=%s", getSource(), contig + ":" + (start - stop == 0 ? 
start : start + "-" + stop), this.getType(), - ParsingUtils.sortList(this.getAlleles()), ParsingUtils.sortedString(this.getAttributes()), this.getGenotypesSortedByName()); + ParsingUtils.sortList(this.getAlleles()), + ParsingUtils.sortedString(this.getAttributes()), + this.getGenotypes()); } // protected basic manipulation routines @@ -1386,16 +1196,6 @@ private static List makeAlleles(Collection alleles) { return alleleList; } - public static Map genotypeCollectionToMap(Map dest, Collection genotypes) { - for ( Genotype g : genotypes ) { - if ( dest.containsKey(g.getSampleName() ) ) - throw new IllegalArgumentException("Duplicate genotype added to VariantContext: " + g); - dest.put(g.getSampleName(), g); - } - - return dest; - } - // --------------------------------------------------------------------------------------------------------- // // tribble integration routines -- not for public consumption @@ -1413,8 +1213,8 @@ public int getEnd() { return (int)stop; } - private boolean hasSymbolicAlleles() { - for (Allele a: getAlleles()) { + public boolean hasSymbolicAlleles() { + for (final Allele a: getAlleles()) { if (a.isSymbolic()) { return true; } @@ -1422,136 +1222,12 @@ private boolean hasSymbolicAlleles() { return false; } - public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC, boolean refBaseShouldBeAppliedToEndOfAlleles) { - - // see if we need to pad common reference base from all alleles - boolean padVC; - - // We need to pad a VC with a common base if the length of the reference allele is less than the length of the VariantContext. - // This happens because the position of e.g. an indel is always one before the actual event (as per VCF convention). - long locLength = (inputVC.getEnd() - inputVC.getStart()) + 1; - if (inputVC.hasSymbolicAlleles()) - padVC = true; - else if (inputVC.getReference().length() == locLength) - padVC = false; - else if (inputVC.getReference().length() == locLength-1) - padVC = true; - else throw new IllegalArgumentException("Badly formed variant context at location " + String.valueOf(inputVC.getStart()) + - " in contig " + inputVC.getChr() + ". 
Reference length must be at most one base shorter than location size"); - - // nothing to do if we don't need to pad bases - if (padVC) { - - if ( !inputVC.hasReferenceBaseForIndel() ) - throw new ReviewedStingException("Badly formed variant context at location " + inputVC.getChr() + ":" + inputVC.getStart() + "; no padded reference base is available."); - - Byte refByte = inputVC.getReferenceBaseForIndel(); - - List alleles = new ArrayList(); - Map genotypes = new TreeMap(); - - Map inputGenotypes = inputVC.getGenotypes(); - - for (Allele a : inputVC.getAlleles()) { - // get bases for current allele and create a new one with trimmed bases - if (a.isSymbolic()) { - alleles.add(a); - } else { - String newBases; - if ( refBaseShouldBeAppliedToEndOfAlleles ) - newBases = a.getBaseString() + new String(new byte[]{refByte}); - else - newBases = new String(new byte[]{refByte}) + a.getBaseString(); - alleles.add(Allele.create(newBases,a.isReference())); - } - } - - // now we can recreate new genotypes with trimmed alleles - for (String sample : inputVC.getSampleNames()) { - Genotype g = inputGenotypes.get(sample); - - List inAlleles = g.getAlleles(); - List newGenotypeAlleles = new ArrayList(); - for (Allele a : inAlleles) { - if (a.isCalled()) { - if (a.isSymbolic()) { - newGenotypeAlleles.add(a); - } else { - String newBases; - if ( refBaseShouldBeAppliedToEndOfAlleles ) - newBases = a.getBaseString() + new String(new byte[]{refByte}); - else - newBases = new String(new byte[]{refByte}) + a.getBaseString(); - newGenotypeAlleles.add(Allele.create(newBases,a.isReference())); - } - } - else { - // add no-call allele - newGenotypeAlleles.add(Allele.NO_CALL); - } - } - genotypes.put(sample, new Genotype(sample, newGenotypeAlleles, g.getNegLog10PError(), - g.getFilters(),g.getAttributes(),g.isPhased())); - - } - - // Do not change the filter state if filters were not applied to this context - Set inputVCFilters = inputVC.filtersWereAppliedToContext ? 
inputVC.getFilters() : null;
-            return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVCFilters, inputVC.getAttributes(),refByte);
-        }
-        else
-            return inputVC;
-
-    }
-
-    public ArrayList getTwoAllelesWithHighestAlleleCounts() {
-        // first idea: get two alleles with highest AC
-        int maxAC1 = 0, maxAC2=0,maxAC1ind =0, maxAC2ind = 0;
-        int i=0;
-        int[] alleleCounts = new int[this.getAlleles().size()];
-        ArrayList alleleArray = new ArrayList();
-        for (Allele a:this.getAlleles()) {
-            int ac = this.getChromosomeCount(a);
-            if (ac >=maxAC1) {
-                maxAC1 = ac;
-                maxAC1ind = i;
-            }
-            alleleArray.add(a);
-            alleleCounts[i++] = ac;
-        }
-        // now get second best allele
-        for (i=0; i < alleleCounts.length; i++) {
-            if (i == maxAC1ind)
-                continue;
-            if (alleleCounts[i] >= maxAC2) {
-                maxAC2 = alleleCounts[i];
-                maxAC2ind = i;
-            }
-        }
-
-        Allele alleleA, alleleB;
-        if (alleleArray.get(maxAC1ind).isReference()) {
-            alleleA = alleleArray.get(maxAC1ind);
-            alleleB = alleleArray.get(maxAC2ind);
-        }
-        else if (alleleArray.get(maxAC2ind).isReference()) {
-            alleleA = alleleArray.get(maxAC2ind);
-            alleleB = alleleArray.get(maxAC1ind);
-        } else {
-            alleleA = alleleArray.get(maxAC1ind);
-            alleleB = alleleArray.get(maxAC2ind);
-        }
-        ArrayList a = new ArrayList();
-        a.add(alleleA);
-        a.add(alleleB);
-        return a;
-    }
 
     public Allele getAltAlleleWithHighestAlleleCount() {
         // first idea: get two alleles with highest AC
         Allele best = null;
         int maxAC1 = 0;
         for (Allele a:this.getAlternateAlleles()) {
-            int ac = this.getChromosomeCount(a);
+            int ac = this.getCalledChrCount(a);
             if (ac >=maxAC1) {
                 maxAC1 = ac;
                 best = a;
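One note on the renamed counting API used just above: getCalledChrCount() replaces getChromosomeCount() and counts called alleles directly rather than summing ploidy over non-no-call genotypes. A sketch of how the new accessors compose to recompute the chromosome-based VCF tags, mirroring what validateChromosomeCounts() now does inline; ChrCountDemo is a hypothetical helper for illustration, not part of the patch:

    import org.broadinstitute.sting.utils.variantcontext.*;

    // Hypothetical demo, not part of the patch
    public class ChrCountDemo {
        // AN is the count of all called chromosomes over all samples;
        // AC is the called count of each alternate allele in turn.
        public static void printChromosomeCounts(final VariantContext vc) {
            System.out.println("AN = " + vc.getCalledChrCount());
            for ( final Allele alt : vc.getAlternateAlleles() )
                System.out.println("AC(" + alt + ") = " + vc.getCalledChrCount(alt));
        }
    }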
+ */ + +package org.broadinstitute.sting.utils.variantcontext; + +import com.google.java.contract.*; +import org.broad.tribble.Feature; +import org.broad.tribble.TribbleException; +import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; + +import java.util.*; + +/** + * Builder class for VariantContext + * + * Some basic assumptions here: + * + * 1 -- data isn't protectively copied. If you provide an attribute map to + * the build, and modify it later, the builder will see this and so will any + * resulting variant contexts. It's best not to modify collections provided + * to a builder. + * + * 2 -- the system uses the standard builder model, allowing the simple construction idiom: + * + * builder.source("a").genotypes(gc).id("x").make() => VariantContext + * + * 3 -- The best way to copy a VariantContext is: + * + * new VariantContextBuilder(vc).make() => a copy of VC + * + * 4 -- validation of arguments is done at the during the final make() call, so a + * VariantContextBuilder can exist in an inconsistent state as long as those issues + * are resolved before the call to make() is issued. + * + * @author depristo + */ +public class VariantContextBuilder { + // required fields + private String source = null; + private String contig = null; + private long start = -1; + private long stop = -1; + private Collection alleles = null; + + // optional -> these are set to the appropriate default value + private String ID = VCFConstants.EMPTY_ID_FIELD; + private GenotypesContext genotypes = GenotypesContext.NO_GENOTYPES; + private double log10PError = VariantContext.NO_LOG10_PERROR; + private Set filters = null; + private Map attributes = null; + private boolean attributesCanBeModified = false; + private Byte referenceBaseForIndel = null; + + /** enum of what must be validated */ + final private EnumSet toValidate = EnumSet.noneOf(VariantContext.Validation.class); + + /** + * Create an empty VariantContextBuilder where all values adopt their default values. Note that + * source, chr, start, stop, and alleles must eventually be filled in, or the resulting VariantContext + * will throw an error. + */ + public VariantContextBuilder() {} + + /** + * Create an empty VariantContextBuilder where all values adopt their default values, but the bare min. + * of info (source, chr, start, stop, and alleles) have been provided to start. + */ + @Requires({"source != null", "contig != null", "start >= 0", "stop >= 0", + "alleles != null && !alleles.isEmpty()"}) + public VariantContextBuilder(String source, String contig, long start, long stop, Collection alleles) { + this.source = source; + this.contig = contig; + this.start = start; + this.stop = stop; + this.alleles = alleles; + toValidate.add(VariantContext.Validation.ALLELES); + } + + /** + * Returns a new builder based on parent -- the new VC will have all fields initialized + * to their corresponding values in parent. 
+     *
+     * @param parent
+     */
+    public VariantContextBuilder(VariantContext parent) {
+        this.alleles = parent.alleles;
+        this.attributes = parent.getAttributes();
+        this.attributesCanBeModified = false;
+        this.contig = parent.contig;
+        this.filters = parent.getFiltersMaybeNull();
+        this.genotypes = parent.genotypes;
+        this.ID = parent.getID();
+        this.log10PError = parent.getLog10PError();
+        this.referenceBaseForIndel = parent.getReferenceBaseForIndel();
+        this.source = parent.getSource();
+        this.start = parent.getStart();
+        this.stop = parent.getEnd();
+    }
+
+    /**
+     * Tells this builder to use this collection of alleles for the resulting VariantContext
+     *
+     * @param alleles
+     * @return this builder
+     */
+    @Requires({"alleles != null", "!alleles.isEmpty()"})
+    public VariantContextBuilder alleles(final Collection alleles) {
+        this.alleles = alleles;
+        toValidate.add(VariantContext.Validation.ALLELES);
+        return this;
+    }
+
+    /**
+     * Tells this builder to use this map of attributes for the resulting VariantContext
+     *
+     * Attributes can be null -> meaning there are no attributes. After
+     * calling this routine the builder assumes it can modify the attributes
+     * object here, if subsequent calls are made to set attribute values
+     * @param attributes
+     */
+    public VariantContextBuilder attributes(final Map attributes) {
+        this.attributes = attributes;
+        this.attributesCanBeModified = true;
+        return this;
+    }
+
+    /**
+     * Puts the key -> value mapping into this builder's attributes
+     *
+     * @param key
+     * @param value
+     * @return
+     */
+    @Requires({"key != null"})
+    @Ensures({"this.attributes.size() == old(this.attributes.size()) || this.attributes.size() == old(this.attributes.size()+1)"})
+    public VariantContextBuilder attribute(final String key, final Object value) {
+        makeAttributesModifiable();
+        attributes.put(key, value);
+        return this;
+    }
+
+    /**
+     * Removes key if present in the attributes
+     *
+     * @param key
+     * @return
+     */
+    @Requires({"key != null"})
+    @Ensures({"this.attributes.size() == old(this.attributes.size()) || this.attributes.size() == old(this.attributes.size()-1)"})
+    public VariantContextBuilder rmAttribute(final String key) {
+        makeAttributesModifiable();
+        attributes.remove(key);
+        return this;
+    }
+
+    /**
+     * Makes the attributes field modifiable. In many cases attributes is just a pointer to an immutable
+     * collection, so methods that want to add / remove records require the attributes to be copied to a
+     * modifiable map first.
+     */
+    private void makeAttributesModifiable() {
+        if ( ! attributesCanBeModified ) {
+            this.attributesCanBeModified = true;
+            // guard against a null attributes field (e.g., in a freshly created builder) before copying
+            this.attributes = attributes == null ? new HashMap() : new HashMap(attributes);
+        }
+    }
+
+    /**
+     * This builder's filters are set to this value
+     *
+     * filters can be null -> meaning there are no filters
+     * @param filters
+     */
+    public VariantContextBuilder filters(final Set filters) {
+        this.filters = filters;
+        return this;
+    }
+
+    /**
+     * {@link #filters}
+     *
+     * @param filters
+     * @return
+     */
+    public VariantContextBuilder filters(final String ...
filters) {
+        filters(new HashSet(Arrays.asList(filters)));
+        return this;
+    }
+
+    /**
+     * Tells this builder that the resulting VariantContext should have PASS filters
+     *
+     * @return
+     */
+    public VariantContextBuilder passFilters() {
+        return filters(VariantContext.PASSES_FILTERS);
+    }
+
+    /**
+     * Tells this builder that the resulting VariantContext should be unfiltered
+     *
+     * @return
+     */
+    public VariantContextBuilder unfiltered() {
+        this.filters = null;
+        return this;
+    }
+
+    /**
+     * Tells this builder that the resulting VariantContext should use this GenotypesContext as its genotypes
+     *
+     * Note that genotypes can be null -> meaning there are no genotypes
+     *
+     * @param genotypes
+     */
+    public VariantContextBuilder genotypes(final GenotypesContext genotypes) {
+        this.genotypes = genotypes;
+        if ( genotypes != null )
+            toValidate.add(VariantContext.Validation.GENOTYPES);
+        return this;
+    }
+
+    public VariantContextBuilder genotypesNoValidation(final GenotypesContext genotypes) {
+        this.genotypes = genotypes;
+        return this;
+    }
+
+    /**
+     * Tells this builder that the resulting VariantContext should use a GenotypesContext containing these genotypes
+     *
+     * Note that genotypes can be null -> meaning there are no genotypes
+     *
+     * @param genotypes
+     */
+    public VariantContextBuilder genotypes(final Collection genotypes) {
+        return genotypes(GenotypesContext.copy(genotypes));
+    }
+
+    /**
+     * Tells this builder that the resulting VariantContext should use a GenotypesContext containing these genotypes
+     * @param genotypes
+     */
+    public VariantContextBuilder genotypes(final Genotype ... genotypes) {
+        return genotypes(GenotypesContext.copy(Arrays.asList(genotypes)));
+    }
+
+    /**
+     * Tells this builder that the resulting VariantContext should not contain any genotypes
+     */
+    public VariantContextBuilder noGenotypes() {
+        this.genotypes = null;
+        return this;
+    }
+
+    /**
+     * Tells us that the resulting VariantContext should have ID
+     * @param ID
+     * @return
+     */
+    @Requires("ID != null")
+    public VariantContextBuilder id(final String ID) {
+        this.ID = ID;
+        return this;
+    }
+
+    /**
+     * Tells us that the resulting VariantContext should not have an ID
+     * @return
+     */
+    public VariantContextBuilder noID() {
+        return id(VCFConstants.EMPTY_ID_FIELD);
+    }
+
+    /**
+     * Tells us that the resulting VariantContext should have log10PError
+     * @param log10PError
+     * @return
+     */
+    @Requires("log10PError <= 0 || log10PError == VariantContext.NO_LOG10_PERROR")
+    public VariantContextBuilder log10PError(final double log10PError) {
+        this.log10PError = log10PError;
+        return this;
+    }
+
+    /**
+     * Tells us that the resulting VariantContext should use this byte for the reference base
+     * Null means no refBase is available
+     * @param referenceBaseForIndel
+     */
+    public VariantContextBuilder referenceBaseForIndel(final Byte referenceBaseForIndel) {
+        this.referenceBaseForIndel = referenceBaseForIndel;
+        toValidate.add(VariantContext.Validation.REF_PADDING);
+        return this;
+    }
+
+    /**
+     * Tells us that the resulting VariantContext should have source field set to source
+     * @param source
+     * @return
+     */
+    @Requires("source != null")
+    public VariantContextBuilder source(final String source) {
+        this.source = source;
+        return this;
+    }
+
+    /**
+     * Tells us that the resulting VariantContext should have the specified location
+     * @param contig
+     * @param start
+     * @param stop
+     * @return
+     */
+    @Requires({"contig != null", "start >= 0", "stop >= 0"})
+    public VariantContextBuilder loc(final String contig, final long start, final long
+    /**
+     * Takes all of the builder data provided up to this point, and instantiates
+     * a freshly allocated VariantContext with all of the builder data.  The
+     * resulting VariantContext is validated as appropriate; if any validation
+     * fails an exception is thrown, otherwise the new VariantContext is returned.
+     *
+     * Note that this function can be called multiple times to create multiple
+     * VariantContexts from the same builder.
+     *
+     * @return a newly built, validated VariantContext
+     */
+    public VariantContext make() {
+        return new VariantContext(source, ID, contig, start, stop, alleles,
+                genotypes, log10PError, filters, attributes,
+                referenceBaseForIndel, toValidate);
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java
index 43f91041f2..c9a4965c1c 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java
@@ -25,13 +25,10 @@
 import com.google.java.contract.Ensures;
 import com.google.java.contract.Requires;
-import net.sf.picard.reference.ReferenceSequenceFile;
-import net.sf.samtools.util.StringUtil;
 import org.apache.commons.jexl2.Expression;
 import org.apache.commons.jexl2.JexlEngine;
 import org.apache.log4j.Logger;
 import org.broad.tribble.util.popgen.HardyWeinbergCalculation;
-import org.broadinstitute.sting.gatk.walkers.phasing.ReadBackedPhasingWalker;
 import org.broadinstitute.sting.utils.BaseUtils;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.GenomeLocParser;
@@ -58,84 +55,38 @@ public class VariantContextUtils {
     }

     /**
-     * Create a new VariantContext
-     *
-     * @param name name
-     * @param loc location
-     * @param alleles alleles
-     * @param genotypes genotypes set
-     * @param negLog10PError qual
-     * @param filters filters: use null for unfiltered and empty set for passes filters
-     * @param attributes attributes
-     * @return VariantContext object
-     */
-    public static VariantContext toVC(String name, GenomeLoc loc, Collection alleles, Collection genotypes, double negLog10PError, Set filters, Map attributes) {
-        return new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes != null ? 
VariantContext.genotypeCollectionToMap(new TreeMap(), genotypes) : null, negLog10PError, filters, attributes); - } - - /** - * Create a new variant context without genotypes and no Perror, no filters, and no attributes - * @param name name - * @param loc location - * @param alleles alleles - * @return VariantContext object - */ - public static VariantContext toVC(String name, GenomeLoc loc, Collection alleles) { - return new VariantContext (name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, VariantContext.NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); - } - - /** - * Create a new variant context without genotypes and no Perror, no filters, and no attributes - * @param name name - * @param loc location - * @param alleles alleles - * @param genotypes genotypes - * @return VariantContext object - */ - public static VariantContext toVC(String name, GenomeLoc loc, Collection alleles, Collection genotypes) { - return new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); - } - - /** - * Copy constructor - * - * @param other the VariantContext to copy - * @return VariantContext object - */ - public static VariantContext toVC(VariantContext other) { - return new VariantContext(other.getSource(), other.getChr(), other.getStart(), other.getEnd(), other.getAlleles(), other.getGenotypes(), other.getNegLog10PError(), other.getFilters(), other.getAttributes()); - } - - /** - * Update the attributes of the attributes map given the VariantContext to reflect the proper chromosome-based VCF tags + * Update the attributes of the attributes map given the VariantContext to reflect the + * proper chromosome-based VCF tags * * @param vc the VariantContext * @param attributes the attributes map to populate; must not be null; may contain old values * @param removeStaleValues should we remove stale values from the mapping? 
+     * @return the attributes map provided as input, returned for programming convenience
      */
-    public static void calculateChromosomeCounts(VariantContext vc, Map<String, Object> attributes, boolean removeStaleValues) {
+    public static Map<String, Object> calculateChromosomeCounts(VariantContext vc, Map<String, Object> attributes, boolean removeStaleValues) {
         // if everyone is a no-call, remove the old attributes if requested
-        if ( vc.getChromosomeCount() == 0 && removeStaleValues ) {
+        if ( vc.getCalledChrCount() == 0 && removeStaleValues ) {
             if ( attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY) )
                 attributes.remove(VCFConstants.ALLELE_COUNT_KEY);
             if ( attributes.containsKey(VCFConstants.ALLELE_FREQUENCY_KEY) )
                 attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY);
             if ( attributes.containsKey(VCFConstants.ALLELE_NUMBER_KEY) )
                 attributes.remove(VCFConstants.ALLELE_NUMBER_KEY);
-            return;
+            return attributes;
         }

         if ( vc.hasGenotypes() ) {
-            attributes.put(VCFConstants.ALLELE_NUMBER_KEY, vc.getChromosomeCount());
+            attributes.put(VCFConstants.ALLELE_NUMBER_KEY, vc.getCalledChrCount());

             // if there are alternate alleles, record the relevant tags
             if ( vc.getAlternateAlleles().size() > 0 ) {
                 ArrayList<String> alleleFreqs = new ArrayList<String>();
                 ArrayList<Integer> alleleCounts = new ArrayList<Integer>();
-                double totalChromosomes = (double)vc.getChromosomeCount();
+                double totalChromosomes = (double)vc.getCalledChrCount();
                 for ( Allele allele : vc.getAlternateAlleles() ) {
-                    int altChromosomes = vc.getChromosomeCount(allele);
+                    int altChromosomes = vc.getCalledChrCount(allele);
                     alleleCounts.add(altChromosomes);
+                    // todo -- this is a performance problem
                     String freq = String.format(makePrecisionFormatStringFromDenominatorValue(totalChromosomes), ((double)altChromosomes / totalChromosomes));
                     alleleFreqs.add(freq);
                 }
@@ -148,6 +99,54 @@ public static void calculateChromosomeCounts(VariantContext vc, Map attributes, boolean removeStaleValues) {
             }
         }
+
+        return attributes;
+    }
+
+    /**
+     * As above, but updates the attributes of the provided VariantContextBuilder,
+     * based on the VariantContext produced by builder.make()
+     *
+     * @param builder the VariantContextBuilder whose attributes we are updating
+     * @param removeStaleValues should we remove stale values from the mapping?
+     */
+    public static void calculateChromosomeCounts(VariantContextBuilder builder, boolean removeStaleValues) {
+        final VariantContext vc = builder.make();
+
+        // if everyone is a no-call, remove the old attributes if requested
+        if ( vc.getCalledChrCount() == 0 && removeStaleValues ) {
+            builder.rmAttribute(VCFConstants.ALLELE_COUNT_KEY);
+            builder.rmAttribute(VCFConstants.ALLELE_FREQUENCY_KEY);
+            builder.rmAttribute(VCFConstants.ALLELE_NUMBER_KEY);
+            return;
+        }
+
+        if ( vc.hasGenotypes() ) {
+            builder.attribute(VCFConstants.ALLELE_NUMBER_KEY, vc.getCalledChrCount());
+
+            // if there are alternate alleles, record the relevant tags
+            if ( vc.getAlternateAlleles().size() > 0 ) {
+                ArrayList<String> alleleFreqs = new ArrayList<String>();
+                ArrayList<Integer> alleleCounts = new ArrayList<Integer>();
+                double totalChromosomes = (double)vc.getCalledChrCount();
+                for ( Allele allele : vc.getAlternateAlleles() ) {
+                    int altChromosomes = vc.getCalledChrCount(allele);
+                    alleleCounts.add(altChromosomes);
+                    String freq = String.format(makePrecisionFormatStringFromDenominatorValue(totalChromosomes), ((double)altChromosomes / totalChromosomes));
+                    alleleFreqs.add(freq);
+                }
+
+                builder.attribute(VCFConstants.ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts);
+                builder.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs);
+            }
+            else {
+                builder.attribute(VCFConstants.ALLELE_COUNT_KEY, 0);
+                builder.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, 0.0);
+            }
+        }
+    }

 private static String makePrecisionFormatStringFromDenominatorValue(double maxValue) {
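/*
 * Editor's aside, a worked example of the tags computed above (the genotypes are
 * hypothetical): for three diploid samples called 0/1, 1/1 and 0/0, the called
 * chromosome count is AN = 6, the single ALT allele is seen AC = 1 + 2 + 0 = 3
 * times, and its frequency is AF = 3/6 = 0.500 (formatted with
 * makePrecisionFormatStringFromDenominatorValue).  A ./. no-call sample would
 * contribute nothing to AN, AC or AF.
 */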
@@ -165,7 +164,81 @@ public static Genotype removePLs(Genotype g) {
         Map<String, Object> attrs = new HashMap<String, Object>(g.getAttributes());
         attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY);
         attrs.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY);
-        return new Genotype(g.getSampleName(), g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attrs, g.isPhased());
+        return new Genotype(g.getSampleName(), g.getAlleles(), g.getLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attrs, g.isPhased());
+    }
+
+    public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC, boolean refBaseShouldBeAppliedToEndOfAlleles) {
+        // see if we need to pad a common reference base onto all alleles
+        boolean padVC;
+
+        // We need to pad a VC with a common base if the length of the reference allele is less than the length of the VariantContext.
+        // This happens because the position of e.g. an indel is always one before the actual event (as per VCF convention).
+        long locLength = (inputVC.getEnd() - inputVC.getStart()) + 1;
+        if (inputVC.hasSymbolicAlleles())
+            padVC = true;
+        else if (inputVC.getReference().length() == locLength)
+            padVC = false;
+        else if (inputVC.getReference().length() == locLength - 1)
+            padVC = true;
+        else
+            throw new IllegalArgumentException("Badly formed variant context at location " + String.valueOf(inputVC.getStart()) +
+                    " in contig " + inputVC.getChr() + ". Reference length must be at most one base shorter than location size");
+
+        // nothing to do if we don't need to pad bases
+        if (padVC) {
+            if ( !inputVC.hasReferenceBaseForIndel() )
+                throw new ReviewedStingException("Badly formed variant context at location " + inputVC.getChr() + ":" + inputVC.getStart() + "; no padded reference base is available.");
+
+            Byte refByte = inputVC.getReferenceBaseForIndel();
+
+            List<Allele> alleles = new ArrayList<Allele>();
+
+            for (Allele a : inputVC.getAlleles()) {
+                // get the bases for the current allele and create a new one with the padding base added
+                if (a.isSymbolic()) {
+                    alleles.add(a);
+                } else {
+                    String newBases;
+                    if ( refBaseShouldBeAppliedToEndOfAlleles )
+                        newBases = a.getBaseString() + new String(new byte[]{refByte});
+                    else
+                        newBases = new String(new byte[]{refByte}) + a.getBaseString();
+                    alleles.add(Allele.create(newBases, a.isReference()));
+                }
+            }
+
+            // now we can recreate new genotypes with the padded alleles
+            GenotypesContext genotypes = GenotypesContext.create(inputVC.getNSamples());
+            for (final Genotype g : inputVC.getGenotypes()) {
+                List<Allele> inAlleles = g.getAlleles();
+                List<Allele> newGenotypeAlleles = new ArrayList<Allele>(g.getAlleles().size());
+                for (Allele a : inAlleles) {
+                    if (a.isCalled()) {
+                        if (a.isSymbolic()) {
+                            newGenotypeAlleles.add(a);
+                        } else {
+                            String newBases;
+                            if ( refBaseShouldBeAppliedToEndOfAlleles )
+                                newBases = a.getBaseString() + new String(new byte[]{refByte});
+                            else
+                                newBases = new String(new byte[]{refByte}) + a.getBaseString();
+                            newGenotypeAlleles.add(Allele.create(newBases, a.isReference()));
+                        }
+                    }
+                    else {
+                        // add the no-call allele unchanged
+                        newGenotypeAlleles.add(Allele.NO_CALL);
+                    }
+                }
+                genotypes.add(new Genotype(g.getSampleName(), newGenotypeAlleles, g.getLog10PError(),
+                        g.getFilters(), g.getAttributes(), g.isPhased()));
+            }
+
+            return new VariantContextBuilder(inputVC).alleles(alleles).genotypes(genotypes).make();
+        }
+        else
+            return inputVC;
+    }
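/*
 * Editor's aside on the padding convention assumed above, with a hypothetical
 * deletion: a VariantContext at 20:100-102 storing its alleles internally as
 * ref="TC", alt="" has a reference allele one base shorter than the location
 * (2 vs. 3), so padVC is true.  With referenceBaseForIndel = 'A' and
 * refBaseShouldBeAppliedToEndOfAlleles = false the base is prepended, giving
 * the VCF-style alleles ref="ATC", alt="A".
 */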
 /**
@@ -296,7 +369,7 @@ public static Map match(VariantContext vc, Genotype g,
     }

     public static double computeHardyWeinbergPvalue(VariantContext vc) {
-        if ( vc.getChromosomeCount() == 0 )
+        if ( vc.getCalledChrCount() == 0 )
             return 0.0;
         return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount());
     }
@@ -309,7 +382,7 @@ public static double computeHardyWeinbergPvalue(VariantContext vc) {
     @Requires("vc != null")
     @Ensures("result != null")
     public static VariantContext sitesOnlyVariantContext(VariantContext vc) {
-        return VariantContext.modifyGenotypes(vc, null);
+        return new VariantContextBuilder(vc).noGenotypes().make();
     }

 /**
@@ -326,39 +399,42 @@ public static Collection sitesOnlyVariantContexts(Collection vcs) {
+    private static Map<String, Object> subsetAttributes(final CommonInfo igc, final Collection<String> keysToPreserve) {
+        Map<String, Object> attributes = new HashMap<String, Object>(keysToPreserve.size());
+        for ( final String key : keysToPreserve ) {
+            if ( igc.hasAttribute(key) )
+                attributes.put(key, igc.getAttribute(key));
+        }
+        return attributes;
+    }

-    public static VariantContext pruneVariantContext(final VariantContext vc, final Collection<String> keysToPreserve ) {
-        final MutableVariantContext mvc = new MutableVariantContext(vc);
+    /**
+     * @deprecated use the VariantContextBuilder version instead
+     * @param vc the VariantContext to prune
+     * @param keysToPreserve the attribute keys to keep
+     * @return the pruned VariantContext
+     */
+    @Deprecated
+    public static VariantContext pruneVariantContext(final VariantContext vc, Collection<String> keysToPreserve ) {
+        return pruneVariantContext(new VariantContextBuilder(vc), keysToPreserve).make();
+    }

-        if ( keysToPreserve == null || keysToPreserve.size() == 0 )
-            mvc.clearAttributes();
-        else {
-            final Map d = mvc.getAttributes();
-            mvc.clearAttributes();
-            for ( String key : keysToPreserve )
-                if ( d.containsKey(key) )
-                    mvc.putAttribute(key, d.get(key));
-        }
+    public static VariantContextBuilder pruneVariantContext(final VariantContextBuilder builder, Collection<String> keysToPreserve ) {
+        final VariantContext vc = builder.make();
+        if ( keysToPreserve == null ) keysToPreserve = Collections.emptyList();

-        // this must be done as the ID is stored in the attributes field
-        if ( vc.hasID() ) mvc.setID(vc.getID());
-
-        Collection gs = mvc.getGenotypes().values();
-        mvc.clearGenotypes();
-        for ( Genotype g : gs ) {
-            MutableGenotype mg = new MutableGenotype(g);
-            mg.clearAttributes();
-            if ( keysToPreserve != null )
-                for ( String key : keysToPreserve )
-                    if ( g.hasAttribute(key) )
-                        mg.putAttribute(key, g.getAttribute(key));
-            mvc.addGenotype(mg);
+        // VC info
+        final Map<String, Object> attributes = subsetAttributes(vc.commonInfo, keysToPreserve);
+
+        // Genotypes
+        final GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples());
+        for ( final Genotype g : vc.getGenotypes() ) {
+            Map<String, Object> genotypeAttributes = subsetAttributes(g.commonInfo, keysToPreserve);
+            genotypes.add(new Genotype(g.getSampleName(), g.getAlleles(), g.getLog10PError(), g.getFilters(),
+                    genotypeAttributes, g.isPhased()));
         }

-        return mvc;
+        return builder.genotypes(genotypes).attributes(attributes);
     }

     public enum GenotypeMergeType {
@@ -391,75 +467,6 @@ public enum FilteredRecordMergeType {
         KEEP_IF_ALL_UNFILTERED
     }

-    /**
-     * Performs a master merge on the VCs. Here there is a master input [contains all of the information] and many
-     * VCs containing partial, extra genotype information which should be added to the master. For example,
-     * we scatter out the phasing algorithm over some samples in the master, producing a minimal VCF with phasing
-     * information per genotype. The master merge will add the PQ information from each genotype record, where
-     * appropriate, to the master VC. 
- * - * @param unsortedVCs collection of VCs - * @param masterName name of master VC - * @return master-merged VC - */ - public static VariantContext masterMerge(Collection unsortedVCs, String masterName) { - VariantContext master = findMaster(unsortedVCs, masterName); - Map genotypes = master.getGenotypes(); - for (Genotype g : genotypes.values()) { - genotypes.put(g.getSampleName(), new MutableGenotype(g)); - } - - Map masterAttributes = new HashMap(master.getAttributes()); - - for (VariantContext vc : unsortedVCs) { - if (!vc.getSource().equals(masterName)) { - for (Genotype g : vc.getGenotypes().values()) { - MutableGenotype masterG = (MutableGenotype) genotypes.get(g.getSampleName()); - for (Map.Entry attr : g.getAttributes().entrySet()) { - if (!masterG.hasAttribute(attr.getKey())) { - //System.out.printf("Adding GT attribute %s to masterG %s, new %s%n", attr, masterG, g); - masterG.putAttribute(attr.getKey(), attr.getValue()); - } - } - - if (masterG.isPhased() != g.isPhased()) { - if (masterG.sameGenotype(g)) { - // System.out.printf("Updating phasing %s to masterG %s, new %s%n", g.isPhased(), masterG, g); - masterG.setAlleles(g.getAlleles()); - masterG.setPhase(g.isPhased()); - } - //else System.out.println("WARNING: Not updating phase, since genotypes differ between master file and auxiliary info file!"); - } - -// if ( MathUtils.compareDoubles(masterG.getNegLog10PError(), g.getNegLog10PError()) != 0 ) { -// System.out.printf("Updating GQ %s to masterG %s, new %s%n", g.getNegLog10PError(), masterG, g); -// masterG.setNegLog10PError(g.getNegLog10PError()); -// } - - } - - for (Map.Entry attr : vc.getAttributes().entrySet()) { - if (!masterAttributes.containsKey(attr.getKey())) { - //System.out.printf("Adding VC attribute %s to master %s, new %s%n", attr, master, vc); - masterAttributes.put(attr.getKey(), attr.getValue()); - } - } - } - } - - return new VariantContext(master.getSource(), master.getChr(), master.getStart(), master.getEnd(), master.getAlleles(), genotypes, master.getNegLog10PError(), master.getFilters(), masterAttributes); - } - - private static VariantContext findMaster(Collection unsortedVCs, String masterName) { - for (VariantContext vc : unsortedVCs) { - if (vc.getSource().equals(masterName)) { - return vc; - } - } - - throw new ReviewedStingException(String.format("Couldn't find master VCF %s at %s", masterName, unsortedVCs.iterator().next())); - } - /** * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. * If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with @@ -503,7 +510,7 @@ public static VariantContext simpleMerge(final GenomeLocParser genomeLocParser, for (VariantContext vc : prepaddedVCs) { // also a reasonable place to remove filtered calls, if needed if ( ! 
filteredAreUncalled || vc.isNotFiltered() ) - VCs.add(VariantContext.createVariantContextWithPaddedAlleles(vc, false)); + VCs.add(createVariantContextWithPaddedAlleles(vc, false)); } if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled return null; @@ -524,9 +531,9 @@ public static VariantContext simpleMerge(final GenomeLocParser genomeLocParser, int depth = 0; int maxAC = -1; final Map attributesWithMaxAC = new TreeMap(); - double negLog10PError = -1; + double log10PError = 1; VariantContext vcWithMaxAC = null; - Map genotypes = new TreeMap(); + GenotypesContext genotypes = GenotypesContext.create(); // counting the number of filtered and variant VCs int nFiltered = 0; @@ -552,7 +559,7 @@ public static VariantContext simpleMerge(final GenomeLocParser genomeLocParser, mergeGenotypes(genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY); - negLog10PError = Math.max(negLog10PError, vc.isVariant() ? vc.getNegLog10PError() : -1); + log10PError = Math.min(log10PError, vc.isVariant() ? vc.getLog10PError() : 1); filters.addAll(vc.getFilters()); @@ -563,7 +570,7 @@ public static VariantContext simpleMerge(final GenomeLocParser genomeLocParser, // if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); - if ( vc.hasID() && ! vc.getID().equals(VCFConstants.EMPTY_ID_FIELD) ) rsIDs.add(vc.getID()); + if ( vc.hasID() ) rsIDs.add(vc.getID()); if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); // lets see if the string contains a , separator @@ -612,8 +619,9 @@ public static VariantContext simpleMerge(final GenomeLocParser genomeLocParser, if (vc.alleles.size() == 1) continue; if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) { - logger.warn(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s", - genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles)); + if ( ! genotypes.isEmpty() ) + logger.warn(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s", + genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles)); genotypes = stripPLs(genotypes); // this will remove stale AC,AF attributed from vc calculateChromosomeCounts(vc, attributes, true); @@ -656,14 +664,17 @@ else if ( variantSources.isEmpty() ) // everyone was reference if ( depth > 0 ) attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); - if ( ! rsIDs.isEmpty() ) { - attributes.put(VariantContext.ID_KEY, Utils.join(",", rsIDs)); - } + final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); - VariantContext merged = new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, negLog10PError, filters, (mergeInfoWithMaxAC ? attributesWithMaxAC : attributes) ); - // Trim the padded bases of all alleles if necessary - merged = createVariantContextWithTrimmedAlleles(merged); + final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID); + builder.loc(loc.getContig(), loc.getStart(), loc.getStop()); + builder.alleles(alleles); + builder.genotypes(genotypes); + builder.log10PError(log10PError); + builder.filters(filters).attributes(mergeInfoWithMaxAC ? 
attributesWithMaxAC : attributes); + // Trim the padded bases of all alleles if necessary + VariantContext merged = createVariantContextWithTrimmedAlleles(builder.make()); if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged); return merged; } @@ -698,6 +709,7 @@ public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) { return true; } + public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) { // see if we need to trim common reference base from all alleles boolean trimVC; @@ -716,7 +728,7 @@ else if (refAllele.isNull()) // nothing to do if we don't need to trim bases if (trimVC) { List alleles = new ArrayList(); - Map genotypes = new TreeMap(); + GenotypesContext genotypes = GenotypesContext.create(); // set the reference base for indels in the attributes Map attributes = new TreeMap(inputVC.getAttributes()); @@ -750,9 +762,9 @@ else if (refAllele.isNull()) if (!hasNullAlleles) return inputVC; // now we can recreate new genotypes with trimmed alleles - for ( Map.Entry sample : inputVC.getGenotypes().entrySet() ) { + for ( final Genotype genotype : inputVC.getGenotypes() ) { - List originalAlleles = sample.getValue().getAlleles(); + List originalAlleles = genotype.getAlleles(); List trimmedAlleles = new ArrayList(); for ( Allele a : originalAlleles ) { if ( a.isCalled() ) @@ -760,21 +772,22 @@ else if (refAllele.isNull()) else trimmedAlleles.add(Allele.NO_CALL); } - genotypes.put(sample.getKey(), Genotype.modifyAlleles(sample.getValue(), trimmedAlleles)); + genotypes.add(Genotype.modifyAlleles(genotype, trimmedAlleles)); } - return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVC.filtersWereApplied() ? inputVC.getFilters() : null, attributes, new Byte(inputVC.getReference().getBases()[0])); + final VariantContextBuilder builder = new VariantContextBuilder(inputVC); + return builder.alleles(alleles).genotypes(genotypes).attributes(attributes).referenceBaseForIndel(new Byte(inputVC.getReference().getBases()[0])).make(); } return inputVC; } - public static Map stripPLs(Map genotypes) { - Map newGs = new HashMap(genotypes.size()); + public static GenotypesContext stripPLs(GenotypesContext genotypes) { + GenotypesContext newGs = GenotypesContext.create(genotypes.size()); - for ( Map.Entry g : genotypes.entrySet() ) { - newGs.put(g.getKey(), g.getValue().hasLikelihoods() ? removePLs(g.getValue()) : g.getValue()); + for ( final Genotype g : genotypes ) { + newGs.add(g.hasLikelihoods() ? removePLs(g) : g); } return newGs; @@ -951,20 +964,19 @@ public static List sortVariantContextsByPriority(Collection mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniqifySamples) { - for ( Genotype g : oneVC.getGenotypes().values() ) { + private static void mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniqifySamples) { + for ( Genotype g : oneVC.getGenotypes() ) { String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniqifySamples); - if ( ! mergedGenotypes.containsKey(name) ) { + if ( ! 
mergedGenotypes.containsSample(name) ) { // only add if the name is new Genotype newG = g; if ( uniqifySamples || alleleMapping.needsRemapping() ) { - MutableGenotype mutG = new MutableGenotype(name, g); - if ( alleleMapping.needsRemapping() ) mutG.setAlleles(alleleMapping.remap(g.getAlleles())); - newG = mutG; + final List alleles = alleleMapping.needsRemapping() ? alleleMapping.remap(g.getAlleles()) : g.getAlleles(); + newG = new Genotype(name, alleles, g.getLog10PError(), g.getFilters(), g.getAttributes(), g.isPhased()); } - mergedGenotypes.put(name, newG); + mergedGenotypes.add(newG); } } } @@ -992,37 +1004,36 @@ public static VariantContext reverseComplement(VariantContext vc) { } // create new Genotype objects - Map newGenotypes = new HashMap(vc.getNSamples()); - for ( Map.Entry genotype : vc.getGenotypes().entrySet() ) { + GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype genotype : vc.getGenotypes() ) { List newAlleles = new ArrayList(); - for ( Allele allele : genotype.getValue().getAlleles() ) { + for ( Allele allele : genotype.getAlleles() ) { Allele newAllele = alleleMap.get(allele); if ( newAllele == null ) newAllele = Allele.NO_CALL; newAlleles.add(newAllele); } - newGenotypes.put(genotype.getKey(), Genotype.modifyAlleles(genotype.getValue(), newAlleles)); + newGenotypes.add(Genotype.modifyAlleles(genotype, newAlleles)); } - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), alleleMap.values(), newGenotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, vc.getAttributes()); - + return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).make(); } public static VariantContext purgeUnallowedGenotypeAttributes(VariantContext vc, Set allowedAttributes) { if ( allowedAttributes == null ) return vc; - Map newGenotypes = new HashMap(vc.getNSamples()); - for ( Map.Entry genotype : vc.getGenotypes().entrySet() ) { + GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype genotype : vc.getGenotypes() ) { Map attrs = new HashMap(); - for ( Map.Entry attr : genotype.getValue().getAttributes().entrySet() ) { + for ( Map.Entry attr : genotype.getAttributes().entrySet() ) { if ( allowedAttributes.contains(attr.getKey()) ) attrs.put(attr.getKey(), attr.getValue()); } - newGenotypes.put(genotype.getKey(), Genotype.modifyAttributes(genotype.getValue(), attrs)); + newGenotypes.add(Genotype.modifyAttributes(genotype, attrs)); } - return VariantContext.modifyGenotypes(vc, newGenotypes); + return new VariantContextBuilder(vc).genotypes(newGenotypes).make(); } public static BaseUtils.BaseSubstitutionType getSNPSubstitutionType(VariantContext context) { @@ -1055,355 +1066,10 @@ public static final GenomeLoc getLocation(GenomeLocParser genomeLocParser,Varian return genomeLocParser.createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd(), true); } - public abstract static class AlleleMergeRule { - // vc1, vc2 are ONLY passed to allelesShouldBeMerged() if mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2) AND allSamplesAreMergeable(vc1, vc2): - abstract public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2); - - public String toString() { - return "all samples are mergeable"; - } - } - - // NOTE: returns null if vc1 and vc2 are not merged into a single MNP record - - public static VariantContext mergeIntoMNP(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2, ReferenceSequenceFile 
referenceFile, AlleleMergeRule alleleMergeRule) { - if (!mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2)) - return null; - - // Check that it's logically possible to merge the VCs: - if (!allSamplesAreMergeable(vc1, vc2)) - return null; - - // Check if there's a "point" in merging the VCs (e.g., annotations could be changed) - if (!alleleMergeRule.allelesShouldBeMerged(vc1, vc2)) - return null; - - return reallyMergeIntoMNP(vc1, vc2, referenceFile); - } - - private static VariantContext reallyMergeIntoMNP(VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) { - int startInter = vc1.getEnd() + 1; - int endInter = vc2.getStart() - 1; - byte[] intermediateBases = null; - if (startInter <= endInter) { - intermediateBases = referenceFile.getSubsequenceAt(vc1.getChr(), startInter, endInter).getBases(); - StringUtil.toUpperCase(intermediateBases); - } - MergedAllelesData mergeData = new MergedAllelesData(intermediateBases, vc1, vc2); // ensures that the reference allele is added - - Map mergedGenotypes = new HashMap(); - for (Map.Entry gt1Entry : vc1.getGenotypes().entrySet()) { - String sample = gt1Entry.getKey(); - Genotype gt1 = gt1Entry.getValue(); - Genotype gt2 = vc2.getGenotype(sample); - - List site1Alleles = gt1.getAlleles(); - List site2Alleles = gt2.getAlleles(); - - List mergedAllelesForSample = new LinkedList(); - - /* NOTE: Since merged alleles are added to mergedAllelesForSample in the SAME order as in the input VC records, - we preserve phase information (if any) relative to whatever precedes vc1: - */ - Iterator all2It = site2Alleles.iterator(); - for (Allele all1 : site1Alleles) { - Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable() - - Allele mergedAllele = mergeData.ensureMergedAllele(all1, all2); - mergedAllelesForSample.add(mergedAllele); - } - - double mergedGQ = Math.max(gt1.getNegLog10PError(), gt2.getNegLog10PError()); - Set mergedGtFilters = new HashSet(); // Since gt1 and gt2 were unfiltered, the Genotype remains unfiltered - - Map mergedGtAttribs = new HashMap(); - PhaseAndQuality phaseQual = calcPhaseForMergedGenotypes(gt1, gt2); - if (phaseQual.PQ != null) - mergedGtAttribs.put(ReadBackedPhasingWalker.PQ_KEY, phaseQual.PQ); - - Genotype mergedGt = new Genotype(sample, mergedAllelesForSample, mergedGQ, mergedGtFilters, mergedGtAttribs, phaseQual.isPhased); - mergedGenotypes.put(sample, mergedGt); - } - - String mergedName = VariantContextUtils.mergeVariantContextNames(vc1.getSource(), vc2.getSource()); - double mergedNegLog10PError = Math.max(vc1.getNegLog10PError(), vc2.getNegLog10PError()); - Set mergedFilters = new HashSet(); // Since vc1 and vc2 were unfiltered, the merged record remains unfiltered - Map mergedAttribs = VariantContextUtils.mergeVariantContextAttributes(vc1, vc2); - - VariantContext mergedVc = new VariantContext(mergedName, vc1.getChr(), vc1.getStart(), vc2.getEnd(), mergeData.getAllMergedAlleles(), mergedGenotypes, mergedNegLog10PError, mergedFilters, mergedAttribs); - - mergedAttribs = new HashMap(mergedVc.getAttributes()); - VariantContextUtils.calculateChromosomeCounts(mergedVc, mergedAttribs, true); - mergedVc = VariantContext.modifyAttributes(mergedVc, mergedAttribs); - - return mergedVc; - } - - private static class AlleleOneAndTwo { - private Allele all1; - private Allele all2; - - public AlleleOneAndTwo(Allele all1, Allele all2) { - this.all1 = all1; - this.all2 = all2; - } - - public int hashCode() { - return all1.hashCode() + all2.hashCode(); - } - - public boolean equals(Object 
other) { - if (!(other instanceof AlleleOneAndTwo)) - return false; - - AlleleOneAndTwo otherAot = (AlleleOneAndTwo) other; - return (this.all1.equals(otherAot.all1) && this.all2.equals(otherAot.all2)); - } - } - - private static class MergedAllelesData { - private Map mergedAlleles; - private byte[] intermediateBases; - private int intermediateLength; - - public MergedAllelesData(byte[] intermediateBases, VariantContext vc1, VariantContext vc2) { - this.mergedAlleles = new HashMap(); // implemented equals() and hashCode() for AlleleOneAndTwo - this.intermediateBases = intermediateBases; - this.intermediateLength = this.intermediateBases != null ? this.intermediateBases.length : 0; - - this.ensureMergedAllele(vc1.getReference(), vc2.getReference(), true); - } - - public Allele ensureMergedAllele(Allele all1, Allele all2) { - return ensureMergedAllele(all1, all2, false); // false <-> since even if all1+all2 = reference, it was already created in the constructor - } - - private Allele ensureMergedAllele(Allele all1, Allele all2, boolean creatingReferenceForFirstTime) { - AlleleOneAndTwo all12 = new AlleleOneAndTwo(all1, all2); - Allele mergedAllele = mergedAlleles.get(all12); - - if (mergedAllele == null) { - byte[] bases1 = all1.getBases(); - byte[] bases2 = all2.getBases(); - - byte[] mergedBases = new byte[bases1.length + intermediateLength + bases2.length]; - System.arraycopy(bases1, 0, mergedBases, 0, bases1.length); - if (intermediateBases != null) - System.arraycopy(intermediateBases, 0, mergedBases, bases1.length, intermediateLength); - System.arraycopy(bases2, 0, mergedBases, bases1.length + intermediateLength, bases2.length); - - mergedAllele = Allele.create(mergedBases, creatingReferenceForFirstTime); - mergedAlleles.put(all12, mergedAllele); - } - - return mergedAllele; - } - - public Set getAllMergedAlleles() { - return new HashSet(mergedAlleles.values()); - } - } - - private static String mergeVariantContextNames(String name1, String name2) { - return name1 + "_" + name2; - } - - private static Map mergeVariantContextAttributes(VariantContext vc1, VariantContext vc2) { - Map mergedAttribs = new HashMap(); - - List vcList = new LinkedList(); - vcList.add(vc1); - vcList.add(vc2); - - String[] MERGE_OR_ATTRIBS = {VCFConstants.DBSNP_KEY}; - for (String orAttrib : MERGE_OR_ATTRIBS) { - boolean attribVal = false; - for (VariantContext vc : vcList) { - attribVal = vc.getAttributeAsBoolean(orAttrib, false); - if (attribVal) // already true, so no reason to continue: - break; - } - mergedAttribs.put(orAttrib, attribVal); - } - - // Merge ID fields: - String iDVal = null; - for (VariantContext vc : vcList) { - String val = vc.getAttributeAsString(VariantContext.ID_KEY, null); - if (val != null && !val.equals(VCFConstants.EMPTY_ID_FIELD)) { - if (iDVal == null) - iDVal = val; - else - iDVal += VCFConstants.ID_FIELD_SEPARATOR + val; - } - } - if (iDVal != null) - mergedAttribs.put(VariantContext.ID_KEY, iDVal); - - return mergedAttribs; - } - - private static boolean mergeIntoMNPvalidationCheck(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2) { - GenomeLoc loc1 = VariantContextUtils.getLocation(genomeLocParser, vc1); - GenomeLoc loc2 = VariantContextUtils.getLocation(genomeLocParser, vc2); - - if (!loc1.onSameContig(loc2)) - throw new ReviewedStingException("Can only merge vc1, vc2 if on the same chromosome"); - - if (!loc1.isBefore(loc2)) - throw new ReviewedStingException("Can only merge if vc1 is BEFORE vc2"); - - if (vc1.isFiltered() || vc2.isFiltered()) - 
return false; - - if (!vc1.getSampleNames().equals(vc2.getSampleNames())) // vc1, vc2 refer to different sample sets - return false; - - if (!allGenotypesAreUnfilteredAndCalled(vc1) || !allGenotypesAreUnfilteredAndCalled(vc2)) - return false; - - return true; - } - - private static boolean allGenotypesAreUnfilteredAndCalled(VariantContext vc) { - for (Map.Entry gtEntry : vc.getGenotypes().entrySet()) { - Genotype gt = gtEntry.getValue(); - if (gt.isNoCall() || gt.isFiltered()) - return false; - } - - return true; - } - - // Assumes that vc1 and vc2 were already checked to have the same sample names: - - private static boolean allSamplesAreMergeable(VariantContext vc1, VariantContext vc2) { - // Check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1: - for (Map.Entry gt1Entry : vc1.getGenotypes().entrySet()) { - String sample = gt1Entry.getKey(); - Genotype gt1 = gt1Entry.getValue(); - Genotype gt2 = vc2.getGenotype(sample); - - if (!alleleSegregationIsKnown(gt1, gt2)) // can merge if: phased, or if either is a hom - return false; - } - - return true; - } - - public static boolean alleleSegregationIsKnown(Genotype gt1, Genotype gt2) { - if (gt1.getPloidy() != gt2.getPloidy()) - return false; - - /* If gt2 is phased or hom, then could even be MERGED with gt1 [This is standard]. - - HOWEVER, EVEN if this is not the case, but gt1.isHom(), - it is trivially known that each of gt2's alleles segregate with the single allele type present in gt1. - */ - return (gt2.isPhased() || gt2.isHom() || gt1.isHom()); - } - - private static class PhaseAndQuality { - public boolean isPhased; - public Double PQ = null; - - public PhaseAndQuality(Genotype gt) { - this.isPhased = gt.isPhased(); - if (this.isPhased) { - this.PQ = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1); - if ( this.PQ == -1 ) this.PQ = null; - } - } - } - - // Assumes that alleleSegregationIsKnown(gt1, gt2): - - private static PhaseAndQuality calcPhaseForMergedGenotypes(Genotype gt1, Genotype gt2) { - if (gt2.isPhased() || gt2.isHom()) - return new PhaseAndQuality(gt1); // maintain the phase of gt1 - - if (!gt1.isHom()) - throw new ReviewedStingException("alleleSegregationIsKnown(gt1, gt2) implies: gt2.genotypesArePhased() || gt2.isHom() || gt1.isHom()"); - - /* We're dealing with: gt1.isHom(), gt2.isHet(), !gt2.genotypesArePhased(); so, the merged (het) Genotype is not phased relative to the previous Genotype - - For example, if we're merging the third Genotype with the second one: - 0/1 - 1|1 - 0/1 - - Then, we want to output: - 0/1 - 1/2 - */ - return new PhaseAndQuality(gt2); // maintain the phase of gt2 [since !gt2.genotypesArePhased()] - } - - /* Checks if any sample has a MNP of ALT alleles (segregating together): - [Assumes that vc1 and vc2 were already checked to have the same sample names && allSamplesAreMergeable(vc1, vc2)] - */ - - public static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) { - for (Map.Entry gt1Entry : vc1.getGenotypes().entrySet()) { - String sample = gt1Entry.getKey(); - Genotype gt1 = gt1Entry.getValue(); - Genotype gt2 = vc2.getGenotype(sample); - - List site1Alleles = gt1.getAlleles(); - List site2Alleles = gt2.getAlleles(); - - Iterator all2It = site2Alleles.iterator(); - for (Allele all1 : site1Alleles) { - Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable() - - if (all1.isNonReference() && all2.isNonReference()) // corresponding alleles are alternate - return true; - } - } - - return false; - } 
- - /* Checks if all samples are consistent in their haplotypes: - [Assumes that vc1 and vc2 were already checked to have the same sample names && allSamplesAreMergeable(vc1, vc2)] - */ - - public static boolean doubleAllelesSegregatePerfectlyAmongSamples(VariantContext vc1, VariantContext vc2) { - // Check that Alleles at vc1 and at vc2 always segregate together in all samples (including reference): - Map allele1ToAllele2 = new HashMap(); - Map allele2ToAllele1 = new HashMap(); - - // Note the segregation of the alleles for the reference genome: - allele1ToAllele2.put(vc1.getReference(), vc2.getReference()); - allele2ToAllele1.put(vc2.getReference(), vc1.getReference()); - - // Note the segregation of the alleles for each sample (and check that it is consistent with the reference and all previous samples). - for (Map.Entry gt1Entry : vc1.getGenotypes().entrySet()) { - String sample = gt1Entry.getKey(); - Genotype gt1 = gt1Entry.getValue(); - Genotype gt2 = vc2.getGenotype(sample); - - List site1Alleles = gt1.getAlleles(); - List site2Alleles = gt2.getAlleles(); - - Iterator all2It = site2Alleles.iterator(); - for (Allele all1 : site1Alleles) { - Allele all2 = all2It.next(); - - Allele all1To2 = allele1ToAllele2.get(all1); - if (all1To2 == null) - allele1ToAllele2.put(all1, all2); - else if (!all1To2.equals(all2)) // all1 segregates with two different alleles at site 2 - return false; - - Allele all2To1 = allele2ToAllele1.get(all2); - if (all2To1 == null) - allele2ToAllele1.put(all2, all1); - else if (!all2To1.equals(all1)) // all2 segregates with two different alleles at site 1 - return false; - } - } - - return true; + public static final Set genotypeNames(final Collection genotypes) { + final Set names = new HashSet(genotypes.size()); + for ( final Genotype g : genotypes ) + names.add(g.getSampleName()); + return names; } -} \ No newline at end of file +} diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContext.java index a59ed7abe3..ccce21f52d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContext.java @@ -64,7 +64,7 @@ private interface AttributeGetter { x.put("CHROM", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getChr(); }}); x.put("POS", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getStart(); }}); x.put("TYPE", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getType().toString(); }}); - x.put("QUAL", new AttributeGetter() { public Object get(VariantContext vc) { return 10 * vc.getNegLog10PError(); }}); + x.put("QUAL", new AttributeGetter() { public Object get(VariantContext vc) { return -10 * vc.getLog10PError(); }}); x.put("ALLELES", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getAlleles(); }}); x.put("N_ALLELES", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getNAlleles(); }}); x.put("FILTER", new AttributeGetter() { public Object get(VariantContext vc) { return vc.isFiltered() ? 
"1" : "0"; }}); diff --git a/public/java/src/org/broadinstitute/sting/utils/yaml/StingYamlRepresenter.java b/public/java/src/org/broadinstitute/sting/utils/yaml/StingYamlRepresenter.java deleted file mode 100644 index 157b1ce27e..0000000000 --- a/public/java/src/org/broadinstitute/sting/utils/yaml/StingYamlRepresenter.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.yaml; - -import org.yaml.snakeyaml.introspector.Property; -import org.yaml.snakeyaml.nodes.*; -import org.yaml.snakeyaml.representer.Represent; -import org.yaml.snakeyaml.representer.Representer; - -import java.beans.IntrospectionException; -import java.io.File; -import java.util.Set; -import java.util.TreeSet; - -/** - * A representer with Sting prefered settings. - * - Fields are ordered in the order of the class declaration, instead of alphabetically. - * - Empty maps and sequences are not output. - * - Files are converted to their absolute paths. 
- */ -public class StingYamlRepresenter extends Representer { - - public StingYamlRepresenter() { - super(); - this.representers.put(File.class, new RepresentFile()); - } - - @Override - protected Set getProperties(Class type) throws IntrospectionException { - TreeSet properties = new TreeSet(new FieldOrderComparator(type)); - properties.addAll(super.getProperties(type)); - return properties; - } - - @Override - protected NodeTuple representJavaBeanProperty(Object javaBean, Property property, - Object propertyValue, Tag customTag) { - NodeTuple tuple = super.representJavaBeanProperty(javaBean, property, propertyValue, customTag); - Node valueNode = tuple.getValueNode(); - if (Tag.NULL.equals(valueNode.getTag())) { - return null;// skip 'null' values - } - if (valueNode instanceof CollectionNode) { - if (Tag.SEQ.equals(valueNode.getTag())) { - SequenceNode seq = (SequenceNode) valueNode; - if (seq.getValue().isEmpty()) { - return null;// skip empty lists - } - } - if (Tag.MAP.equals(valueNode.getTag())) { - MappingNode seq = (MappingNode) valueNode; - if (seq.getValue().isEmpty()) { - return null;// skip empty maps - } - } - } - return tuple; - } - - private class RepresentFile implements Represent { - @Override - public Node representData(Object o) { - return StingYamlRepresenter.this.representScalar(Tag.STR, ((File)o).getPath()); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/yaml/YamlUtils.java b/public/java/src/org/broadinstitute/sting/utils/yaml/YamlUtils.java deleted file mode 100644 index 715c71efc2..0000000000 --- a/public/java/src/org/broadinstitute/sting/utils/yaml/YamlUtils.java +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.yaml; - -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.yaml.snakeyaml.DumperOptions; -import org.yaml.snakeyaml.Yaml; -import org.yaml.snakeyaml.constructor.Constructor; -import org.yaml.snakeyaml.nodes.Tag; -import org.yaml.snakeyaml.representer.Representer; - -import java.io.File; -import java.io.FileReader; -import java.io.FileWriter; -import java.io.IOException; - -/** - * A collection of utilities for operating on YAML. - * Uses the FLOW style of writing YAML, versus the BLOCK style. - * By default uses a representer that prunes empty lists and maps. 
- */ -public class YamlUtils { - private static Representer representer = new StingYamlRepresenter(); - private static DumperOptions options = new DumperOptions(); - - static { - options.setCanonical(false); - options.setExplicitRoot(Tag.MAP); - options.setDefaultFlowStyle(DumperOptions.FlowStyle.FLOW); - options.setPrettyFlow(true); - } - - /** - * Serialize an object to the file system. - * @param o Object to serialize. - * @param file Path to write the serialized YAML. - */ - public static void dump(Object o, File file) { - dump(o, file, representer); - } - - /** - * Serialize an object to the file system. - * @param o Object to serialize. - * @param file Path to write the serialized YAML. - * @param representer Custom representer with rules on how to serialize YAML. - */ - public static void dump(Object o, File file, Representer representer) { - Constructor constructor = new Constructor(o.getClass()); - Yaml yaml = new Yaml(constructor, representer, options); - try { - yaml.dump(o, new FileWriter(file)); - } catch (IOException ioe) { - throw new UserException.CouldNotCreateOutputFile(file, ioe); - } - } - - /** - * Deserialize an object from the file system. - * @param clazz Clazz to deserialize. - * @param file Path to read the deserialized YAML. - * @return Object deserialized from the file system. - */ - public static T load(Class clazz, File file) { - return load(clazz, file, representer); - } - - /** - * Deserialize an object from the file system. - * @param clazz Clazz to deserialize. - * @param file Path to read the deserialized YAML. - * @param representer Custom representer with rules on how to deserialize YAML. - * @return Object deserialized from the file system. - */ - @SuppressWarnings("unchecked") - public static T load(Class clazz, File file, Representer representer) { - Constructor constructor = new Constructor(clazz); - Yaml yaml = new Yaml(constructor, representer, options); - try { - return (T) yaml.load(new FileReader(file)); - } catch (IOException ioe) { - throw new UserException.CouldNotReadInputFile(file, ioe); - } - } -} diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index f99a105ae3..8e218f9504 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -1,18 +1,12 @@ package org.broadinstitute.sting; -import org.apache.commons.io.FileUtils; import org.apache.log4j.*; import org.apache.log4j.spi.LoggingEvent; import org.broadinstitute.sting.commandline.CommandLineUtils; -import org.broadinstitute.sting.gatk.walkers.diffengine.DiffEngine; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.testng.Assert; +import org.broadinstitute.sting.utils.io.IOUtils; -import javax.swing.*; import java.io.*; -import java.math.BigInteger; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; import java.util.*; /** @@ -78,8 +72,8 @@ public abstract class BaseTest { public static final String hg19Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list"; public static final String hg19Chr20Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list"; - public static final String networkTempDir = "/broad/shptmp/"; - public static final File networkTempDirFile = new File(networkTempDir); + public static final String 
networkTempDir; + public static final File networkTempDirFile; public static final File testDirFile = new File("public/testdata/"); public static final String testDir = testDirFile.getAbsolutePath() + "/"; @@ -99,6 +93,10 @@ public abstract class BaseTest { // Set the Root logger to only output warnings. logger.setLevel(Level.WARN); + networkTempDirFile = IOUtils.tempDir("temp.", ".dir", new File("/broad/shptmp/" + System.getProperty("user.name"))); + networkTempDirFile.deleteOnExit(); + networkTempDir = networkTempDirFile.getAbsolutePath() + "/"; + // find our file sources // if (!fileExist(hg18Reference) || !fileExist(hg19Reference) || !fileExist(b36KGReference)) { // logger.fatal("We can't locate the reference directories. Aborting!"); @@ -134,7 +132,7 @@ public abstract class BaseTest { */ public static class TestDataProvider { private static final Map> tests = new HashMap>(); - private final String name; + private String name; /** * Create a new TestDataProvider instance bound to the class variable C @@ -151,6 +149,10 @@ public TestDataProvider(Class c) { this(c, ""); } + public void setName(final String name) { + this.name = name; + } + /** * Return all of the data providers in the form expected by TestNG of type class C * @param c @@ -229,17 +231,12 @@ public static File createTempFile(String name, String extension) { /** * Creates a temp file that will be deleted on exit after tests are complete. - * @param name Prefix of the file. - * @param extension Extension to concat to the end of the file. - * @return A file in the network temporary directory starting with name, ending with extension, which will be deleted after the program exits. + * @param name Name of the file. + * @return A file in the network temporary directory with name, which will be deleted after the program exits. 
*/ - public static File createNetworkTempFile(String name, String extension) { - try { - File file = File.createTempFile(name, extension, networkTempDirFile); - file.deleteOnExit(); - return file; - } catch (IOException ex) { - throw new ReviewedStingException("Cannot create temp file: " + ex.getMessage(), ex); - } + public static File createNetworkTempFile(String name) { + File file = new File(networkTempDirFile, name); + file.deleteOnExit(); + return file; } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java index 8d7dd82ac3..17a7d1974d 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java @@ -11,6 +11,7 @@ import org.broadinstitute.sting.gatk.datasources.reads.LocusShard; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -49,7 +50,7 @@ public void emptyAlignmentContextTest() { SAMRecordIterator iterator = new SAMRecordIterator(); GenomeLoc shardBounds = genomeLocParser.createGenomeLoc("chr1", 1, 5); - Shard shard = new LocusShard(genomeLocParser, new SAMDataSource(Collections.emptyList(),genomeLocParser),Collections.singletonList(shardBounds),Collections.emptyMap()); + Shard shard = new LocusShard(genomeLocParser, new SAMDataSource(Collections.emptyList(),new ThreadAllocation(),null,genomeLocParser),Collections.singletonList(shardBounds),Collections.emptyMap()); WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, null, null); diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java index 5ee373e4ff..5da8cebf47 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java @@ -72,7 +72,6 @@ public void timeDownsampling(int reps) { reader.getFileHeader(), false, SAMFileReader.ValidationStringency.SILENT, - 0, downsampling.create(), new ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALL)), Collections.emptyList(), diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java index dc3a6cafe3..62c93bdddd 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/MockLocusShard.java @@ -26,6 +26,7 @@ import org.broadinstitute.sting.gatk.datasources.reads.LocusShard; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import 
org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -42,7 +43,7 @@ public class MockLocusShard extends LocusShard { public MockLocusShard(final GenomeLocParser genomeLocParser,final List intervals) { super( genomeLocParser, - new SAMDataSource(Collections.emptyList(),genomeLocParser), + new SAMDataSource(Collections.emptyList(),new ThreadAllocation(),null,genomeLocParser), intervals, null); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMBAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMBAMDataSourceUnitTest.java deleted file mode 100755 index e41a6b3b71..0000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMBAMDataSourceUnitTest.java +++ /dev/null @@ -1,223 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.reads; - -import static org.testng.Assert.fail; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.commandline.Tags; -import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; -import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; -import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategyFactory; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.testng.annotations.AfterMethod; -import org.testng.annotations.BeforeMethod; - -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.List; - -/** - * - * User: aaron - * Date: Apr 8, 2009 - * Time: 8:14:23 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - - -/** - * @author aaron - * @version 1.0 - * @date Apr 8, 2009 - *

- * Class SAMBAMDataSourceUnitTest - *

- * The test of the SAMBAM simple data source. - */ -public class SAMBAMDataSourceUnitTest extends BaseTest { - - private List readers; - private IndexedFastaSequenceFile seq; - private GenomeLocParser genomeLocParser; - - /** - * This function does the setup of our parser, before each method call. - *

- * Called before every test case method. - */ - @BeforeMethod - public void doForEachTest() throws FileNotFoundException { - readers = new ArrayList(); - - // sequence - seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); - genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary()); - } - - /** - * Tears down the test fixture after each call. - *

- * Called after every test case method. - */ - @AfterMethod - public void undoForEachTest() { - seq = null; - readers.clear(); - } - - - /** Test out that we can shard the file and iterate over every read */ - @Test - public void testLinearBreakIterateAll() { - logger.warn("Executing testLinearBreakIterateAll"); - - // setup the data - readers.add(new SAMReaderID(new File(validationDataLocation+"/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); - - // the sharding strat. - SAMDataSource data = new SAMDataSource(readers,genomeLocParser); - ShardStrategy strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000,genomeLocParser); - int count = 0; - - try { - for (Shard sh : strat) { - int readCount = 0; - count++; - - GenomeLoc firstLocus = sh.getGenomeLocs().get(0), lastLocus = sh.getGenomeLocs().get(sh.getGenomeLocs().size()-1); - logger.debug("Start : " + firstLocus.getStart() + " stop : " + lastLocus.getStop() + " contig " + firstLocus.getContig()); - logger.debug("count = " + count); - StingSAMIterator datum = data.seek(sh); - - // for the first couple of shards make sure we can see the reads - if (count < 5) { - for (SAMRecord r : datum) { - } - readCount++; - } - datum.close(); - - // if we're over 100 shards, break out - if (count > 100) { - break; - } - } - } - catch (UserException.CouldNotReadInputFile e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. - fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception"); - } - } - - - /** Test out that we can shard the file and iterate over every read */ - @Test - public void testMergingTwoBAMFiles() { - logger.warn("Executing testMergingTwoBAMFiles"); - - // setup the test files - readers.add(new SAMReaderID(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); - - // the sharding strat. - SAMDataSource data = new SAMDataSource(readers,genomeLocParser); - ShardStrategy strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000,genomeLocParser); - - ArrayList readcountPerShard = new ArrayList(); - ArrayList readcountPerShard2 = new ArrayList(); - - // count up the first hundred shards - int shardsToCount = 100; - int count = 0; - - try { - for (Shard sh : strat) { - int readCount = 0; - count++; - if (count > shardsToCount) { - break; - } - - StingSAMIterator datum = data.seek(sh); - - for (SAMRecord r : datum) { - readCount++; - - } - readcountPerShard.add(readCount); - logger.debug("read count = " + readCount); - datum.close(); - } - } - catch (UserException.CouldNotReadInputFile e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. - fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception"); - } - - - // setup the data and the counter before our second run - readers.clear(); - readers.add(new SAMReaderID(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); - readers.add(new SAMReaderID(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); - - count = 0; - // the sharding strat. 
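The deleted test below, and its replacement later in this patch (SAMDataSourceUnitTest.java), track two API migrations that recur throughout these test changes: SAMDataSource now takes a ThreadAllocation plus one further argument (passed as null in every test here), and the ShardStrategyFactory.shatter(...) iteration gives way to shard-balancer iterators. A compact before/after sketch assembled only from calls that appear in this diff; variable names and the 100000 target size are illustrative:

// Before: explicit shatter strategy with a target shard size.
SAMDataSource data = new SAMDataSource(readers, genomeLocParser);
ShardStrategy strat = ShardStrategyFactory.shatter(data, seq,
        ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
        seq.getSequenceDictionary(), 100000, genomeLocParser);

// After: thread allocation is explicit and shard boundaries come from a balancer.
SAMDataSource data2 = new SAMDataSource(readers, new ThreadAllocation(), null, genomeLocParser);
Iterable<Shard> shards = data2.createShardIteratorOverMappedReads(
        seq.getSequenceDictionary(), new LocusShardBalancer());
for (Shard shard : shards) { /* seek and iterate, as the tests in this patch do */ }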
- data = new SAMDataSource(readers,genomeLocParser); - strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000, genomeLocParser); - - logger.debug("Pile two:"); - try { - for (Shard sh : strat) { - int readCount = 0; - count++; - - // can we leave? - if (count > shardsToCount) { - break; - } - - StingSAMIterator datum = data.seek(sh); - - for (SAMRecord r : datum) { - readCount++; - } - - readcountPerShard2.add(readCount); - logger.debug("read count = " + readCount); - datum.close(); - } - } - catch (UserException.CouldNotReadInputFile e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. - fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception"); - } - - /*int pos = 0; - for (; pos < 100; pos++) { - if (!readcountPerShard.get(pos).equals(readcountPerShard2.get(pos))) { - fail("Shard number " + pos + " in the two approaches had different read counts, " + readcountPerShard.get(pos) + " and " + readcountPerShard2.get(pos)); - } - } */ - - } - - - - -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java new file mode 100755 index 0000000000..ba2d68ec96 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import static org.testng.Assert.fail; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; + +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +/** + * @author aaron + * @version 1.0 + * @date Apr 8, 2009 + *

+ * Class SAMDataSourceUnitTest + *

+ * The test of the SAM/BAM simple data source. + */ +public class SAMDataSourceUnitTest extends BaseTest { + + private List readers; + private IndexedFastaSequenceFile seq; + private GenomeLocParser genomeLocParser; + + /** + * This function does the setup of our parser, before each method call. + *

+ * Called before every test case method. + */ + @BeforeMethod + public void doForEachTest() throws FileNotFoundException { + readers = new ArrayList(); + + // sequence + seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference)); + genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary()); + } + + /** + * Tears down the test fixture after each call. + *

+ * Called after every test case method. + */ + @AfterMethod + public void undoForEachTest() { + seq = null; + readers.clear(); + } + + + /** Test out that we can shard the file and iterate over every read */ + @Test + public void testLinearBreakIterateAll() { + logger.warn("Executing testLinearBreakIterateAll"); + + // setup the data + readers.add(new SAMReaderID(new File(validationDataLocation+"/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); + + // the sharding strat. + SAMDataSource data = new SAMDataSource(readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + SAMFileReader.ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + false, + false); + + Iterable strat = data.createShardIteratorOverMappedReads(seq.getSequenceDictionary(),new LocusShardBalancer()); + int count = 0; + + try { + for (Shard sh : strat) { + int readCount = 0; + count++; + + GenomeLoc firstLocus = sh.getGenomeLocs().get(0), lastLocus = sh.getGenomeLocs().get(sh.getGenomeLocs().size()-1); + logger.debug("Start : " + firstLocus.getStart() + " stop : " + lastLocus.getStop() + " contig " + firstLocus.getContig()); + logger.debug("count = " + count); + StingSAMIterator datum = data.seek(sh); + + // for the first couple of shards make sure we can see the reads + if (count < 5) { + for (SAMRecord r : datum) { + } + readCount++; + } + datum.close(); + + // if we're over 100 shards, break out + if (count > 100) { + break; + } + } + } + catch (UserException.CouldNotReadInputFile e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception"); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index c9727d904d..4011594f32 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -301,7 +301,6 @@ private static ReadProperties createTestReadProperties() { false, SAMFileReader.ValidationStringency.STRICT, null, - null, new ValidationExclusion(), Collections.emptyList(), false, diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java index 1e39fd26f7..91c18078e7 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java @@ -36,9 +36,11 @@ import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import org.testng.Assert; import org.testng.annotations.*; import java.util.*; @@ -65,9 +67,9 @@ public void beforeClass() { C = Allele.create("C"); G = Allele.create("G"); T = Allele.create("T"); - AC_SNP = new VariantContext("x", 
"chr1", START_POS, START_POS, Arrays.asList(A, C)); - AG_SNP = new VariantContext("x", "chr1", START_POS, START_POS, Arrays.asList(A, G)); - AT_SNP = new VariantContext("x", "chr1", START_POS, START_POS, Arrays.asList(A, T)); + AC_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, C)).make(); + AG_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, G)).make(); + AT_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, T)).make(); span10_10 = makeSpan(10, 10); span1_20 = makeSpan(1, 20); span10_20 = makeSpan(10, 20); diff --git a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java index 02e1ba99af..c9b81a9d35 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java @@ -26,6 +26,7 @@ import org.broadinstitute.sting.BaseTest; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; public class GATKReportUnitTest extends BaseTest { @@ -44,12 +45,32 @@ public void testParse() throws Exception { Assert.assertEquals(validationReport.getVersion(), GATKReportVersion.V0_1); Object validationReportPK = countVariants.getPrimaryKey("none.eval.none.known"); Assert.assertEquals(validationReport.get(validationReportPK, "sensitivity"), "NaN"); + } - GATKReportTable simpleMetricsByAC = report.getTable("SimpleMetricsByAC.metrics"); - Assert.assertEquals(simpleMetricsByAC.getVersion(), GATKReportVersion.V0_1); - Object simpleMetricsByACPK = simpleMetricsByAC.getPrimaryKey("none.eval.none.novel.ac2"); - Assert.assertEquals(simpleMetricsByAC.get(simpleMetricsByACPK, "AC"), "2"); + @DataProvider(name = "rightAlignValues") + public Object[][] getRightAlignValues() { + return new Object[][] { + new Object[] {null, true}, + new Object[] {"null", true}, + new Object[] {"NA", true}, + new Object[] {"0", true}, + new Object[] {"0.0", true}, + new Object[] {"-0", true}, + new Object[] {"-0.0", true}, + new Object[] {String.valueOf(Long.MAX_VALUE), true}, + new Object[] {String.valueOf(Long.MIN_VALUE), true}, + new Object[] {String.valueOf(Float.MIN_NORMAL), true}, + new Object[] {String.valueOf(Double.MAX_VALUE), true}, + new Object[] {String.valueOf(Double.MIN_VALUE), true}, + new Object[] {String.valueOf(Double.POSITIVE_INFINITY), true}, + new Object[] {String.valueOf(Double.NEGATIVE_INFINITY), true}, + new Object[] {String.valueOf(Double.NaN), true}, + new Object[] {"hello", false} + }; + } - Assert.assertFalse(simpleMetricsByAC.containsPrimaryKey("none.eval.none.novel.ac2.bad")); + @Test(dataProvider = "rightAlignValues") + public void testIsRightAlign(String value, boolean expected) { + Assert.assertEquals(GATKReportColumn.isRightAlign(value), expected, "right align of '" + value + "'"); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java index d498ee61a1..7f21da4f4d 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java @@ -27,11 +27,42 @@ public class SampleDBUnitTest extends BaseTest { new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), new Sample("mom", "fam1", null, null, Gender.FEMALE, 
Affection.AFFECTED))); + private static final Set testPEDFamilyF2 = new HashSet(Arrays.asList( + new Sample("s2", "fam2", "d2", "m2", Gender.FEMALE, Affection.AFFECTED), + new Sample("d2", "fam2", null, null, Gender.MALE, Affection.UNKNOWN), + new Sample("m2", "fam2", null, null, Gender.FEMALE, Affection.UNKNOWN) + )); + + private static final Set testPEDFamilyF3 = new HashSet(Arrays.asList( + new Sample("s1", "fam3", "d1", "m1", Gender.FEMALE, Affection.AFFECTED), + new Sample("d1", "fam3", null, null, Gender.FEMALE, Affection.UNKNOWN), + new Sample("m1", "fam3", null, null, Gender.FEMALE, Affection.UNKNOWN) + )); + private static final Set testSAMSamples = new HashSet(Arrays.asList( new Sample("kid", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN), new Sample("mom", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN), new Sample("dad", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN))); + private static final HashMap> testGetFamilies = new HashMap>(); + static { + testGetFamilies.put("fam1", testPEDSamples); + testGetFamilies.put("fam2", testPEDFamilyF2); + testGetFamilies.put("fam3", testPEDFamilyF3); + } + + private static final Set testKidsWithParentsFamilies2 = new HashSet(Arrays.asList( + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("kid3", "fam5", "dad2", "mom2", Gender.MALE, Affection.AFFECTED), + new Sample("kid2", "fam5", "dad2", "mom2", Gender.MALE, Affection.AFFECTED))); + + private static final HashSet testGetPartialFamiliesIds = new HashSet(Arrays.asList("kid","s1")); + private static final HashMap> testGetPartialFamilies = new HashMap>(); + static { + testGetPartialFamilies.put("fam1", new HashSet(Arrays.asList(new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED)))); + testGetPartialFamilies.put("fam3", new HashSet(Arrays.asList(new Sample("s1", "fam3", "d1", "m1", Gender.FEMALE, Affection.AFFECTED)))); + } + private static final String testPEDString = String.format("%s%n%s%n%s", "fam1 kid dad mom 1 2", @@ -46,6 +77,18 @@ public class SampleDBUnitTest extends BaseTest { "fam3 s1 d1 m1 2 2", "fam2 s2 d2 m2 2 2"); + private static final String testPEDMultipleFamilies2 = + String.format("%s%n%s%n%s%n%s%n%s%n%s%n%s%n%s%n%s", + "fam1 kid dad mom 1 2", + "fam1 dad 0 0 1 1", + "fam1 mom 0 0 2 2", + "fam4 kid4 dad4 0 1 2", + "fam4 dad4 0 0 1 1", + "fam5 kid2 dad2 mom2 1 2", + "fam5 kid3 dad2 mom2 1 2", + "fam5 dad2 0 0 1 1", + "fam5 mom2 0 0 2 2"); + private static final String testPEDStringInconsistentGender = "fam1 kid 0 0 2 2"; @@ -138,6 +181,25 @@ public void getFamily() { Assert.assertEquals(db.getFamily("fam1"), testPEDSamplesAsSet); } + @Test() + public void getFamilies(){ + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDMultipleFamilies)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(db.getFamilies(),testGetFamilies); + Assert.assertEquals(db.getFamilies(null),testGetFamilies); + Assert.assertEquals(db.getFamilies(testGetPartialFamiliesIds),testGetPartialFamilies); + } + + @Test() + public void testGetChildrenWithParents() + { + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDMultipleFamilies2)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(db.getChildrenWithParents(), testKidsWithParentsFamilies2); + Assert.assertEquals(db.getChildrenWithParents(false), testKidsWithParentsFamilies2); + Assert.assertEquals(db.getChildrenWithParents(true), new HashSet(Arrays.asList(new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED)))); 
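For reference, the PED strings above use the standard six whitespace-separated pedigree columns: family ID, individual ID, father ID, mother ID, sex (1 = male, 2 = female) and affection status (1 = unaffected, 2 = affected), with "0" marking a missing parent. A minimal, self-contained sketch of that column layout (PedLineSketch is a hypothetical helper, not the GATK pedigree parser):

class PedLineSketch {
    // Decodes one PED line into the description the Sample expectations above encode.
    static String describe(String pedLine) {
        String[] f = pedLine.trim().split("\\s+");
        String father = "0".equals(f[2]) ? "none" : f[2];   // "0" == parent not recorded
        String mother = "0".equals(f[3]) ? "none" : f[3];
        String sex = "1".equals(f[4]) ? "MALE" : "2".equals(f[4]) ? "FEMALE" : "UNKNOWN";
        String affection = "2".equals(f[5]) ? "AFFECTED" : "1".equals(f[5]) ? "UNAFFECTED" : "UNKNOWN";
        return f[1] + " (family " + f[0] + "): father=" + father + ", mother=" + mother
                + ", " + sex + ", " + affection;
    }

    public static void main(String[] args) {
        // "fam1 kid dad mom 1 2" matches Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED)
        System.out.println(describe("fam1 kid dad mom 1 2"));
        System.out.println(describe("fam1 mom 0 0 2 2"));
    }
}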
+ } + @Test() public void loadFamilyIDs() { builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDMultipleFamilies)); diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java index 7f4d96adde..9226f97e2e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java @@ -5,14 +5,13 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.ReadMetrics; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; +import org.broadinstitute.sting.gatk.datasources.reads.ReadShardBalancer; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy; -import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategyFactory; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.walkers.qc.CountReadsWalker; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -66,7 +65,6 @@ public class TraverseReadsUnitTest extends BaseTest { private List bamList; private Walker countReadWalker; private File output; - private long readSize = 100000; private TraverseReads traversalEngine = null; private IndexedFastaSequenceFile ref = null; @@ -117,18 +115,14 @@ public void doForEachTest() { /** Test out that we can shard the file and iterate over every read */ @Test public void testUnmappedReadCount() { - SAMDataSource dataSource = new SAMDataSource(bamList,genomeLocParser); - ShardStrategy shardStrategy = ShardStrategyFactory.shatter(dataSource,ref, ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL, - ref.getSequenceDictionary(), - readSize, - genomeLocParser); + SAMDataSource dataSource = new SAMDataSource(bamList,new ThreadAllocation(),null,genomeLocParser); + Iterable shardStrategy = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); countReadWalker.initialize(); Object accumulator = countReadWalker.reduceInit(); - while (shardStrategy.hasNext()) { + for(Shard shard: shardStrategy) { traversalEngine.startTimersIfNecessary(); - Shard shard = shardStrategy.next(); if (shard == null) { fail("Shard == null"); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java index a01a3f2817..e26d6174bc 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java @@ -33,5 +33,6 @@ public void testExtendedEventPileup() { WalkerTestSpec spec = new WalkerTestSpec(gatk_args,1,Arrays.asList(expected_md5)); executeTest("Testing the extended pileup with indel records included on a small chunk of Ovarian dataset with 20 indels (1 D, 19 I)", spec); + } } diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java index 462abeba11..5c8fa32a88 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java @@ -33,7 +33,7 @@ public class SnpEffUnitTest { @Test public void testParseWellFormedEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); Assert.assertTrue( effect.isWellFormed() && effect.isCoding() ); @@ -42,7 +42,7 @@ public void testParseWellFormedEffect() { @Test public void testParseInvalidEffectNameEffect() { String effectName = "MADE_UP_EFFECT"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); Assert.assertFalse(effect.isWellFormed()); @@ -51,7 +51,7 @@ public void testParseInvalidEffectNameEffect() { @Test public void testParseInvalidEffectImpactEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MEDIUM", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + String[] effectMetadata = { "MEDIUM", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); Assert.assertFalse(effect.isWellFormed()); @@ -60,27 +60,27 @@ public void testParseInvalidEffectImpactEffect() { @Test public void testParseWrongNumberOfMetadataFieldsEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); Assert.assertFalse(effect.isWellFormed()); } @Test - public void testParseSnpEffWarningEffect() { + public void testParseSnpEffOneWarningOrErrorEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING_OR_ERROR_TEXT" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); - Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning: SNPEFF_WARNING") ); + Assert.assertTrue( ! 
effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning or error: \"SNPEFF_WARNING_OR_ERROR_TEXT\"") ); } @Test - public void testParseSnpEffErrorEffect() { + public void testParseSnpEffBothWarningAndErrorEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "", "SNPEFF_ERROR" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING_TEXT", "SNPEFF_ERROR_TEXT" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); - Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following error: SNPEFF_ERROR") ); + Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning: \"SNPEFF_WARNING_TEXT\", and the following error: \"SNPEFF_ERROR_TEXT\"") ); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 8e887c32a9..8b101d1d5c 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -32,7 +32,7 @@ public void testHasAnnotsNotAsking2() { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("8e7de435105499cd71ffc099e268a83e")); + Arrays.asList("e70eb5f80c93e366dcbe3cf684c154e4")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -40,7 +40,7 @@ public void testHasAnnotsAsking1() { public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("64b6804cb1e27826e3a47089349be581")); + Arrays.asList("2977bb30c8b84a5f4094fe6090658561")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -54,6 +54,8 @@ public void testNoAnnotsNotAsking1() { @Test public void testNoAnnotsNotAsking2() { + // the genotype annotations in this file are actually out of order. If you don't parse the genotypes + // they don't get reordered. It's a good test of the genotype ordering system.
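The ordering comment above refers to how VCF stores per-sample data: each sample's colon-separated values must line up positionally with the record's FORMAT keys, so genotypes re-emitted without being parsed keep whatever key order the input file had. A small self-contained sketch of the re-pairing that parsing performs (GenotypeOrderSketch and reorderSampleField are hypothetical illustrations, not GATK's VCF codec):

import java.util.HashMap;
import java.util.Map;
import java.util.StringJoiner;

class GenotypeOrderSketch {
    // Re-pairs one sample's values with a target FORMAT key order.
    static String reorderSampleField(String sourceFormat, String sampleValues, String targetFormat) {
        String[] keys = sourceFormat.split(":");
        String[] vals = sampleValues.split(":");
        Map<String, String> byKey = new HashMap<>();
        for (int i = 0; i < keys.length && i < vals.length; i++)
            byKey.put(keys[i], vals[i]);
        StringJoiner out = new StringJoiner(":");
        for (String key : targetFormat.split(":"))
            out.add(byKey.getOrDefault(key, "."));   // "." is VCF's missing-value marker
        return out.toString();
    }

    public static void main(String[] args) {
        // A record written with FORMAT GQ:GT:DP, re-emitted under the canonical GT:GQ:DP order:
        System.out.println(reorderSampleField("GQ:GT:DP", "99:0/1:30", "GT:GQ:DP"));   // prints 0/1:99:30
    }
}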
WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, Arrays.asList("f2ddfa8105c290b1f34b7a261a02a1ac")); @@ -64,7 +66,7 @@ public void testNoAnnotsNotAsking2() { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("fd1ffb669800c2e07df1e2719aa38e49")); + Arrays.asList("1e52761fdff73a5361b5eb0a6e5d9dad")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -72,7 +74,7 @@ public void testNoAnnotsAsking1() { public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("09f8e840770a9411ff77508e0ed0837f")); + Arrays.asList("0948cd1dba7d61f283cc4cf2a7757d92")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } @@ -80,7 +82,7 @@ public void testNoAnnotsAsking2() { public void testExcludeAnnotations() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard -XA FisherStrand -XA ReadPosRankSumTest --variant:VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("b49fe03aa4b675db80a9db38a3552c95")); + Arrays.asList("bb4eebfaffc230cb8a31e62e7b53a300")); executeTest("test exclude annotations", spec); } @@ -88,7 +90,7 @@ public void testExcludeAnnotations() { public void testOverwritingHeader() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + validationDataLocation + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, - Arrays.asList("78d2c19f8107d865970dbaf3e12edd92")); + Arrays.asList("062155edec46a8c52243475fbf3a2943")); executeTest("test overwriting header", spec); } @@ -96,7 +98,7 @@ public void testOverwritingHeader() { public void testNoReads() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + validationDataLocation + "vcfexample3empty.vcf -L " + validationDataLocation + "vcfexample3empty.vcf", 1, - Arrays.asList("16e3a1403fc376320d7c69492cad9345")); + Arrays.asList("06635f2dd91b539bfbce9bf7914d8e43")); executeTest("not passing it any reads", spec); } @@ -104,7 +106,7 @@ public void testNoReads() { public void testDBTagWithDbsnp() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --dbsnp " + b36dbSNP129 + " -G Standard --variant " + validationDataLocation + "vcfexample3empty.vcf -L " + validationDataLocation + "vcfexample3empty.vcf", 1, - Arrays.asList("3da8ca2b6bdaf6e92d94a8c77a71313d")); + Arrays.asList("820eeba1f6e3a0758a69d937c524a38e")); executeTest("getting DB tag with dbSNP", spec); } @@ -112,7 +114,7 @@ public void testDBTagWithDbsnp() { public void testDBTagWithHapMap() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --comp:H3 " + validationDataLocation + "fakeHM3.vcf -G Standard --variant " + validationDataLocation + "vcfexample3empty.vcf -L " + 
validationDataLocation + "vcfexample3empty.vcf", 1, - Arrays.asList("1bc01c5b3bd0b7aef75230310c3ce688")); + Arrays.asList("31cc2ce157dd20771418c08d6b3be1fa")); executeTest("getting DB tag with HM3", spec); } @@ -120,10 +122,18 @@ public void testDBTagWithHapMap() { public void testUsingExpression() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --resource:foo " + validationDataLocation + "targetAnnotations.vcf -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -E foo.AF -L " + validationDataLocation + "vcfexample3empty.vcf", 1, - Arrays.asList("ae30a1ac7bfbc3d22a327f8b689cad31")); + Arrays.asList("074865f8f8c0ca7bfd58681f396c49e9")); executeTest("using expression", spec); } + @Test + public void testUsingExpressionWithID() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --resource:foo " + validationDataLocation + "targetAnnotations.vcf -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -E foo.ID -L " + validationDataLocation + "vcfexample3empty.vcf", 1, + Arrays.asList("97b26db8135d083566fb585a677fbe8a")); + executeTest("using expression with ID", spec); + } + @Test public void testTabixAnnotations() { final String MD5 = "13269d5a2e16f06fd755cc0fb9271acf"; @@ -135,19 +145,19 @@ public void testTabixAnnotations() { } } - @Test + @Test(enabled = false) public void testSnpEffAnnotations() { WalkerTestSpec spec = new WalkerTestSpec( "-T VariantAnnotator -R " + hg19Reference + " -NO_HEADER -o %s -A SnpEff --variant " + validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + - "snpEff.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429", + "snpEff2.0.4.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429", 1, - Arrays.asList("122321a85e448f21679f6ca15c5e22ad") + Arrays.asList("51258f5c880bd1ca3eb45a1711335c66") ); executeTest("Testing SnpEff annotations", spec); } - @Test + @Test(enabled = false) public void testSnpEffAnnotationsUnsupportedVersion() { WalkerTestSpec spec = new WalkerTestSpec( "-T VariantAnnotator -R " + hg19Reference + " -NO_HEADER -o %s -A SnpEff --variant " + @@ -158,4 +168,15 @@ public void testSnpEffAnnotationsUnsupportedVersion() { ); executeTest("Testing SnpEff annotations (unsupported version)", spec); } + + @Test + public void testTDTAnnotation() { + final String MD5 = "204e67536a17af7eaa6bf0a910818997"; + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantAnnotator -R " + b37KGReference + " -A TransmissionDisequilibriumTest --variant:vcf " + validationDataLocation + "ug.random50000.subset300bp.chr1.family.vcf" + + " -L " + validationDataLocation + "ug.random50000.subset300bp.chr1.family.vcf -NO_HEADER -ped " + validationDataLocation + "ug.random50000.family.ped -o %s", 1, + Arrays.asList(MD5)); + executeTest("Testing TDT annotation", spec); + } + } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java index 1f3f8ebe6a..02332b64e9 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java @@ -32,13 +32,13 @@ public class CallableLociWalkerIntegrationTest extends WalkerTest { final static String commonArgs = "-R " + b36KGReference + " -T CallableLoci -I " + validationDataLocation + 
"/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s"; - final static String SUMMARY_MD5 = "ed4c255bb78313b8e7982127caf3d6c4"; + final static String SUMMARY_MD5 = "ffdbd9cdcb4169ebed5ae4bec797260f"; @Test public void testCallableLociWalkerBed() { String gatk_args = commonArgs + " -format BED -L 1:10,000,000-11,000,000 -summary %s"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, - Arrays.asList("884c9c2d96419d990a708d2bd98fcefa", SUMMARY_MD5)); + Arrays.asList("9e4ec9c23f21a8162d27a39ab057398c", SUMMARY_MD5)); executeTest("formatBed", spec); } @@ -46,13 +46,13 @@ public void testCallableLociWalkerBed() { public void testCallableLociWalkerPerBase() { String gatk_args = commonArgs + " -format STATE_PER_BASE -L 1:10,000,000-11,000,000 -summary %s"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, - Arrays.asList("fb4524f8b3b213060c0c5b85362b5902", SUMMARY_MD5)); + Arrays.asList("e6044b4495ef24f542403e6a94437068", SUMMARY_MD5)); executeTest("format_state_per_base", spec); } @Test public void testCallableLociWalker2() { - String gatk_args = commonArgs + " -format BED -L 1:10,000,000-10,000,100;1:10,000,110-10,000,120 -summary %s"; + String gatk_args = commonArgs + " -format BED -L 1:10,000,000-10,000,100 -L 1:10,000,110-10,000,120 -summary %s"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, Arrays.asList("c671f65712d9575b8b3e1f1dbedc146e", "d287510eac04acf5a56f5cde2cba0e4a")); executeTest("formatBed by interval", spec); @@ -62,7 +62,7 @@ public void testCallableLociWalker2() { public void testCallableLociWalker3() { String gatk_args = commonArgs + " -format BED -L 1:10,000,000-11,000,000 -minDepth 10 -maxDepth 100 --minBaseQuality 10 --minMappingQuality 20 -summary %s"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, - Arrays.asList("86bd1a5f79356b3656412c4b1c60709a", "6fefb144a60b89c27293ce5ca6e10e6a")); + Arrays.asList("4496551d4493857e5153d8172965e527", "b0667e31af9aec02eaf73ca73ec16937")); executeTest("formatBed lots of arguments", spec); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java index 043b2eaf23..84603f066c 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java @@ -69,6 +69,7 @@ public void testMapQ0Only() { spec.addAuxFile("347b47ef73fbd4e277704ddbd7834f69", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); spec.addAuxFile("4ec920335d4b9573f695c39d62748089", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); + execute("testMapQ0Only",spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java index 646fb5e779..f2f72978f2 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java @@ -55,26 +55,26 @@ public void testBaseOutputNoFiltering() { spec.setOutputFileLocation(baseOutputFile); // now add the expected files that get generated - spec.addAuxFile("423571e4c05e7934322172654ac6dbb7", baseOutputFile); - 
spec.addAuxFile("9df5e7e07efeb34926c94a724714c219", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_counts")); - spec.addAuxFile("229b9b5bc2141c86dbc69c8acc9eba6a", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_proportions")); + spec.addAuxFile("2f072fd8b41b5ac1108797f89376c797", baseOutputFile); + spec.addAuxFile("d17ac7cc0b58ba801d2b0727a363d615", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_counts")); + spec.addAuxFile("c05190c9e6239cdb1cd486edcbc23505", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_proportions")); spec.addAuxFile("9cd395f47b329b9dd00ad024fcac9929", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_statistics")); - spec.addAuxFile("471c34ad2e4f7228efd20702d5941ba9", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); - spec.addAuxFile("9667c77284c2c08e647b162d0e9652d4", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_statistics")); - spec.addAuxFile("5a96c75f96d6fa6ee617451d731dae37", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_summary")); - spec.addAuxFile("b82846df660f0aac8429aec57c2a62d6", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_counts")); - spec.addAuxFile("d32a8c425fadcc4c048bd8b48d0f61e5", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_proportions")); + spec.addAuxFile("c94a52b4e73a7995319e0b570c80d2f7", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); + spec.addAuxFile("1970a44efb7ace4e51a37f0bd2dc84d1", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_statistics")); + spec.addAuxFile("c321c542be25359d2e26d45cbeb6d7ab", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_summary")); + spec.addAuxFile("9023cc8939777d515cd2895919a99688", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_counts")); + spec.addAuxFile("3597b69e90742c5dd7c83fbc74d079f3", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_proportions")); spec.addAuxFile("7b9d0e93bf5b5313995be7010ef1f528", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_statistics")); - spec.addAuxFile("2aae346204c5f15517158da8e61a6c16", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_summary")); - spec.addAuxFile("e70952f241eebb9b5448f2e7cb288131", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_statistics")); - spec.addAuxFile("054ed1e184f46d6a170dc9bf6524270c", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_summary")); - spec.addAuxFile("d53431022f7387fe9ac47814ab1fcd88", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_counts")); - spec.addAuxFile("a395dafde101971d2b9e5ddb6cd4b7d0", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_proportions")); + spec.addAuxFile("1a6ea3aa759fb154ccc4e171ebca9d02", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_summary")); + spec.addAuxFile("b492644ff06b4ffb044d5075cd168abf", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_statistics")); + spec.addAuxFile("77cef87dc4083a7b60b7a7b38b4c0bd8", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_summary")); + 
spec.addAuxFile("8e1adbe37b98bb2271ba13932d5c947f", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_counts")); + spec.addAuxFile("761d2f9daf2ebaf43abf65c8fd2fcd05", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_proportions")); spec.addAuxFile("df0ba76e0e6082c0d29fcfd68efc6b77", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_statistics")); - spec.addAuxFile("e013cb5b11b0321a81c8dbd7c1863787", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); - spec.addAuxFile("661160f571def8c323345b5859cfb9da", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); - spec.addAuxFile("c95a7a6840334cadd0e520939615c77b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); - + spec.addAuxFile("0582b4681dbc02ece2dfe2752dcfd228", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); + spec.addAuxFile("0685214965bf1863f7ce8de2e38af060", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); + spec.addAuxFile("7a0cd8a5ebaaa82621fd3b5aed9c32fe", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); + execute("testBaseOutputNoFiltering",spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java index c8a25c97bf..9b79653c61 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java @@ -50,8 +50,8 @@ public String toString() { @DataProvider(name = "data") public Object[][] createData() { - new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "ed377322c615abc7dceb97025076078d"); - new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "02e46f5d2ebb3d49570850595b3f792e"); + new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "da3dc85a0e35a9aade5520591891b4fa"); + new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "7dc8200730313e6753237a696296fb73"); return TestParams.getTests(TestParams.class); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java index 9af39e92cc..1c5db4262e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java @@ -12,25 +12,25 @@ public void testIntervals() { String md5_1 = "328d2d52cedfdc52da7d1abff487633d"; WalkerTestSpec spec1a = new WalkerTestSpec( - "-T FastaAlternateReferenceMaker -R " + b36KGReference + " -L 1:10,000,100-10,000,500;1:10,100,000-10,101,000;1:10,900,000-10,900,001 -o %s", + "-T FastaAlternateReferenceMaker -R " + b36KGReference + " -L 1:10,000,100-10,000,500 -L 1:10,100,000-10,101,000 -L 1:10,900,000-10,900,001 -o %s", 1, Arrays.asList(md5_1)); executeTest("testFastaReference", spec1a); WalkerTestSpec spec1b = new WalkerTestSpec( - "-T FastaReferenceMaker -R " + b36KGReference + " -L 1:10,000,100-10,000,500;1:10,100,000-10,101,000;1:10,900,000-10,900,001 -o %s", + "-T 
FastaReferenceMaker -R " + b36KGReference + " -L 1:10,000,100-10,000,500 -L 1:10,100,000-10,101,000 -L 1:10,900,000-10,900,001 -o %s", 1, Arrays.asList(md5_1)); executeTest("testFastaReference", spec1b); WalkerTestSpec spec2 = new WalkerTestSpec( - "-T FastaAlternateReferenceMaker -R " + b36KGReference + " -V " + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 --snpmask:vcf " + b36dbSNP129 + " -L 1:10,075,000-10,075,380;1:10,093,447-10,093,847;1:10,271,252-10,271,452 -o %s", + "-T FastaAlternateReferenceMaker -R " + b36KGReference + " -V " + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 --snpmask:vcf " + b36dbSNP129 + " -L 1:10,075,000-10,075,380 -L 1:10,093,447-10,093,847 -L 1:10,271,252-10,271,452 -o %s", 1, Arrays.asList("0567b32ebdc26604ddf2a390de4579ac")); executeTest("testFastaAlternateReferenceIndels", spec2); WalkerTestSpec spec3 = new WalkerTestSpec( - "-T FastaAlternateReferenceMaker -R " + b36KGReference + " -V " + GATKDataLocation + "dbsnp_129_b36.vcf -L 1:10,023,400-10,023,500;1:10,029,200-10,029,500 -o %s", + "-T FastaAlternateReferenceMaker -R " + b36KGReference + " -V " + GATKDataLocation + "dbsnp_129_b36.vcf -L 1:10,023,400-10,023,500 -L 1:10,029,200-10,029,500 -o %s", 1, Arrays.asList("8b6cd2e20c381f9819aab2d270f5e641")); executeTest("testFastaAlternateReferenceSnps", spec3); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java index 1cb43ceb1a..2c04cebd43 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java @@ -29,17 +29,23 @@ public void testClusteredSnps() { } @Test - public void testMasks() { + public void testMask1() { WalkerTestSpec spec1 = new WalkerTestSpec( baseTestString() + " -maskName foo --mask:VCF3 " + validationDataLocation + "vcfexample2.vcf --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, Arrays.asList("578f9e774784c25871678e6464fd212b")); executeTest("test mask all", spec1); + } + @Test + public void testMask2() { WalkerTestSpec spec2 = new WalkerTestSpec( baseTestString() + " -maskName foo --mask:VCF " + validationDataLocation + "vcfMask.vcf --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, Arrays.asList("bfa86a674aefca1b13d341cb14ab3c4f")); executeTest("test mask some", spec2); + } + @Test + public void testMask3() { WalkerTestSpec spec3 = new WalkerTestSpec( baseTestString() + " -maskName foo -maskExtend 10 --mask:VCF " + validationDataLocation + "vcfMask.vcf --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, Arrays.asList("5939f80d14b32d88587373532d7b90e5")); @@ -71,12 +77,15 @@ public void testFilterWithSeparateNames() { } @Test - public void testGenotypeFilters() { + public void testGenotypeFilters1() { WalkerTestSpec spec1 = new WalkerTestSpec( baseTestString() + " -G_filter 'GQ == 0.60' -G_filterName foo --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, Arrays.asList("96b61e4543a73fe725e433f007260039")); executeTest("test genotype filter #1", spec1); + } + @Test + public void testGenotypeFilters2() { WalkerTestSpec spec2 = new WalkerTestSpec( baseTestString() + " -G_filter 'AF == 0.04 && isHomVar == 1' -G_filterName 
foo --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, Arrays.asList("6c8112ab17ce39c8022c891ae73bf38e")); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java new file mode 100644 index 0000000000..6f259f6992 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java @@ -0,0 +1,108 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; + + +public class ExactAFCalculationModelUnitTest extends BaseTest { + + static double[] AA1, AB1, BB1; + static double[] AA2, AB2, AC2, BB2, BC2, CC2; + static final int numSamples = 3; + static double[][] priors = new double[2][2*numSamples+1]; // flat priors + + @BeforeSuite + public void before() { + AA1 = new double[]{0.0, -20.0, -20.0}; + AB1 = new double[]{-20.0, 0.0, -20.0}; + BB1 = new double[]{-20.0, -20.0, 0.0}; + AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0}; + AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0}; + AC2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0}; + BB2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0}; + BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0}; + CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0}; + } + + private class GetGLsTest extends TestDataProvider { + GenotypesContext GLs; + int numAltAlleles; + String name; + + private GetGLsTest(String name, int numAltAlleles, Genotype... 
arg) { + super(GetGLsTest.class, name); + GLs = GenotypesContext.create(arg); + this.name = name; + this.numAltAlleles = numAltAlleles; + } + + public String toString() { + return String.format("%s input=%s", super.toString(), GLs); + } + } + + private static Genotype createGenotype(String name, double[] gls) { + return new Genotype(name, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), Genotype.NO_LOG10_PERROR, gls); + } + + @DataProvider(name = "getGLs") + public Object[][] createGLsData() { + + // bi-allelic case + new GetGLsTest("B0", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AA3", AA1)); + new GetGLsTest("B1", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AB", AB1)); + new GetGLsTest("B2", 1, createGenotype("AA1", AA1), createGenotype("BB", BB1), createGenotype("AA2", AA1)); + new GetGLsTest("B3a", 1, createGenotype("AB", AB1), createGenotype("AA", AA1), createGenotype("BB", BB1)); + new GetGLsTest("B3b", 1, createGenotype("AB1", AB1), createGenotype("AB2", AB1), createGenotype("AB3", AB1)); + new GetGLsTest("B4", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("AA", AA1)); + new GetGLsTest("B5", 1, createGenotype("BB1", BB1), createGenotype("AB", AB1), createGenotype("BB2", BB1)); + new GetGLsTest("B6", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("BB3", BB1)); + + // tri-allelic case + new GetGLsTest("B1C0", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AB", AB2)); + new GetGLsTest("B0C1", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AC", AC2)); + new GetGLsTest("B1C1a", 2, createGenotype("AA", AA2), createGenotype("AB", AB2), createGenotype("AC", AC2)); + new GetGLsTest("B1C1b", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("BC", BC2)); + new GetGLsTest("B2C1", 2, createGenotype("AB1", AB2), createGenotype("AB2", AB2), createGenotype("AC", AC2)); + new GetGLsTest("B3C2a", 2, createGenotype("AB", AB2), createGenotype("BC1", BC2), createGenotype("BC2", BC2)); + new GetGLsTest("B3C2b", 2, createGenotype("AB", AB2), createGenotype("BB", BB2), createGenotype("CC", CC2)); + + return GetGLsTest.getTests(GetGLsTest.class); + } + + + @Test(dataProvider = "getGLs") + public void testGLs(GetGLsTest cfg) { + + final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2, 2*numSamples); + for ( int i = 0; i < 2; i++ ) { + for ( int j = 0; j < 2*numSamples+1; j++ ) { + result.log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + result.log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + } + } + + ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result, false); + + int nameIndex = 1; + for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { + int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); + int calculatedAlleleCount = MathUtils.maxElementIndex(result.log10AlleleFrequencyPosteriors[allele]); + + if ( result.log10AlleleFrequencyPosteriors[0][0] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) { + Assert.assertTrue(calculatedAlleleCount == expectedAlleleCount || result.log10AlleleFrequencyPosteriors[0][calculatedAlleleCount] < result.log10PosteriorOfAFzero); + } else { + Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); + } + } + } +} diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriorsUnitTest.java similarity index 98% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriorsUnitTest.java index 425b969e23..a87f121f69 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriorsUnitTest.java @@ -7,7 +7,7 @@ import static java.lang.Math.log10; -public class GenotypeLikelihoodsUnitTest extends BaseTest { +public class GenotypePriorsUnitTest extends BaseTest { private final static double DELTA = 1e-8; @Test diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index b80f214b1a..f7d6af3a74 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -5,10 +5,8 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; -import java.io.File; import java.util.Arrays; import java.util.HashMap; -import java.util.List; import java.util.Map; // ********************************************************************************** // @@ -30,20 +28,23 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("b27939251539439a382538e507e03507")); + Arrays.asList("66ed60c6c1190754abd8a0a9d1d8d61e")); executeTest("test MultiSample Pilot1", spec); } @Test - public void testWithAllelesPassedIn() { + public void testWithAllelesPassedIn1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("8de2602679ffc92388da0b6cb4325ef6")); + Arrays.asList("ea5b5dcea3a6eef7ec60070b551c994e")); executeTest("test MultiSample Pilot2 with alleles passed in", spec1); + } + @Test + public void testWithAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("6458f3b8fe4954e2ffc2af972aaab19e")); + Arrays.asList("43e7a17d95b1a0cf72e669657794d802")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -51,7 +52,7 @@ public void testWithAllelesPassedIn() { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 
1:10,000,000-10,100,000", 1, - Arrays.asList("6762b72ae60155ad71738d7c76b80e4b")); + Arrays.asList("ae29b9c9aacce8046dc780430540cd62")); executeTest("test SingleSample Pilot2", spec); } @@ -61,7 +62,7 @@ public void testSingleSamplePilot2() { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "bc71dba7bbdb23e7d5cc60461fdd897b"; + private final static String COMPRESSED_OUTPUT_MD5 = "fda341de80b3f6fd42a83352b18b1d65"; @Test public void testCompressedOutput() { @@ -82,7 +83,7 @@ public void testParallelization() { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "b9504e446b9313559c3ed97add7e8dc1"; + String md5 = "32a34362dff51d8b73a3335048516d82"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -113,8 +114,8 @@ public void testParallelization() { @Test public void testCallingParameters() { HashMap e = new HashMap(); - e.put( "--min_base_quality_score 26", "bb3f294eab3e2cf52c70e63b23aac5ee" ); - e.put( "--computeSLOD", "eb34979efaadba1e34bd82bcacf5c722" ); + e.put( "--min_base_quality_score 26", "7acb1a5aee5fdadb0cc0ea07a212efc6" ); + e.put( "--computeSLOD", "6172d2f3d370132f4c57a26aa94c256e" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -127,9 +128,9 @@ public void testCallingParameters() { @Test public void testOutputParameter() { HashMap e = new HashMap(); - e.put( "-sites_only", "d40114aa201aa33ff5f174f15b6b73af" ); - e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "3c681b053fd2280f3c42041d24243752" ); - e.put( "--output_mode EMIT_ALL_SITES", "eafa6d71c5ecd64dfee5d7a3f60e392e" ); + e.put( "-sites_only", "44f3b5b40e6ad44486cddfdb7e0bfcd8" ); + e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "42e4ea7878ef8d96215accb3ba4e97b7" ); + e.put( "--output_mode EMIT_ALL_SITES", "e0443c720149647469f2a2f3fb73942f" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -143,12 +144,15 @@ public void testOutputParameter() { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("c71ca370947739cb7d87b59452be7a07")); + Arrays.asList("902327e8a45fe585c8dfd1a7c4fcf60f")); executeTest("test confidence 1", spec1); + } + @Test + public void testConfidence2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, - Arrays.asList("1c0a599d475cc7d5e745df6e9b6c0d29")); + Arrays.asList("2343ac8113791f4e79643b333b34afc8")); executeTest("test confidence 2", spec2); } @@ -160,8 +164,8 @@ public void testConfidence() { @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "f84da90c310367bd51f2ab6e346fa3d8" ); - e.put( 1.0 / 1850, "5791e7fef40d4412b6d8f84e0a809c6c" ); + e.put( 0.01, "2cb2544739e01f6c08fd820112914317" ); + e.put( 1.0 / 1850, "730b2b83a4b1f6d46fc3b5cd7d90756c" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new 
WalkerTest.WalkerTestSpec( @@ -185,7 +189,7 @@ public void testMultiTechnologies() { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("9cc9538ac83770e12bd0830d285bfbd0")); + Arrays.asList("2b2729414ae855d390e7940956745bce")); executeTest(String.format("test multiple technologies"), spec); } @@ -204,7 +208,7 @@ public void testCallingWithBAQ() { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("eaf8043edb46dfbe9f97ae03baa797ed")); + Arrays.asList("95c6120efb92e5a325a5cec7d77c2dab")); executeTest(String.format("test calling with BAQ"), spec); } @@ -223,7 +227,7 @@ public void testSimpleIndels() { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("eeba568272f9b42d5450da75c7cc6d2d")); + Arrays.asList("b11df6587e4e16cb819d76a900446946")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -238,7 +242,7 @@ public void testIndelsWithLowMinAlleleCnt() { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("5fe98ee853586dc9db58f0bc97daea63")); + Arrays.asList("2ad52c2e75b3ffbfd8f03237c444e8e6")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -251,7 +255,7 @@ public void testMultiTechnologyIndels() { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("19ff9bd3139480bdf79dcbf117cf2b24")); + Arrays.asList("59068bc8888ad5f08790946066d76602")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -261,7 +265,7 @@ public void testWithIndelAllelesPassedIn1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("118918f2e9e56a3cfc5ccb2856d529c8")); + Arrays.asList("fa4f3ee67d98b64102a8a3ec81a3bc81")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1); } @@ -271,7 +275,7 @@ public void testWithIndelAllelesPassedIn2() { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("a20799237accd52c1b8c2ac096309c8f")); + Arrays.asList("df90890e43d735573a3b3e4f289ca46b")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); } @@ -281,7 +285,7 @@ public void testWithIndelAllelesPassedIn3() { WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, - Arrays.asList("18ef8181157b4ac3eb8492f538467f92")); + Arrays.asList("cff6dd0f4eb1ef0b6fc476da6ffead19")); executeTest("test MultiSample Pilot2 indels with complicated records", spec3); } @@ -290,11 +294,11 @@ public void testWithIndelAllelesPassedIn4() { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - 
Arrays.asList("ad884e511a751b05e64db5314314365a")); - executeTest("test MultiSample 1000G Phase1 indels with complicated records emitting all sites", spec4); + Arrays.asList("1e2a4aab26e9ab0dae709d33a669e036")); + executeTest("test MultiSample Phase1 indels with complicated records", spec4); } - @Test + @Test(enabled = false) public void testSnpEffAnnotationRequestedWithoutRodBinding() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000 " + diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeAndMatchHaplotypesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeAndMatchHaplotypesIntegrationTest.java deleted file mode 100644 index cf6b4e5811..0000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeAndMatchHaplotypesIntegrationTest.java +++ /dev/null @@ -1,28 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class MergeAndMatchHaplotypesIntegrationTest extends WalkerTest { - private static String mergeAndMatchHaplotypesTestDataRoot = validationDataLocation + "/MergeAndMatchHaplotypes"; - private static String fundamentalTestPBTVCF = mergeAndMatchHaplotypesTestDataRoot + "/" + "FundamentalsTest.pbt.vcf"; - private static String fundamentalTestRBPVCF = mergeAndMatchHaplotypesTestDataRoot + "/" + "FundamentalsTest.pbt.rbp.vcf"; - - @Test - public void testBasicFunctionality() { - WalkerTestSpec spec = new WalkerTestSpec( - buildCommandLine( - "-T MergeAndMatchHaplotypes", - "-R " + b37KGReference, - "--pbt " + fundamentalTestPBTVCF, - "--rbp " + fundamentalTestRBPVCF, - "-o %s" - ), - 1, - Arrays.asList("") - ); - executeTest("testBasicMergeAndMatchHaplotypesFunctionality", spec); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java deleted file mode 100644 index 2e4556af07..0000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java +++ /dev/null @@ -1,51 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class MergeMNPsIntegrationTest extends WalkerTest { - - public static String baseTestString(String reference, String VCF, int maxDistMNP) { - return "-T MergeMNPs" + - " -R " + reference + - " --variant:vcf " + validationDataLocation + VCF + - " --maxGenomicDistanceForMNP " + maxDistMNP + - " -o %s" + - " -NO_HEADER"; - } - - - @Test - public void test1() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 1) - + " -L chr20:556259-756570", - 1, - Arrays.asList("7f11f7f75d1526077f0173c7ed1fc6c4")); - executeTest("Merge MNP sites within genomic distance of 1 [TEST ONE]", spec); - } - - @Test - public void test2() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 10) - + " -L chr20:556259-756570", - 1, - Arrays.asList("53dd312468296826bdd3c22387390c88")); - executeTest("Merge MNP sites within genomic distance of 10 [TEST TWO]", spec); - } - - @Test - public void test3() { - 
WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 100) - + " -L chr20:556259-756570", - 1, - Arrays.asList("e26f92d2fb9f4eaeac7f9d8ee27410ee")); - executeTest("Merge MNP sites within genomic distance of 100 [TEST THREE]", spec); - } - - -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java deleted file mode 100644 index db1e4a82fd..0000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java +++ /dev/null @@ -1,51 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest { - - public static String baseTestString(String reference, String VCF, int maxDist) { - return "-T MergeSegregatingAlternateAlleles" + - " -R " + reference + - " --variant:vcf " + validationDataLocation + VCF + - " --maxGenomicDistance " + maxDist + - " -o %s" + - " -NO_HEADER"; - } - - - @Test - public void test1() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 1) - + " -L chr20:556259-756570", - 1, - Arrays.asList("af5e1370822551c0c6f50f23447dc627")); - executeTest("Merge sites within genomic distance of 1 [TEST ONE]", spec); - } - - @Test - public void test2() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 10) - + " -L chr20:556259-756570", - 1, - Arrays.asList("dd8c44ae1ef059a7fe85399467e102eb")); - executeTest("Merge sites within genomic distance of 10 [TEST TWO]", spec); - } - - @Test - public void test3() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 100) - + " -L chr20:556259-756570", - 1, - Arrays.asList("f81fd72ecaa57b3215406fcea860bcc5")); - executeTest("Merge sites within genomic distance of 100 [TEST THREE]", spec); - } - - -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java index c663c1dd7c..2cd76e7a5c 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java @@ -6,23 +6,131 @@ import java.util.Arrays; public class PhaseByTransmissionIntegrationTest extends WalkerTest { - private static String phaseByTransmissionTestDataRoot = validationDataLocation + "/PhaseByTransmission"; - private static String fundamentalTestVCF = phaseByTransmissionTestDataRoot + "/" + "FundamentalsTest.unfiltered.vcf"; + private static String phaseByTransmissionTestDataRoot = validationDataLocation + "PhaseByTransmission/"; + private static String goodFamilyFile = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.goodFamilies.ped"; + private static String TNTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.TN.vcf"; + private static String TPTest = 
phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.TP.vcf"; + private static String FPTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.FP.vcf"; + private static String SpecialTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.Special.vcf"; + //Tests using PbT on all genotypes with default parameters + //And all reporting options @Test - public void testBasicFunctionality() { + public void testTrueNegativeMV() { WalkerTestSpec spec = new WalkerTestSpec( buildCommandLine( "-T PhaseByTransmission", "-NO_HEADER", "-R " + b37KGReference, - "--variant " + fundamentalTestVCF, - "-f NA12892+NA12891=NA12878", + "--variant " + TNTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("16fefda693156eadf1481fd9de23facb","9418a7a6405b78179ca13a67b8bfcc14") + ); + executeTest("testTrueNegativeMV", spec); + } + + @Test + public void testTruePositiveMV() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + TPTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("14cf1d21a54d8b9fb506df178b634c56","efc66ae3d036715b721f9bd35b65d556") + ); + executeTest("testTruePositiveMV", spec); + } + + @Test + public void testFalsePositiveMV() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + FPTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("f9b0fae9fe1e0f09b883a292b0e70a12","398724bc1e65314cc5ee92706e05a3ee") + ); + executeTest("testFalsePositiveMV", spec); + } + + @Test + public void testSpecialCases() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + SpecialTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("b8d1aa3789ce77b45430c62d13ee3006","a1a333e08fafb288cda0e7711909e1c3") + ); + executeTest("testSpecialCases", spec); + } + + //Test using a different prior + //Here the FP file is used but as the prior is lowered, 3 turn to TP + @Test + public void testPriorOption() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + FPTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-prior 1e-4", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("7201ce7cc47db5840ac6b647709f7c33","c11b5e7cd7459d90d0160f917eff3b1e") + ); + executeTest("testPriorOption", spec); + } + + //Test when running without MV reporting option + //This is the exact same test file as FP but should not generate a .mvf file + @Test + public void testMVFileOption() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + FPTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", "-o %s" ), 1, - Arrays.asList("") + Arrays.asList("398724bc1e65314cc5ee92706e05a3ee") ); - executeTest("testBasicFunctionality", spec); + executeTest("testMVFileOption", spec); } + } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/TestVariantContextWalker.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/TestVariantContextWalker.java deleted file mode 100755 index 
7607049dbc..0000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/TestVariantContextWalker.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; -import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; -import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.gatk.walkers.Window; -import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.io.PrintStream; -import java.util.Arrays; -import java.util.EnumSet; -import java.util.List; - -/** - * Test routine for new VariantContext object - */ -@Reference(window=@Window(start=-20,stop=1)) -public class TestVariantContextWalker extends RodWalker { - @Output - PrintStream out; - - @ArgumentCollection - protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - - @Argument(fullName="takeFirstOnly", doc="Only take the first second at a locus, as opposed to all", required=false) - boolean takeFirstOnly = false; - - @Argument(fullName="onlyContextsOfType", doc="Only take variant contexts of this type", required=false) - VariantContext.Type onlyOfThisType = null; - - @Argument(fullName="onlyContextsStartinAtCurrentPosition", doc="Only take variant contexts at actually start at the current position, excluding those at span to the current location but start earlier", required=false) - boolean onlyContextsStartinAtCurrentPosition = false; - - @Argument(fullName="printPerLocus", doc="If true, we'll print the variant contexts, in addition to counts", required=false) - boolean printContexts = false; - - @Argument(fullName="outputVCF", doc="If provided, we'll convert the first input 
context into a VCF", required=false) - VCFWriter writer = null; - - private boolean wroteHeader = false; - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( ref == null ) - return 0; - else { - EnumSet allowedTypes = onlyOfThisType == null ? null : EnumSet.of(onlyOfThisType); - - int n = 0; - List contexts; - if ( onlyContextsStartinAtCurrentPosition ) - contexts = tracker.getValues(variantCollection.variants, context.getLocation()); - else // ! onlyContextsStartinAtCurrentPosition - contexts = tracker.getValues(variantCollection.variants); - - for ( VariantContext vc : contexts ) { - if ( allowedTypes == null || allowedTypes.contains(vc.getType()) ) { - // we need to trigger decoding of the genotype string to pass integration tests - vc.getGenotypes(); - - if ( writer != null && n == 0 ) { - if ( ! wroteHeader ) { - writer.writeHeader(VariantContextAdaptors.createVCFHeader(null, vc)); - wroteHeader = true; - } - - writer.add(vc); - } - - n++; - if ( printContexts ) out.printf(" %s%n", vc); - if ( takeFirstOnly ) break; - } - } - - if ( n > 0 && printContexts ) { - out.printf("%s => had %d variant context objects%n", context.getLocation(), n); - out.printf("---------------------------------------------%n"); - } - - return n; - } - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer point, Integer sum) { - return point + sum; - } - - @Override - public void onTraversalDone(Integer result) { - // Double check traversal result to make count is the same. - // TODO: Is this check necessary? - out.println("[REDUCE RESULT] Traversal result is: " + result); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java index cbcd5835f9..d45e663b03 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java @@ -5,11 +5,11 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.HashMap; -import java.util.Map; +import java.io.File; import java.util.Arrays; +import java.util.HashMap; import java.util.List; -import java.io.File; +import java.util.Map; public class RecalibrationWalkersIntegrationTest extends WalkerTest { static HashMap paramsFiles = new HashMap(); @@ -31,10 +31,11 @@ public String toString() { @DataProvider(name = "cctestdata") public Object[][] createCCTestData() { - new CCTest( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "5a52b00d9794d27af723bcf93366681e" ); + + new CCTest( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "ab4940a16ab990181bd8368c76b23853" ); new CCTest( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "17d4b8001c982a70185e344929cf3941"); - new CCTest( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "714e65d6cb51ae32221a77ce84cbbcdc" ); - new CCTest( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "64e9f17a1cf6fc04c1f2717c2d2eca67" ); + new CCTest( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "36c0c467b6245c2c6c4e9c956443a154" ); + new CCTest( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "955a8fa2ddb2b04c406766ccd9ac45cc" 
); return CCTest.getTests(CCTest.class); } @@ -88,10 +89,11 @@ public String toString() { @DataProvider(name = "trtestdata") public Object[][] createTRTestData() { - new TRTest( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "2864f231fab7030377f3c8826796e48f" ); + new TRTest( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "0b7123ae9f4155484b68e4a4f96c5504" ); new TRTest( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "d04cf1f6df486e45226ebfbf93a188a5"); - new TRTest( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "74314e5562c1a65547bb0edaacffe602" ); - new TRTest( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "2a37c6001826bfabf87063b1dfcf594f" ); + new TRTest( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "b2f4757bc47cf23bd9a09f756c250787" ); + new TRTest( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "502c7df4d4923c4d078b014bf78bed34" ); + return TRTest.getTests(TRTest.class); } @@ -121,7 +123,7 @@ public void testTableRecalibrator1(TRTest test) { @Test public void testCountCovariatesUseOriginalQuals() { HashMap e = new HashMap(); - e.put( validationDataLocation + "originalQuals.1kg.chr1.1-1K.bam", "278846c55d97bd9812b758468a83f559"); + e.put( validationDataLocation + "originalQuals.1kg.chr1.1-1K.bam", "0b88d0e8c97e83bdeee2064b6730abff"); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -145,7 +147,7 @@ public void testCountCovariatesUseOriginalQuals() { @Test public void testTableRecalibratorMaxQ70() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "2864f231fab7030377f3c8826796e48f" ); + e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "0b7123ae9f4155484b68e4a4f96c5504" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -200,7 +202,7 @@ public void testCountCovariatesSolidIndelsRemoveRefBias() { @Test public void testTableRecalibratorSolidIndelsRemoveRefBias() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "2ad4c17ac3ed380071137e4e53a398a5" ); + e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "613fb2bbe01af8cbe6a188a10c1582ca" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -228,7 +230,7 @@ public void testTableRecalibratorSolidIndelsRemoveRefBias() { @Test public void testCountCovariatesBED() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "b460478d9683e827784e42bc352db8bb"); + e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "7e973328751d233653530245d404a64d"); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -252,7 +254,7 @@ public void testCountCovariatesBED() { @Test public void testCountCovariatesVCFPlusDBsnp() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "9131d96f39badbf9753653f55b148012"); + e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "fd9e37879069aa6d84436c25e472b9e9"); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -280,7 +282,7 @@ public void testCountCovariatesVCFPlusDBsnp() { @Test public void testCountCovariatesNoIndex() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", 
"8993d32df5cb66c7149f59eccbd57f4c" ); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "828d247c6e8ef5ebdf3603dc0ce79f61" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -306,7 +308,7 @@ public void testCountCovariatesNoIndex() { @Test public void testTableRecalibratorNoIndex() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "5f913c98ca99754902e9d34f99df468f" ); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "13c83656567cee9e93bda9874ee80234" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index cd2493dde5..3ef4e5e9f4 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -9,28 +9,28 @@ public class VariantEvalIntegrationTest extends WalkerTest { private static String variantEvalTestDataRoot = validationDataLocation + "VariantEval"; private static String fundamentalTestVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.snps_and_indels.vcf"; private static String fundamentalTestSNPsVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.vcf"; - private static String fundamentalTestSNPsOneSampleVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.HG00625.vcf"; + private static String fundamentalTestSNPsOneSampleVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.NA12045.vcf"; private static String cmdRoot = "-T VariantEval" + " -R " + b36KGReference; - @Test + @Test(enabled = false) public void testFunctionClassWithSnpeff() { WalkerTestSpec spec = new WalkerTestSpec( buildCommandLine( "-T VariantEval", "-R " + b37KGReference, "--dbsnp " + b37dbSNP132, - "--eval " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf", + "--eval " + validationDataLocation + "snpEff2.0.4.AFR.unfiltered.VariantAnnotator.output.vcf", "-noEV", "-EV TiTvVariantEvaluator", "-noST", "-ST FunctionalClass", - "-L " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf", + "-L " + validationDataLocation + "snpEff2.0.4.AFR.unfiltered.VariantAnnotator.output.vcf", "-o %s" ), 1, - Arrays.asList("d9dcb352c53106f54fcc981f15d35a90") + Arrays.asList("f909fd8374f663e983b9b3fda4cf5cf1") ); executeTest("testFunctionClassWithSnpeff", spec); } @@ -50,7 +50,7 @@ public void testStratifySamplesAndExcludeMonomorphicSites() { "-o %s" ), 1, - Arrays.asList("6a71b17c19f5914c277a99f45f5d9c39") + Arrays.asList("081fcaa532c7ba8f23da739389e6f7c3") ); executeTest("testStratifySamplesAndExcludeMonomorphicSites", spec); } @@ -70,7 +70,7 @@ public void testFundamentalsCountVariantsSNPsAndIndels() { "-o %s" ), 1, - Arrays.asList("1fefd6cf9c2554d5f886c3998defd4d0") + Arrays.asList("b3852f84d07c270b8a12874083c3e31b") ); executeTest("testFundamentalsCountVariantsSNPsandIndels", spec); } @@ -91,7 +91,7 @@ public void testFundamentalsCountVariantsSNPsAndIndelsWithNovelty() { "-o %s" ), 1, - Arrays.asList("d470e00a368b5a0468012818994c6a89") + Arrays.asList("cf70468b5ebaec408419da69b0a7fcb9") ); 
executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNovelty", spec); } @@ -113,7 +113,7 @@ public void testFundamentalsCountVariantsSNPsAndIndelsWithNoveltyAndFilter() { "-o %s" ), 1, - Arrays.asList("12856e52c2682328f91594089328596c") + Arrays.asList("5e3b8b85acfc41365c8208c23abf746b") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNoveltyAndFilter", spec); } @@ -134,7 +134,7 @@ public void testFundamentalsCountVariantsSNPsAndIndelsWithCpG() { "-o %s" ), 1, - Arrays.asList("91610b9240f64e0eb03cfd2602cf57af") + Arrays.asList("ccdbc50d30ece6d0d3b199c397f03ed3") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithCpG", spec); } @@ -155,7 +155,7 @@ public void testFundamentalsCountVariantsSNPsAndIndelsWithFunctionalClasses() { "-o %s" ), 1, - Arrays.asList("e40b77e7ed6581328e373a24b93cd170") + Arrays.asList("95c690d5af8ed51573eb2f0503dcd9c2") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithFunctionalClass", spec); } @@ -176,7 +176,7 @@ public void testFundamentalsCountVariantsSNPsAndIndelsWithDegeneracy() { "-o %s" ), 1, - Arrays.asList("15beaf3823c131cabc5fb0445239f978") + Arrays.asList("8e8547eb38b34bec0095b0500fd9641d") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithDegeneracy", spec); } @@ -197,7 +197,7 @@ public void testFundamentalsCountVariantsSNPsAndIndelsWithSample() { "-o %s" ), 1, - Arrays.asList("7ddd4ee74938d229ce5cb7b9b9104abe") + Arrays.asList("158a4651a656aea7f84c79548f6fe519") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithSample", spec); } @@ -220,7 +220,7 @@ public void testFundamentalsCountVariantsSNPsAndIndelsWithJexlExpression() { "-o %s" ), 1, - Arrays.asList("a90f33906a732ef5eb346e559c96ccc1") + Arrays.asList("76c8a0b28d2993644120f7afa5833ab2") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithJexlExpression", spec); } @@ -245,7 +245,7 @@ public void testFundamentalsCountVariantsSNPsAndIndelsWithMultipleJexlExpression "-o %s" ), 1, - Arrays.asList("2567f90d3d7354850c5a59730ecc6e4f") + Arrays.asList("34682193f458b93b39efac00b4fc6723") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithMultipleJexlExpressions", spec); } @@ -264,7 +264,7 @@ public void testFundamentalsCountVariantsNoCompRod() { "-o %s" ), 1, - Arrays.asList("fa091aa8967893389c51102fd9f0bebb") + Arrays.asList("52f6655f1532bcea24b402010d93ce73") ); executeTest("testFundamentalsCountVariantsNoCompRod", spec); } @@ -277,7 +277,7 @@ public void testSelect1() { " --eval " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" + " --comp:comp_genotypes,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.head.vcf"; WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -ST CpG -o %s", - 1, Arrays.asList("f70997b6a3e7fdc89d11e1d61a2463d4")); + 1, Arrays.asList("4f60acc8a4b21c4b4ccb51ad9071449c")); executeTestParallel("testSelect1", spec); } @@ -287,14 +287,25 @@ public void testVEGenotypeConcordance() { WalkerTestSpec spec = new WalkerTestSpec(cmdRoot + " -ST CpG --eval:VCF3 " + validationDataLocation + vcfFile + " --comp:VCF3 " + validationDataLocation + "GenotypeConcordanceComp.vcf -noEV -EV GenotypeConcordance -o %s", 1, - Arrays.asList("96f27163f16bb945f19c6623cd6db34e")); + Arrays.asList("9a56c20a7b9a554a7b530f2cb1dd776d")); executeTestParallel("testVEGenotypeConcordance" + vcfFile, spec); } + @Test + public void testVEMendelianViolationEvaluator() { + String vcfFile = "/MendelianViolationEval.vcf"; + String pedFile = 
"/MendelianViolationEval.ped"; + + WalkerTestSpec spec = new WalkerTestSpec("-T VariantEval -R "+b37KGReference+" --eval " + variantEvalTestDataRoot + vcfFile + " -ped "+ variantEvalTestDataRoot + pedFile +" -noEV -EV MendelianViolationEvaluator -L 1:10109-10315 -o %s -mvq 0 -noST", + 1, + Arrays.asList("66e72c887124f40933d32254b2dd44a3")); + executeTestParallel("testVEMendelianViolationEvaluator" + vcfFile, spec); + } + @Test public void testCompVsEvalAC() { String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -ST CpG -EV GenotypeConcordance --eval:evalYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.very.few.lines.vcf --comp:compYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.fake.genotypes.ac.test.vcf"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("407682de41dcf139ea635e9cda21b912")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("fa13eb59892892c07711c6ffe31bf870")); executeTestParallel("testCompVsEvalAC",spec); } @@ -312,7 +323,7 @@ public void testTranches() { @Test public void testCompOverlap() { String extraArgs = "-T VariantEval -R " + b37KGReference + " -L " + validationDataLocation + "VariantEval/pacbio.hg19.intervals --comp:comphapmap " + comparisonDataLocation + "Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf --eval " + validationDataLocation + "VariantEval/pacbio.ts.recalibrated.vcf -noEV -EV CompOverlap -sn NA12878 -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("009ecc8376a20dce81ff5299ef6bfecb")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("9002023b8aa8fc2c9aac58b8a79bca1e")); executeTestParallel("testCompOverlap",spec); } @@ -324,7 +335,7 @@ public void testEvalTrackWithoutGenotypes() { " --dbsnp " + b37dbSNP132 + " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("424c9d438b1faa59b2c29413ba32f37b")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("190e1a171132832bf92fbca56a9c40bb")); executeTestParallel("testEvalTrackWithoutGenotypes",spec); } @@ -336,7 +347,7 @@ public void testMultipleEvalTracksWithoutGenotypes() { " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " --eval:evalBC " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bc.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("18fa0b89ebfff51141975d7e4ce7a159")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("08586d443fdcf3b7f63b8f9e3a943c62")); executeTestParallel("testMultipleEvalTracksWithoutGenotypes",spec); } @@ -353,13 +364,13 @@ public void testMultipleCompTracks() { " -noST -noEV -ST Novelty -EV CompOverlap" + " -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("0b81d97f843ec4a1a4222d1f9949bfca")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("61052c19211e7eb61fbbb62db5e40b56")); executeTestParallel("testMultipleCompTracks",spec); } @Test - public void testPerSampleAndSubsettedSampleHaveSameResults() { - String md5 = "b0565ac61b2860248e4abd478a177b5e"; + public void testPerSampleAndSubsettedSampleHaveSameResults1() { + String md5 = "0edded1cd578db62fa296c99c34a909d"; WalkerTestSpec spec = new WalkerTestSpec( buildCommandLine( @@ -369,7 +380,7 @@ public void testPerSampleAndSubsettedSampleHaveSameResults() { "--eval " + 
fundamentalTestSNPsVCF, "-noEV", "-EV CompOverlap", - "-sn HG00625", + "-sn NA12045", "-noST", "-L " + fundamentalTestSNPsVCF, "-o %s" @@ -414,8 +425,29 @@ public void testAlleleCountStrat() { "-o %s" ), 1, - Arrays.asList("da65fc8f0d0eeaf0a0b06a07f444bb8e") + Arrays.asList("ee22604616b3e9fc48a6dcbbf73a056d") ); executeTest("testAlleleCountStrat", spec); } + + @Test + public void testIntervalStrat() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "-eval " + testDir + "/withSymbolic.b37.vcf", + "-noEV", + "-EV CountVariants", + "-noST", + "-stratIntervals " + testDir + "/overlapTest.bed", + "-ST IntervalStratification", + "-L 20", + "-o %s" + ), + 1, + Arrays.asList("240369cd651c77e05e8a6659f4a6237e") + ); + executeTest("testIntervalStrat", spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 6e994be3a0..042de2a27d 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -115,4 +115,39 @@ public void testUsingDbsnpName() { executeTest("testUsingDbsnpName--" + testFile, spec); } + + @Test + public void testMultipleRecordsAtOnePosition() { + String testFile = validationDataLocation + "selectVariants.onePosition.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -select 'KG_FREQ < 0.5' --variant " + testFile + " -o %s -NO_HEADER", + 1, + Arrays.asList("20b52c96f5c48258494d072752b53693") + ); + + executeTest("testMultipleRecordsAtOnePositionFirstIsFiltered--" + testFile, spec); + } + + @Test + public void testParallelization() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + WalkerTestSpec spec; + + spec = new WalkerTestSpec( + baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 2"), + 1, + Arrays.asList("d18516c1963802e92cb9e425c0b75fd6") + ); + executeTest("testParallelization (2 threads)--" + testfile, spec); + + spec = new WalkerTestSpec( + baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 4"), + 1, + Arrays.asList("d18516c1963802e92cb9e425c0b75fd6") + ); + + executeTest("testParallelization (4 threads)--" + testfile, spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java index 00044f8593..16b6c97d0e 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java @@ -98,7 +98,7 @@ public void testVCFStreamingChain() throws IOException { " -EV CompOverlap -noEV -noST" + " -o %s", 1, - Arrays.asList("d46a735ffa898f4aa6b3758c5b03f06d") + Arrays.asList("addf5f4596ddacef40808f6d3d281111") ); executeTest("testVCFStreamingChain", selectTestSpec); diff --git a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionIntegrationTest.java 
b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionIntegrationTest.java index 48f4c37778..f68a96d265 100644 --- a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionIntegrationTest.java @@ -62,7 +62,7 @@ public void testSubmitEcho() throws Exception { return; } - File outFile = createNetworkTempFile("JnaSessionIntegrationTest-", ".out"); + File outFile = createNetworkTempFile("JnaSessionIntegrationTest.out"); Session session = factory.getSession(); session.init(null); try { diff --git a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java index d98281ad3e..4c7d4ce061 100644 --- a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java @@ -86,7 +86,7 @@ public void testDrmaa() throws Exception { @Test(dependsOnMethods = { "testDrmaa" }) public void testSubmitEcho() throws Exception { - if (implementation.indexOf("LSF") >= 0) { + if (implementation.contains("LSF")) { System.err.println(" *********************************************************"); System.err.println(" ***********************************************************"); System.err.println(" **** ****"); @@ -101,7 +101,7 @@ public void testSubmitEcho() throws Exception { Memory error = new Memory(LibDrmaa.DRMAA_ERROR_STRING_BUFFER); int errnum; - File outFile = createNetworkTempFile("LibDrmaaIntegrationTest-", ".out"); + File outFile = createNetworkTempFile("LibDrmaaIntegrationTest.out"); errnum = LibDrmaa.drmaa_init(null, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); diff --git a/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java b/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java index b4fb5cfa34..21339eb46d 100644 --- a/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java @@ -93,7 +93,7 @@ public void testReadQueueLimits() { @Test public void testSubmitEcho() throws Exception { String queue = "hour"; - File outFile = createNetworkTempFile("LibBatIntegrationTest-", ".out"); + File outFile = createNetworkTempFile("LibBatIntegrationTest.out"); submit req = new submit(); diff --git a/public/java/test/org/broadinstitute/sting/pipeline/PipelineUnitTest.java b/public/java/test/org/broadinstitute/sting/pipeline/PipelineUnitTest.java deleted file mode 100644 index 8913566704..0000000000 --- a/public/java/test/org/broadinstitute/sting/pipeline/PipelineUnitTest.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.pipeline; - -import org.broadinstitute.sting.pipeline.Pipeline; -import org.broadinstitute.sting.pipeline.PipelineSample; -import org.testng.Assert; -import org.broadinstitute.sting.utils.yaml.YamlUtils; - -import org.testng.annotations.Test; - -import java.io.File; -import java.util.Map; - -public class PipelineUnitTest { - @Test - public void testDumpAndLoad() throws Exception { - Pipeline pipeline = new Pipeline(); - - pipeline.getProject().setName("PRJ_NAME"); - pipeline.getProject().setReferenceFile(new File("my.fasta")); - pipeline.getProject().setGenotypeDbsnp(new File("my.vcf")); - pipeline.getProject().setEvalDbsnp(new File("my.dbsnp")); - pipeline.getProject().getTags().put("testProjectTag", "project value here"); - - PipelineSample sample = new PipelineSample(); - sample.setId("SMP_ID"); - sample.getBamFiles().put("recalibrated", new File("recalibrated.bam")); - sample.getBamFiles().put("cleaned", new File("/absolute/path/to/cleaned.bam")); - sample.getTags().put("testSampleTag", "sample value here"); - - pipeline.getSamples().add(sample); - - File file = File.createTempFile("testDumpAndLoad", ".yaml"); - YamlUtils.dump(pipeline, file); - Pipeline pipelineLoad = YamlUtils.load(Pipeline.class, file); - - Assert.assertEquals(pipelineLoad.getProject().getName(), pipeline.getProject().getName()); - Assert.assertEquals(pipeline.getProject().getReferenceFile(), pipelineLoad.getProject().getReferenceFile()); - Assert.assertEquals(pipeline.getProject().getIntervalList(), pipelineLoad.getProject().getIntervalList()); - Assert.assertEquals(pipeline.getProject().getGenotypeDbsnp(), pipelineLoad.getProject().getGenotypeDbsnp()); - Assert.assertEquals(pipeline.getProject().getGenotypeDbsnpType(), pipelineLoad.getProject().getGenotypeDbsnpType()); - Assert.assertEquals(pipeline.getProject().getEvalDbsnp(), pipelineLoad.getProject().getEvalDbsnp()); - Assert.assertEquals(pipeline.getProject().getEvalDbsnpType(), pipelineLoad.getProject().getEvalDbsnpType()); - - Assert.assertEquals(pipelineLoad.getProject().getTags().size(), pipeline.getProject().getTags().size()); - for (Map.Entry entry : pipeline.getProject().getTags().entrySet()) - Assert.assertEquals(pipeline.getProject().getTags().get(entry.getKey()), entry.getValue()); - - Assert.assertEquals(pipelineLoad.getSamples().size(), pipeline.getSamples().size()); - for (int i = 0; i < pipeline.getSamples().size(); i++) { - PipelineSample pipelineSample = pipeline.getSamples().get(i); - PipelineSample pipelineLoadSample = pipelineLoad.getSamples().get(i); - - Assert.assertEquals(pipelineLoadSample.getId(), pipelineSample.getId()); - - Assert.assertEquals(pipelineLoadSample.getBamFiles().size(), pipelineSample.getBamFiles().size()); - for (Map.Entry entry : pipelineSample.getBamFiles().entrySet()) - Assert.assertEquals(entry.getValue(), pipelineSample.getBamFiles().get(entry.getKey())); - - Assert.assertEquals(pipelineLoadSample.getTags().size(), pipelineSample.getTags().size()); - for (Map.Entry entry : 
pipelineSample.getTags().entrySet()) - Assert.assertEquals(pipelineSample.getTags().get(entry.getKey()), entry.getValue()); - } - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java index f1f849bf5f..e9f138a0ed 100644 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java @@ -2,7 +2,6 @@ import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -11,6 +10,7 @@ import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertTrue; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; /** @@ -36,7 +36,6 @@ public void testGetContigIndex() { @Test public void testGetContigIndexValid() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); assertEquals(genomeLocParser.getContigIndex("chr1"), 0); // should be in the reference } @@ -67,7 +66,6 @@ public void testHasContigInfoKnownContig() { @Test public void testGetContigInfoKnownContig() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); assertEquals(0, "chr1".compareTo(genomeLocParser.getContigInfo("chr1").getSequenceName())); // should be in the reference } @@ -191,4 +189,104 @@ public void testValidationOfGenomeLocs() { assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,-2)); // bad stop assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",10,11)); // bad start, past end } + + private static class FlankingGenomeLocTestData extends TestDataProvider { + final GenomeLocParser parser; + final int basePairs; + final GenomeLoc original, flankStart, flankStop; + + private FlankingGenomeLocTestData(String name, GenomeLocParser parser, int basePairs, String original, String flankStart, String flankStop) { + super(FlankingGenomeLocTestData.class, name); + this.parser = parser; + this.basePairs = basePairs; + this.original = parse(parser, original); + this.flankStart = flankStart == null ? null : parse(parser, flankStart); + this.flankStop = flankStop == null ? null : parse(parser, flankStop); + } + + private static GenomeLoc parse(GenomeLocParser parser, String str) { + return "unmapped".equals(str) ? 
GenomeLoc.UNMAPPED : parser.parseGenomeLoc(str); + } + } + + @DataProvider(name = "flankingGenomeLocs") + public Object[][] getFlankingGenomeLocs() { + int contigLength = 10000; + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, contigLength); + GenomeLocParser parser = new GenomeLocParser(header.getSequenceDictionary()); + + new FlankingGenomeLocTestData("atStartBase1", parser, 1, + "chr1:1", null, "chr1:2"); + + new FlankingGenomeLocTestData("atStartBase50", parser, 50, + "chr1:1", null, "chr1:2-51"); + + new FlankingGenomeLocTestData("atStartRange50", parser, 50, + "chr1:1-10", null, "chr1:11-60"); + + new FlankingGenomeLocTestData("atEndBase1", parser, 1, + "chr1:" + contigLength, "chr1:" + (contigLength - 1), null); + + new FlankingGenomeLocTestData("atEndBase50", parser, 50, + "chr1:" + contigLength, String.format("chr1:%d-%d", contigLength - 50, contigLength - 1), null); + + new FlankingGenomeLocTestData("atEndRange50", parser, 50, + String.format("chr1:%d-%d", contigLength - 10, contigLength), + String.format("chr1:%d-%d", contigLength - 60, contigLength - 11), + null); + + new FlankingGenomeLocTestData("nearStartBase1", parser, 1, + "chr1:2", "chr1:1", "chr1:3"); + + new FlankingGenomeLocTestData("nearStartRange50", parser, 50, + "chr1:21-30", "chr1:1-20", "chr1:31-80"); + + new FlankingGenomeLocTestData("nearEndBase1", parser, 1, + "chr1:" + (contigLength - 1), "chr1:" + (contigLength - 2), "chr1:" + contigLength); + + new FlankingGenomeLocTestData("nearEndRange50", parser, 50, + String.format("chr1:%d-%d", contigLength - 30, contigLength - 21), + String.format("chr1:%d-%d", contigLength - 80, contigLength - 31), + String.format("chr1:%d-%d", contigLength - 20, contigLength)); + + new FlankingGenomeLocTestData("beyondStartBase1", parser, 1, + "chr1:3", "chr1:2", "chr1:4"); + + new FlankingGenomeLocTestData("beyondStartRange50", parser, 50, + "chr1:101-200", "chr1:51-100", "chr1:201-250"); + + new FlankingGenomeLocTestData("beyondEndBase1", parser, 1, + "chr1:" + (contigLength - 3), + "chr1:" + (contigLength - 4), + "chr1:" + (contigLength - 2)); + + new FlankingGenomeLocTestData("beyondEndRange50", parser, 50, + String.format("chr1:%d-%d", contigLength - 200, contigLength - 101), + String.format("chr1:%d-%d", contigLength - 250, contigLength - 201), + String.format("chr1:%d-%d", contigLength - 100, contigLength - 51)); + + new FlankingGenomeLocTestData("unmapped", parser, 50, + "unmapped", null, null); + + new FlankingGenomeLocTestData("fullContig", parser, 50, + "chr1", null, null); + + return FlankingGenomeLocTestData.getTests(FlankingGenomeLocTestData.class); + } + + @Test(dataProvider = "flankingGenomeLocs") + public void testCreateGenomeLocAtStart(FlankingGenomeLocTestData data) { + GenomeLoc actual = data.parser.createGenomeLocAtStart(data.original, data.basePairs); + String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", + data.toString(), data.original, actual, data.flankStart); + assertEquals(actual, data.flankStart, description); + } + + @Test(dataProvider = "flankingGenomeLocs") + public void testCreateGenomeLocAtStop(FlankingGenomeLocTestData data) { + GenomeLoc actual = data.parser.createGenomeLocAtStop(data.original, data.basePairs); + String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", + data.toString(), data.original, actual, data.flankStop); + assertEquals(actual, data.flankStop, description); + } } diff --git 
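Reading the data provider above, the intended semantics of createGenomeLocAtStart appear to be: take the basePairs bases immediately upstream of the original locus, clamp at the contig start, and return null when nothing lies upstream (createGenomeLocAtStop mirrors this at the contig end, and unmapped or whole-contig inputs yield null). A small sketch of that arithmetic under those assumed semantics — upstreamFlank is a hypothetical stand-in, not the GenomeLocParser API:

// Returns {start, stop} of the upstream flank in 1-based inclusive
// coordinates, or null when the locus already begins at base 1.
static int[] upstreamFlank(int originalStart, int basePairs) {
    if (originalStart <= 1)
        return null;                                   // nothing upstream to take
    int flankStart = Math.max(1, originalStart - basePairs);
    return new int[]{flankStart, originalStart - 1};
}
// upstreamFlank(21, 50)  -> {1, 20}   matches "nearStartRange50" (chr1:21-30 -> chr1:1-20)
// upstreamFlank(101, 50) -> {51, 100} matches "beyondStartRange50" (chr1:101-200 -> chr1:51-100)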
a/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java index 29c085b700..49778a4d8c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java @@ -9,6 +9,7 @@ import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; @@ -150,4 +151,64 @@ public void testUnmappedMerge() { Assert.assertEquals(twoUnmappedMixed.size(),2,"Wrong number of elements in list."); Assert.assertEquals(twoUnmappedMixed,Arrays.asList(chr1,unmapped),"List sorted in wrong order"); } + + // ------------------------------------------------------------------------------------- + // + // testing overlap detection + // + // ------------------------------------------------------------------------------------- + + private class ReciprocalOverlapProvider extends TestDataProvider { + GenomeLoc gl1, gl2; + int overlapSize; + double overlapFraction; + + private ReciprocalOverlapProvider(int start1, int stop1, int start2, int stop2) { + super(ReciprocalOverlapProvider.class); + gl1 = genomeLocParser.createGenomeLoc("chr1", start1, stop1); + gl2 = genomeLocParser.createGenomeLoc("chr1", start2, stop2); + + int shared = 0; + for ( int i = start1; i <= stop1; i++ ) { + if ( i >= start2 && i <= stop2 ) + shared++; + } + + this.overlapSize = shared; + this.overlapFraction = Math.min((1.0*shared)/gl1.size(), (1.0*shared)/gl2.size()); + super.setName(String.format("%d-%d / %d-%d overlap=%d / %.2f", start1, stop1, start2, stop2, overlapSize, overlapFraction)); + } + } + + @DataProvider(name = "ReciprocalOverlapProvider") + public Object[][] makeReciprocalOverlapProvider() { + for ( int start1 = 1; start1 <= 10; start1++ ) { + for ( int stop1 = start1; stop1 <= 10; stop1++ ) { + new ReciprocalOverlapProvider(start1, stop1, 1, 10); + new ReciprocalOverlapProvider(start1, stop1, 5, 10); + new ReciprocalOverlapProvider(start1, stop1, 5, 7); + new ReciprocalOverlapProvider(start1, stop1, 5, 15); + new ReciprocalOverlapProvider(start1, stop1, 11, 20); + + new ReciprocalOverlapProvider(1, 10, start1, stop1); + new ReciprocalOverlapProvider(5, 10, start1, stop1); + new ReciprocalOverlapProvider(5, 7, start1, stop1); + new ReciprocalOverlapProvider(5, 15, start1, stop1); + new ReciprocalOverlapProvider(11, 20, start1, stop1); + } + } + + return ReciprocalOverlapProvider.getTests(ReciprocalOverlapProvider.class); + } + + @Test(dataProvider = "ReciprocalOverlapProvider") + public void testReciprocalOverlapProvider(ReciprocalOverlapProvider cfg) { + if ( cfg.overlapSize == 0 ) { + Assert.assertFalse(cfg.gl1.overlapsP(cfg.gl2)); + } else { + Assert.assertTrue(cfg.gl1.overlapsP(cfg.gl2)); + Assert.assertEquals(cfg.gl1.intersect(cfg.gl2).size(), cfg.overlapSize); + Assert.assertEquals(cfg.gl1.reciprocialOverlapFraction(cfg.gl2), cfg.overlapFraction); + } + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java deleted file mode 100755 index bc39d714e6..0000000000 --- a/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java +++ /dev/null @@ -1,71 +0,0 @@ -package 
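Before the deleted ReadUtilsUnitTest continues below, one more aside: the brute-force base-counting loop in ReciprocalOverlapProvider has a closed form that makes the expected fractions easy to verify by hand. A sketch assuming 1-based, inclusive intervals on the same contig; reciprocalOverlap is an illustrative helper, not a GenomeLoc method:

static double reciprocalOverlap(int start1, int stop1, int start2, int stop2) {
    // shared bases of two closed intervals; 0 when they are disjoint
    int shared = Math.max(0, Math.min(stop1, stop2) - Math.max(start1, start2) + 1);
    double frac1 = shared / (double) (stop1 - start1 + 1);
    double frac2 = shared / (double) (stop2 - start2 + 1);
    return Math.min(frac1, frac2);                 // the "reciprocal" part
}
// reciprocalOverlap(5, 15, 1, 10) == min(6/11.0, 6/10.0) ≈ 0.545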
org.broadinstitute.sting.utils; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeTest; -import org.testng.annotations.Test; - - -public class ReadUtilsUnitTest extends BaseTest { - GATKSAMRecord read, reducedRead; - final static String BASES = "ACTG"; - final static String QUALS = "!+5?"; - final private static byte[] REDUCED_READ_COUNTS = new byte[]{10, 20, 30, 40}; - - @BeforeTest - public void init() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1,1,1000); - read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length()); - read.setReadUnmappedFlag(true); - read.setReadBases(new String(BASES).getBytes()); - read.setBaseQualityString(new String(QUALS)); - - reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length()); - reducedRead.setReadBases(BASES.getBytes()); - reducedRead.setBaseQualityString(QUALS); - reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_QUALITY_TAG, REDUCED_READ_COUNTS); - } - - private void testReadBasesAndQuals(GATKSAMRecord read, int expectedStart, int expectedStop) { - SAMRecord clipped = ReadUtils.hardClipBases(read, expectedStart, expectedStop - 1, null); - String expectedBases = BASES.substring(expectedStart, expectedStop); - String expectedQuals = QUALS.substring(expectedStart, expectedStop); - Assert.assertEquals(clipped.getReadBases(), expectedBases.getBytes(), "Clipped bases not those expected"); - Assert.assertEquals(clipped.getBaseQualityString(), expectedQuals, "Clipped quals not those expected"); - } - - @Test public void testNoClip() { testReadBasesAndQuals(read, 0, 4); } - @Test public void testClip1Front() { testReadBasesAndQuals(read, 1, 4); } - @Test public void testClip2Front() { testReadBasesAndQuals(read, 2, 4); } - @Test public void testClip1Back() { testReadBasesAndQuals(read, 0, 3); } - @Test public void testClip2Back() { testReadBasesAndQuals(read, 0, 2); } - - @Test - public void testReducedReads() { - Assert.assertFalse(read.isReducedRead(), "isReducedRead is false for normal read"); - Assert.assertEquals(read.getReducedReadCounts(), null, "No reduced read tag in normal read"); - - Assert.assertTrue(reducedRead.isReducedRead(), "isReducedRead is true for reduced read"); - for ( int i = 0; i < reducedRead.getReadLength(); i++) { - Assert.assertEquals(reducedRead.getReducedCount(i), REDUCED_READ_COUNTS[i], "Reduced read count not set to the expected value at " + i); - } - } - - @Test - public void testReducedReadPileupElement() { - PileupElement readp = new PileupElement(read,0); - PileupElement reducedreadp = new PileupElement(reducedRead,0); - - Assert.assertFalse(readp.isReducedRead()); - - Assert.assertTrue(reducedreadp.isReducedRead()); - Assert.assertEquals(reducedreadp.getRepresentativeCount(), REDUCED_READ_COUNTS[0]); - Assert.assertEquals(reducedreadp.getQual(), readp.getQual()); - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java index 3f5d05e66f..7a2696b7b3 100755 --- a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java +++ 
b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java @@ -41,11 +41,6 @@ public void testSimpleTimer() { double t6 = t.getElapsedTime(); Assert.assertTrue(t5 >= t4, "Restarted timer elapsed time should be after elapsed time preceding the restart"); Assert.assertTrue(t6 >= t5, "Second elapsed time not after the first in restarted timer"); - - t.stop().start(); - Assert.assertTrue(t.isRunning(), "second started timer isn't running"); - Assert.assertTrue(t.getElapsedTime() >= 0.0, "elapsed time should have been reset"); - Assert.assertTrue(t.getElapsedTime() < t6, "elapsed time isn't less than time before start call"); // we should have effective no elapsed time } private final static void idleLoop() { diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java new file mode 100644 index 0000000000..18108e0a10 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java @@ -0,0 +1,193 @@ +package org.broadinstitute.sting.utils.clipping; + +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; + +import java.util.LinkedList; +import java.util.List; +import java.util.Stack; + +/** + * Created by IntelliJ IDEA. + * User: roger + * Date: 11/27/11 + * Time: 6:45 AM + * To change this template use File | Settings | File Templates. + */ +public class ReadClipperTestUtils { + //Should contain all the utils needed for tests to mass produce + //reads, cigars, and other needed classes + + final static byte [] BASES = {'A', 'C', 'T', 'G'}; + final static byte [] QUALS = {2, 15, 25, 30}; + final static String CIGAR = "4M"; + final static CigarElement[] cigarElements = { new CigarElement(1, CigarOperator.HARD_CLIP), + new CigarElement(1, CigarOperator.SOFT_CLIP), + new CigarElement(1, CigarOperator.INSERTION), + new CigarElement(1, CigarOperator.DELETION), + new CigarElement(1, CigarOperator.MATCH_OR_MISMATCH)}; + + + public static GATKSAMRecord makeReadFromCigar(Cigar cigar) { + return ArtificialSAMUtils.createArtificialRead(Utils.arrayFromArrayWithLength(BASES, cigar.getReadLength()), Utils.arrayFromArrayWithLength(QUALS, cigar.getReadLength()), cigar.toString()); + } + + /** + * This function generates every valid combination of cigar elements up to the given maximum length. + * + * A valid cigar object obeys the following rules: + * - No Hard/Soft clips in the middle of the read + * - No deletions in the beginning / end of the read + * - No repeated adjacent element (e.g. 1M2M -> this should be 3M) + * + * @param maximumLength the maximum number of elements in the cigar + * @return a list with all valid Cigar objects + */ + public static List generateCigarList(int maximumLength) { + int numCigarElements = cigarElements.length; + LinkedList cigarList = new LinkedList(); + byte [] cigarCombination = new byte[maximumLength]; + + Utils.fillArrayWithByte(cigarCombination, (byte) 0); // we start off with all 0's in the combination array.
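+ // The combination array acts as a little-endian odometer in base cigarElements.length: each pass of the loop below emits the current combination, then advances the lowest digit that has not yet reached the last operator and resets every digit beneath it, so for maximumLength = 2 the visit order is [0,0], [1,0] ... [4,0], [0,1] ... [4,4], covering all 5^maximumLength raw combinations. + // Each raw combination is then merged and filtered: e.g. [2,2] produces 1I1I, which combineAdjacentCigarElements collapses to 2I and isCigarValid then rejects, since a read may not start or end in an insertion.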
+ int currentIndex = 0; + while (true) { + Cigar cigar = createCigarFromCombination(cigarCombination); // create the cigar + cigar = combineAdjacentCigarElements(cigar); // combine adjacent elements + if (isCigarValid(cigar)) { // check if it's valid + cigarList.add(cigar); // add it + } + + boolean currentIndexChanged = false; + while (currentIndex < maximumLength && cigarCombination[currentIndex] == numCigarElements - 1) { + currentIndex++; // find the next index to increment + currentIndexChanged = true; // keep track of the fact that we have changed indices! + } + + if (currentIndex == maximumLength) // if we hit the end of the array, we're done. + break; + + cigarCombination[currentIndex]++; // otherwise advance the current index + + if (currentIndexChanged) { // if we have changed index, then... + for (int i = 0; i < currentIndex; i++) + cigarCombination[i] = 0; // reset everything from 0->currentIndex + currentIndex = 0; // go back to the first index + } + } + + return cigarList; + } + + private static boolean isCigarValid(Cigar cigar) { + if (cigar.isValid(null, -1) == null) { // This should take care of most invalid Cigar Strings (picard's "exhaustive" implementation) + + Stack cigarElementStack = new Stack(); // Stack to invert cigar string to find ending operator + CigarOperator startingOp = null; + CigarOperator endingOp = null; + + // check if it doesn't start with deletions + boolean readHasStarted = false; // search the list of elements for the starting operator + for (CigarElement cigarElement : cigar.getCigarElements()) { + if (!readHasStarted) { + if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) { + readHasStarted = true; + startingOp = cigarElement.getOperator(); + } + } + cigarElementStack.push(cigarElement); + } + + readHasStarted = false; // search the inverted list of elements (stack) for the stopping operator + while (!cigarElementStack.empty()) { + CigarElement cigarElement = cigarElementStack.pop(); + if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) { + readHasStarted = true; + endingOp = cigarElement.getOperator(); + break; + } + } + + if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION && startingOp != CigarOperator.INSERTION && endingOp != CigarOperator.INSERTION) + return true; // we don't accept reads starting or ending in deletions (add any other constraint here) + } + + return false; + } + + private static Cigar createCigarFromCombination(byte[] cigarCombination) { + Cigar cigar = new Cigar(); + for (byte i : cigarCombination) { + cigar.add(cigarElements[i]); + } + return cigar; + } + + + /** + * Combines equal adjacent elements of a Cigar object + * + * @param rawCigar the cigar object + * @return a combined cigar object + */ + private static Cigar combineAdjacentCigarElements(Cigar rawCigar) { + Cigar combinedCigar = new Cigar(); + CigarElement lastElement = null; + int lastElementLength = 0; + for (CigarElement cigarElement : rawCigar.getCigarElements()) { + if (lastElement != null && lastElement.getOperator() == cigarElement.getOperator()) + lastElementLength += cigarElement.getLength(); + else + { + if (lastElement != null) + combinedCigar.add(new CigarElement(lastElementLength, lastElement.getOperator())); + + lastElement = cigarElement; + lastElementLength = cigarElement.getLength(); + } + } + if (lastElement != null) + combinedCigar.add(new CigarElement(lastElementLength, 
lastElement.getOperator())); + + return combinedCigar; + } + + public static GATKSAMRecord makeRead() { + return ArtificialSAMUtils.createArtificialRead(BASES, QUALS, CIGAR); + } + + /** + * Asserts that the two reads have the same bases, qualities and cigar strings + * + * @param actual the calculated read + * @param expected the expected read + */ + public static void assertEqualReads(GATKSAMRecord actual, GATKSAMRecord expected) { + // If they're both not empty, test their contents + if(!actual.isEmpty() && !expected.isEmpty()) { + Assert.assertEquals(actual.getReadBases(), expected.getReadBases()); + Assert.assertEquals(actual.getBaseQualities(), expected.getBaseQualities()); + Assert.assertEquals(actual.getCigarString(), expected.getCigarString()); + } + // Otherwise test if they're both empty + else + Assert.assertEquals(actual.isEmpty(), expected.isEmpty()); + } + + public static Cigar invertCigar (Cigar cigar) { + Stack cigarStack = new Stack(); + for (CigarElement cigarElement : cigar.getCigarElements()) + cigarStack.push(cigarElement); + + Cigar invertedCigar = new Cigar(); + while (!cigarStack.isEmpty()) + invertedCigar.add(cigarStack.pop()); + + return invertedCigar; + } + +} diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java new file mode 100644 index 0000000000..4dad68dc55 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java @@ -0,0 +1,344 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.clipping; + +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.util.List; + +/** + * User: roger + * Date: 9/28/11 + */ +public class ReadClipperUnitTest extends BaseTest { + + List cigarList; + int maximumCigarSize = 6; // 6 is the minimum necessary number to try all combinations of cigar types with guarantee of clipping an element with length = 2 + + @BeforeClass + public void init() { + cigarList = ReadClipperTestUtils.generateCigarList(maximumCigarSize); + } + + @Test(enabled = true) + public void testHardClipBothEndsByReferenceCoordinates() { + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + int alnStart = read.getAlignmentStart(); + int alnEnd = read.getAlignmentEnd(); + int readLength = alnEnd - alnStart; + for (int i=0; i<readLength/2; i++) { + GATKSAMRecord clippedRead = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, alnStart + i, alnEnd - i); + Assert.assertTrue(clippedRead.getAlignmentStart() >= alnStart + i, String.format("Clipped alignment start is less than original read (minus %d): %s -> %s", i, read.getCigarString(), clippedRead.getCigarString())); + Assert.assertTrue(clippedRead.getAlignmentEnd() <= alnEnd + i, String.format("Clipped alignment end is greater than original read (minus %d): %s -> %s", i, read.getCigarString(), clippedRead.getCigarString())); + } + } + } + + @Test(enabled = true) + public void testHardClipByReadCoordinates() { + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + int readLength = read.getReadLength(); + for (int i=0; i<readLength; i++) { + GATKSAMRecord clipLeft = ReadClipper.hardClipByReadCoordinates(read, 0, i); + Assert.assertTrue(clipLeft.getReadLength() <= readLength - i - 1, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipLeft.getCigarString())); + + GATKSAMRecord clipRight = ReadClipper.hardClipByReadCoordinates(read, i, readLength-1); + Assert.assertTrue(clipRight.getReadLength() <= i, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipRight.getCigarString())); + } + } + } + + @Test(enabled = true) + public void testHardClipByReferenceCoordinates() { + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + int alnStart = read.getAlignmentStart(); + int alnEnd = read.getAlignmentEnd(); + for (int i=alnStart; i<=alnEnd; i++) { + if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side + GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipByReferenceCoordinates(alnStart, i); + if (!clipLeft.isEmpty()) + Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less than the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); + } + + if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side + GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReferenceCoordinates(i, alnEnd); + if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) // alnStart > alnEnd if the entire read is a soft clip now. We can't test those.
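+ // The assertion below encodes the clipping invariant: hard clipping the right tail at reference coordinate i must leave no aligned base at or beyond i, so the new alignment end can be at most i - 1 (symmetrically, the left-tail clip above must move the alignment start to at least i + 1).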
+ Assert.assertTrue(clipRight.getAlignmentEnd() <= i - 1, String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString())); + } + } + } + } + + @Test(enabled = true) + public void testHardClipByReferenceCoordinatesLeftTail() { + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + int alnStart = read.getAlignmentStart(); + int alnEnd = read.getAlignmentEnd(); + if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side + for (int i=alnStart; i<=alnEnd; i++) { + GATKSAMRecord clipLeft = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, i); + if (!clipLeft.isEmpty()) + Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less than the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); + } + } + } + } + + @Test(enabled = true) + public void testHardClipByReferenceCoordinatesRightTail() { + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + int alnStart = read.getAlignmentStart(); + int alnEnd = read.getAlignmentEnd(); + if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side + for (int i=alnStart; i<=alnEnd; i++) { + GATKSAMRecord clipRight = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, i); + if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. + Assert.assertTrue(clipRight.getAlignmentEnd() <= i - 1, String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString())); + } + } + } + } + + @Test(enabled = true) + public void testHardClipLowQualEnds() { + final byte LOW_QUAL = 2; + final byte HIGH_QUAL = 30; + + // create a read for every cigar permutation + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + int readLength = read.getReadLength(); + byte [] quals = new byte[readLength]; + + for (int nLowQualBases = 0; nLowQualBases < readLength; nLowQualBases++) { + + // create a read with nLowQualBases in the left tail + Utils.fillArrayWithByte(quals, HIGH_QUAL); + for (int addLeft = 0; addLeft < nLowQualBases; addLeft++) + quals[addLeft] = LOW_QUAL; + read.setBaseQualities(quals); + GATKSAMRecord clipLeft = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL); + + // Tests + + // Make sure the low qualities are gone + assertNoLowQualBases(clipLeft, LOW_QUAL); + + // Can't run this test with the current contract of no hanging insertions +// Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipLeft.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipLeft.getCigarString())); + + // create a read with nLowQualBases in the right tail + Utils.fillArrayWithByte(quals, HIGH_QUAL); + for (int addRight = 0; addRight < nLowQualBases; addRight++) + quals[readLength - addRight - 1] = LOW_QUAL; + read.setBaseQualities(quals); + GATKSAMRecord clipRight = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL); + + // Tests + + // Make sure
the low qualities are gone + assertNoLowQualBases(clipRight, LOW_QUAL); + + // Make sure we haven't clipped any high quals -- Can't run this test with the current contract of no hanging insertions + //Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipRight.getCigarString())); + + // create a read with nLowQualBases in both tails + if (nLowQualBases <= readLength/2) { + Utils.fillArrayWithByte(quals, HIGH_QUAL); + for (int addBoth = 0; addBoth < nLowQualBases; addBoth++) { + quals[addBoth] = LOW_QUAL; + quals[readLength - addBoth - 1] = LOW_QUAL; + } + read.setBaseQualities(quals); + GATKSAMRecord clipBoth = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL); + + // Tests + + // Make sure the low qualities are gone + assertNoLowQualBases(clipBoth, LOW_QUAL); + + // Can't run this test with the current contract of no hanging insertions + //Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - (2*nLowQualBases), read.getCigarString(), clipBoth.getCigarString())); + } + } +// logger.warn(String.format("Testing %s for all combinations of low/high qual... PASSED", read.getCigarString())); + } + + // ONE OFF Testing clipping that ends inside an insertion (Ryan's bug) + final byte[] BASES = {'A','C','G','T','A','C','G','T'}; + final byte[] QUALS = {2, 2, 2, 2, 20, 20, 20, 2}; + final String CIGAR = "1S1M5I1S"; + + final byte[] CLIPPED_BASES = {}; + final byte[] CLIPPED_QUALS = {}; + final String CLIPPED_CIGAR = ""; + + + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(BASES, QUALS, CIGAR); + GATKSAMRecord expected = ArtificialSAMUtils.createArtificialRead(CLIPPED_BASES, CLIPPED_QUALS, CLIPPED_CIGAR); + + ReadClipperTestUtils.assertEqualReads(ReadClipper.hardClipLowQualEnds(read, (byte) 2), expected); + } + + @Test(enabled = true) + public void testHardClipSoftClippedBases() { + + // Generate a list of cigars to test + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + GATKSAMRecord clippedRead = ReadClipper.hardClipSoftClippedBases(read); + + int sumHardClips = 0; + int sumMatches = 0; + + boolean tail = true; + for (CigarElement element : read.getCigar().getCigarElements()) { + // Assuming cigars are well formed, if we see S or H, it means we're on the tail (left or right) + if (element.getOperator() == CigarOperator.HARD_CLIP || element.getOperator() == CigarOperator.SOFT_CLIP) + tail = true; + + // Adds all H, S and D's (next to hard/soft clips). + // All these should be hard clips after clipping.
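+ // A deletion adjacent to a clipped tail consumes reference bases but no read bases, so once its soft-clipped neighbor is hard clipped it cannot remain at the read boundary; its length should end up folded into the hard-clip total, which is exactly what the sumHardClips bookkeeping verifies below.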
+ if (tail && (element.getOperator() == CigarOperator.HARD_CLIP || element.getOperator() == CigarOperator.SOFT_CLIP || element.getOperator() == CigarOperator.DELETION)) + sumHardClips += element.getLength(); + + // this means we're no longer on the tail (insertions can still potentially be the tail because + // of the current contract of clipping out hanging insertions) + else if (element.getOperator() != CigarOperator.INSERTION) + tail = false; + + // Adds all matches to verify that they remain the same after clipping + if (element.getOperator() == CigarOperator.MATCH_OR_MISMATCH) + sumMatches += element.getLength(); + } + + for (CigarElement element : clippedRead.getCigar().getCigarElements()) { + // Test if clipped read has Soft Clips (shouldn't have any!) + Assert.assertTrue( element.getOperator() != CigarOperator.SOFT_CLIP, String.format("Cigar %s -> %s -- FAILED (resulting cigar has soft clips)", read.getCigarString(), clippedRead.getCigarString())); + + // Keep track of the total number of Hard Clips after clipping to make sure everything was accounted for + if (element.getOperator() == CigarOperator.HARD_CLIP) + sumHardClips -= element.getLength(); + + // Make sure all matches are still there + if (element.getOperator() == CigarOperator.MATCH_OR_MISMATCH) + sumMatches -= element.getLength(); + } + Assert.assertTrue( sumHardClips == 0, String.format("Cigar %s -> %s -- FAILED (number of hard clips mismatched by %d)", read.getCigarString(), clippedRead.getCigarString(), sumHardClips)); + Assert.assertTrue( sumMatches == 0, String.format("Cigar %s -> %s -- FAILED (number of matches mismatched by %d)", read.getCigarString(), clippedRead.getCigarString(), sumMatches)); + + +// logger.warn(String.format("Cigar %s -> %s -- PASSED!", read.getCigarString(), clippedRead.getCigarString())); + } + } + + @Test(enabled = false) + public void testHardClipLeadingInsertions() { + for (Cigar cigar : cigarList) { + if (startsWithInsertion(cigar)) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + GATKSAMRecord clippedRead = ReadClipper.hardClipLeadingInsertions(read); + + int expectedLength = read.getReadLength() - leadingCigarElementLength(read.getCigar(), CigarOperator.INSERTION); + if (cigarHasElementsDifferentThanInsertionsAndHardClips(read.getCigar())) + expectedLength -= leadingCigarElementLength(ReadClipperTestUtils.invertCigar(read.getCigar()), CigarOperator.INSERTION); + + if (!
clippedRead.isEmpty()) { + Assert.assertEquals(expectedLength, clippedRead.getReadLength(), String.format("%s -> %s", read.getCigarString(), clippedRead.getCigarString())); // check that everything else is still there + Assert.assertFalse(startsWithInsertion(clippedRead.getCigar())); // check that the insertions are gone + } + else + Assert.assertTrue(expectedLength == 0, String.format("expected length: %d", expectedLength)); // check that the read was expected to be fully clipped + } + } + } + + @Test(enabled = true) + public void testRevertSoftClippedBases() + { + for (Cigar cigar: cigarList) { + final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP); + final int tailSoftClips = leadingCigarElementLength(ReadClipperTestUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP); + + final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + final GATKSAMRecord unclipped = ReadClipper.revertSoftClippedBases(read); + + if ( leadingSoftClips > 0 || tailSoftClips > 0) { + final int expectedStart = read.getAlignmentStart() - leadingSoftClips; + final int expectedEnd = read.getAlignmentEnd() + tailSoftClips; + + Assert.assertEquals(unclipped.getAlignmentStart(), expectedStart); + Assert.assertEquals(unclipped.getAlignmentEnd(), expectedEnd); + } + else + Assert.assertEquals(read.getCigarString(), unclipped.getCigarString()); + } + } + + + private void assertNoLowQualBases(GATKSAMRecord read, byte low_qual) { + if (!read.isEmpty()) { + byte [] quals = read.getBaseQualities(); + for (int i=0; i<quals.length; i++) + Assert.assertFalse(quals[i] <= low_qual); + } + } + + private boolean startsWithInsertion(Cigar cigar) { + return leadingCigarElementLength(cigar, CigarOperator.INSERTION) > 0; + } + + private int leadingCigarElementLength(Cigar cigar, CigarOperator operator) { + for (CigarElement cigarElement : cigar.getCigarElements()) { + if (cigarElement.getOperator() == operator) + return cigarElement.getLength(); + if (cigarElement.getOperator() != CigarOperator.HARD_CLIP) + break; + } + return 0; + } + + private boolean cigarHasElementsDifferentThanInsertionsAndHardClips (Cigar cigar) { + for (CigarElement cigarElement : cigar.getCigarElements()) + if (cigarElement.getOperator() != CigarOperator.INSERTION && cigarElement.getOperator() != CigarOperator.HARD_CLIP) + return true; + return false; + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java deleted file mode 100644 index f625af23ce..0000000000 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.clipreads; - -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -/** - * Created by IntelliJ IDEA. - * User: roger - * Date: 9/28/11 - * Time: 9:54 PM - * To change this template use File | Settings | File Templates. - */ -public class ReadClipperUnitTest extends BaseTest { - - // TODO: Add error messages on failed tests - - GATKSAMRecord read, expected; - ReadClipper readClipper; - final static String BASES = "ACTG"; - final static String QUALS = "!+5?"; //ASCII values = 33,43,53,63 - - @BeforeClass - public void init() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); - read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length()); - read.setReadUnmappedFlag(true); - read.setReadBases(new String(BASES).getBytes()); - read.setBaseQualityString(new String(QUALS)); - - readClipper = new ReadClipper(read); - } - - @Test ( enabled = false ) - public void testHardClipBothEndsByReferenceCoordinates() { - logger.warn("Executing testHardClipBothEndsByReferenceCoordinates"); - - //Clip whole read - Assert.assertEquals(readClipper.hardClipBothEndsByReferenceCoordinates(0,0), new GATKSAMRecord(read.getHeader())); - //clip 1 base - expected = readClipper.hardClipBothEndsByReferenceCoordinates(0,3); - Assert.assertEquals(expected.getReadBases(), BASES.substring(1,3).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,3)); - Assert.assertEquals(expected.getCigarString(), "1H2M1H"); - - } - - @Test ( enabled = false ) - public void testHardClipByReadCoordinates() { - logger.warn("Executing testHardClipByReadCoordinates"); - - //Clip whole read - Assert.assertEquals(readClipper.hardClipByReadCoordinates(0,3), new GATKSAMRecord(read.getHeader())); - - //clip 1 base at start - expected = readClipper.hardClipByReadCoordinates(0,0); - Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); - Assert.assertEquals(expected.getCigarString(), "1H3M"); - - //clip 1 base at end - expected = readClipper.hardClipByReadCoordinates(3,3); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); - Assert.assertEquals(expected.getCigarString(), "3M1H"); - - //clip 2 bases at start - expected = readClipper.hardClipByReadCoordinates(0,1); - Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); - Assert.assertEquals(expected.getCigarString(), "2H2M"); - - //clip 2 bases at end - expected = readClipper.hardClipByReadCoordinates(2,3); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); - Assert.assertEquals(expected.getCigarString(), "2M2H"); - - } - - @Test 
( enabled = false ) - public void testHardClipByReferenceCoordinates() { - logger.warn("Executing testHardClipByReferenceCoordinates"); - - //Clip whole read - Assert.assertEquals(readClipper.hardClipByReferenceCoordinates(1,4), new GATKSAMRecord(read.getHeader())); - - //clip 1 base at start - expected = readClipper.hardClipByReferenceCoordinates(-1,1); - Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); - Assert.assertEquals(expected.getCigarString(), "1H3M"); - - //clip 1 base at end - expected = readClipper.hardClipByReferenceCoordinates(3,-1); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); - Assert.assertEquals(expected.getCigarString(), "3M1H"); - - //clip 2 bases at start - expected = readClipper.hardClipByReferenceCoordinates(-1,2); - Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); - Assert.assertEquals(expected.getCigarString(), "2H2M"); - - //clip 2 bases at end - expected = readClipper.hardClipByReferenceCoordinates(2,-1); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); - Assert.assertEquals(expected.getCigarString(), "2M2H"); - - } - - @Test ( enabled = false ) - public void testHardClipByReferenceCoordinatesLeftTail() { - logger.warn("Executing testHardClipByReferenceCoordinatesLeftTail"); - - //Clip whole read - Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesLeftTail(4), new GATKSAMRecord(read.getHeader())); - - //clip 1 base at start - expected = readClipper.hardClipByReferenceCoordinatesLeftTail(1); - Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); - Assert.assertEquals(expected.getCigarString(), "1H3M"); - - //clip 2 bases at start - expected = readClipper.hardClipByReferenceCoordinatesLeftTail(2); - Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); - Assert.assertEquals(expected.getCigarString(), "2H2M"); - - } - - @Test ( enabled = false ) - public void testHardClipByReferenceCoordinatesRightTail() { - logger.warn("Executing testHardClipByReferenceCoordinatesRightTail"); - - //Clip whole read - Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesRightTail(1), new GATKSAMRecord(read.getHeader())); - - //clip 1 base at end - expected = readClipper.hardClipByReferenceCoordinatesRightTail(3); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); - Assert.assertEquals(expected.getCigarString(), "3M1H"); - - //clip 2 bases at end - expected = readClipper.hardClipByReferenceCoordinatesRightTail(2); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); - Assert.assertEquals(expected.getCigarString(), "2M2H"); - - } - - @Test ( enabled = false ) - public void testHardClipLowQualEnds() { - logger.warn("Executing testHardClipByReferenceCoordinates"); - - - //Clip whole read - 
Assert.assertEquals(readClipper.hardClipLowQualEnds((byte)64), new GATKSAMRecord(read.getHeader())); - - //clip 1 base at start - expected = readClipper.hardClipLowQualEnds((byte)34); - Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); - Assert.assertEquals(expected.getCigarString(), "1H3M"); - - //clip 2 bases at start - expected = readClipper.hardClipLowQualEnds((byte)44); - Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); - Assert.assertEquals(expected.getCigarString(), "2H2M"); - - // Reverse Quals sequence - readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33 - - //clip 1 base at end - expected = readClipper.hardClipLowQualEnds((byte)34); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); - Assert.assertEquals(expected.getCigarString(), "3M1H"); - - //clip 2 bases at end - expected = readClipper.hardClipLowQualEnds((byte)44); - Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); - Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); - Assert.assertEquals(expected.getCigarString(), "2M2H"); - - // revert Qual sequence - readClipper.getRead().setBaseQualityString(QUALS); - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java index 35c6a49932..96a33b7381 100644 --- a/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java @@ -2,9 +2,7 @@ import org.broad.tribble.Tribble; import org.broad.tribble.readers.AsciiLineReader; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.Assert; @@ -120,24 +118,23 @@ private VariantContext createVC(VCFHeader header) { GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1",1); List alleles = new ArrayList(); Set filters = null; - Map attributes = new HashMap(); - Map genotypes = new HashMap(); + Map attributes = new HashMap(); + GenotypesContext genotypes = GenotypesContext.create(header.getGenotypeSamples().size()); alleles.add(Allele.create("-",true)); alleles.add(Allele.create("CC",false)); attributes.put("DP","50"); for (String name : header.getGenotypeSamples()) { - Map gtattributes = new HashMap(); + Map gtattributes = new HashMap(); gtattributes.put("BB","1"); Genotype gt = new Genotype(name,alleles.subList(1,2),0,null,gtattributes,true); - genotypes.put(name,gt); + genotypes.add(gt); } - return new VariantContext("RANDOM",loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, 0, filters, attributes, (byte)'A'); - - + return new VariantContextBuilder("RANDOM", loc.getContig(), loc.getStart(), loc.getStop(), alleles) + .genotypes(genotypes).attributes(attributes).referenceBaseForIndel((byte)'A').make(); } diff --git 
a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java index 75bdc3142a..3fb3308532 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java @@ -162,6 +162,20 @@ public void testMixedIntervalMerging() { executeTest("testMixedIntervalMerging", spec); } + @Test(enabled = true) + public void testBed() { + String md5 = "cf4278314ef8e4b996e1b798d8eb92cf"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T CountLoci" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.bed", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testBed", spec); + } + @Test(enabled = true) public void testComplexVCF() { String md5 = "166d77ac1b46a1ec38aa35ab7e628ab5"; diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index 9c3b905c25..a9035ffd92 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -1,9 +1,12 @@ package org.broadinstitute.sting.utils.interval; import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.picard.util.IntervalUtil; import net.sf.samtools.SAMFileHeader; +import org.apache.commons.io.FileUtils; +import org.broad.tribble.Feature; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.commandline.IntervalBinding; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.testng.Assert; @@ -762,4 +765,235 @@ public void testUnmergedIntervals(String unmergedIntervals) { List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); Assert.assertEquals(merged.size(), 1); } + + /* + Split into tests that can be written to files and tested by writeFlankingIntervals, + and lists that cannot but are still handled by getFlankingIntervals. + */ + private static abstract class FlankingIntervalsTestData extends TestDataProvider { + final public File referenceFile; + final public GenomeLocParser parser; + final int basePairs; + final List original; + final List expected; + + protected FlankingIntervalsTestData(Class clazz, String name, File referenceFile, GenomeLocParser parser, + int basePairs, List original, List expected) { + super(clazz, name); + this.referenceFile = referenceFile; + this.parser = parser; + this.basePairs = basePairs; + this.original = parse(parser, original); + this.expected = parse(parser, expected); + } + + private static List parse(GenomeLocParser parser, List locs) { + List parsed = new ArrayList(); + for (String loc: locs) + parsed.add("unmapped".equals(loc) ? 
GenomeLoc.UNMAPPED : parser.parseGenomeLoc(loc)); + return parsed; + } + } + + private static class FlankingIntervalsFile extends FlankingIntervalsTestData { + public FlankingIntervalsFile(String name, File referenceFile, GenomeLocParser parser, + int basePairs, List original, List expected) { + super(FlankingIntervalsFile.class, name, referenceFile, parser, basePairs, original, expected); + } + } + + private static class FlankingIntervalsList extends FlankingIntervalsTestData { + public FlankingIntervalsList(String name, File referenceFile, GenomeLocParser parser, + int basePairs, List original, List expected) { + super(FlankingIntervalsList.class, name, referenceFile, parser, basePairs, original, expected); + } + } + + /* Intervals where the original and the flanks can be written to files. */ + @DataProvider(name = "flankingIntervalsFiles") + public Object[][] getFlankingIntervalsFiles() { + File hg19ReferenceFile = new File(BaseTest.hg19Reference); + int hg19Length1 = hg19GenomeLocParser.getContigInfo("1").getSequenceLength(); + + new FlankingIntervalsFile("atStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:1"), + Arrays.asList("1:2")); + + new FlankingIntervalsFile("atStartBase50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:1"), + Arrays.asList("1:2-51")); + + new FlankingIntervalsFile("atStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:1-10"), + Arrays.asList("1:11-60")); + + new FlankingIntervalsFile("atEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:" + hg19Length1), + Arrays.asList("1:" + (hg19Length1 - 1))); + + new FlankingIntervalsFile("atEndBase50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:" + hg19Length1), + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 50, hg19Length1 - 1))); + + new FlankingIntervalsFile("atEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 10, hg19Length1)), + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 60, hg19Length1 - 11))); + + new FlankingIntervalsFile("nearStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:2"), + Arrays.asList("1:1", "1:3")); + + new FlankingIntervalsFile("nearStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:21-30"), + Arrays.asList("1:1-20", "1:31-80")); + + new FlankingIntervalsFile("nearEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:" + (hg19Length1 - 1)), + Arrays.asList("1:" + (hg19Length1 - 2), "1:" + hg19Length1)); + + new FlankingIntervalsFile("nearEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 30, hg19Length1 - 21)), + Arrays.asList( + String.format("1:%d-%d", hg19Length1 - 80, hg19Length1 - 31), + String.format("1:%d-%d", hg19Length1 - 20, hg19Length1))); + + new FlankingIntervalsFile("beyondStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:3"), + Arrays.asList("1:2", "1:4")); + + new FlankingIntervalsFile("beyondStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200"), + Arrays.asList("1:51-100", "1:201-250")); + + new FlankingIntervalsFile("beyondEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:" + (hg19Length1 - 3)), + Arrays.asList("1:" + (hg19Length1 - 4), "1:" + (hg19Length1 - 2))); + + new FlankingIntervalsFile("beyondEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + 
Arrays.asList(String.format("1:%d-%d", hg19Length1 - 200, hg19Length1 - 101)), + Arrays.asList( + String.format("1:%d-%d", hg19Length1 - 250, hg19Length1 - 201), + String.format("1:%d-%d", hg19Length1 - 100, hg19Length1 - 51))); + + new FlankingIntervalsFile("betweenFar50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:401-500"), + Arrays.asList("1:51-100", "1:201-250", "1:351-400", "1:501-550")); + + new FlankingIntervalsFile("betweenSpan50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:301-400"), + Arrays.asList("1:51-100", "1:201-300", "1:401-450")); + + new FlankingIntervalsFile("betweenOverlap50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:271-400"), + Arrays.asList("1:51-100", "1:201-270", "1:401-450")); + + new FlankingIntervalsFile("betweenShort50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:221-400"), + Arrays.asList("1:51-100", "1:201-220", "1:401-450")); + + new FlankingIntervalsFile("betweenNone50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:121-400"), + Arrays.asList("1:51-100", "1:401-450")); + + new FlankingIntervalsFile("twoContigs", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "2:301-400"), + Arrays.asList("1:51-100", "1:201-250", "2:251-300", "2:401-450")); + + // Explicit testing a problematic agilent target pair + new FlankingIntervalsFile("badAgilent", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("2:74756257-74756411", "2:74756487-74756628"), + // wrong! ("2:74756206-74756256", "2:74756412-74756462", "2:74756436-74756486", "2:74756629-74756679") + Arrays.asList("2:74756207-74756256", "2:74756412-74756486", "2:74756629-74756678")); + + return TestDataProvider.getTests(FlankingIntervalsFile.class); + } + + /* Intervals where either the original and/or the flanks cannot be written to a file. 
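+ These cover the empty list plus unmapped and whole-contig locations: either the original intervals or their flanks cannot be written to an interval file, so getFlankingIntervals is exercised on them directly and writeFlankingIntervals is expected to reject them with a UserException.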
*/ + @DataProvider(name = "flankingIntervalsLists") + public Object[][] getFlankingIntervalsLists() { + File hg19ReferenceFile = new File(BaseTest.hg19Reference); + List empty = Collections.emptyList(); + + new FlankingIntervalsList("empty", hg19ReferenceFile, hg19GenomeLocParser, 50, + empty, + empty); + + new FlankingIntervalsList("unmapped", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("unmapped"), + empty); + + new FlankingIntervalsList("fullContig", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1"), + empty); + + new FlankingIntervalsList("fullContigs", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1", "2", "3"), + empty); + + new FlankingIntervalsList("betweenWithUnmapped", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:301-400", "unmapped"), + Arrays.asList("1:51-100", "1:201-300", "1:401-450")); + + return TestDataProvider.getTests(FlankingIntervalsList.class); + } + + @Test(dataProvider = "flankingIntervalsFiles") + public void testWriteFlankingIntervals(FlankingIntervalsTestData data) throws Exception { + File originalFile = createTempFile("original.", ".intervals"); + File flankingFile = createTempFile("flanking.", ".intervals"); + try { + List lines = new ArrayList(); + for (GenomeLoc loc: data.original) + lines.add(loc.toString()); + FileUtils.writeLines(originalFile, lines); + + IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs); + + List actual = IntervalUtils.intervalFileToList(data.parser, flankingFile.getAbsolutePath()); + + String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", + data.toString(), data.original, actual, data.expected); + Assert.assertEquals(actual, data.expected, description); + } finally { + FileUtils.deleteQuietly(originalFile); + FileUtils.deleteQuietly(flankingFile); + } + } + + @Test(dataProvider = "flankingIntervalsLists", expectedExceptions = UserException.class) + public void testWritingBadFlankingIntervals(FlankingIntervalsTestData data) throws Exception { + File originalFile = createTempFile("original.", ".intervals"); + File flankingFile = createTempFile("flanking.", ".intervals"); + try { + List lines = new ArrayList(); + for (GenomeLoc loc: data.original) + lines.add(loc.toString()); + FileUtils.writeLines(originalFile, lines); + + // Should throw a user exception on bad input if either the original + // intervals are empty or if the flanking intervals are empty + IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs); + } finally { + FileUtils.deleteQuietly(originalFile); + FileUtils.deleteQuietly(flankingFile); + } + } + + @Test(dataProvider = "flankingIntervalsLists") + public void testGetFlankingIntervals(FlankingIntervalsTestData data) { + List actual = IntervalUtils.getFlankingIntervals(data.parser, data.original, data.basePairs); + String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", + data.toString(), data.original, actual, data.expected); + Assert.assertEquals(actual, data.expected, description); + } + + @Test(expectedExceptions=UserException.BadArgumentValue.class) + public void testExceptionUponLegacyIntervalSyntax() throws Exception { + GenomeAnalysisEngine toolkit = new GenomeAnalysisEngine(); + toolkit.setGenomeLocParser(new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(BaseTest.hg19Reference)))); + + // Attempting to use the legacy -L "interval1;interval2" syntax 
should produce an exception: + IntervalBinding binding = new IntervalBinding("1;2"); + List intervals = binding.getIntervals(toolkit); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java new file mode 100755 index 0000000000..b9f831028b --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java @@ -0,0 +1,112 @@ +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.testng.Assert; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.Test; + + +public class ReadUtilsUnitTest extends BaseTest { + GATKSAMRecord read, reducedRead; + final static String BASES = "ACTG"; + final static String QUALS = "!+5?"; + final private static byte[] REDUCED_READ_COUNTS = new byte[]{10, 20, 30, 40, 1}; + final private static byte[] REDUCED_READ_COUNTS_TAG = new byte[]{10, 10, 20, 30, -9}; // just the offsets + + @BeforeTest + public void init() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length()); + read.setReadUnmappedFlag(true); + read.setReadBases(new String(BASES).getBytes()); + read.setBaseQualityString(new String(QUALS)); + + reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length()); + reducedRead.setReadBases(BASES.getBytes()); + reducedRead.setBaseQualityString(QUALS); + reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, REDUCED_READ_COUNTS_TAG); + } + + @Test + public void testReducedReads() { + Assert.assertFalse(read.isReducedRead(), "isReducedRead is false for normal read"); + Assert.assertEquals(read.getReducedReadCounts(), null, "No reduced read tag in normal read"); + + Assert.assertTrue(reducedRead.isReducedRead(), "isReducedRead is true for reduced read"); + for (int i = 0; i < reducedRead.getReadLength(); i++) { + Assert.assertEquals(reducedRead.getReducedCount(i), REDUCED_READ_COUNTS[i], "Reduced read count not set to the expected value at " + i); + } + } + + @Test + public void testReducedReadPileupElement() { + PileupElement readp = new PileupElement(read, 0); + PileupElement reducedreadp = new PileupElement(reducedRead, 0); + + Assert.assertFalse(readp.isReducedRead()); + + Assert.assertTrue(reducedreadp.isReducedRead()); + Assert.assertEquals(reducedreadp.getRepresentativeCount(), REDUCED_READ_COUNTS[0]); + Assert.assertEquals(reducedreadp.getQual(), readp.getQual()); + } + + @Test + public void testGetAdaptorBoundary() { + final byte[] bases = {'A', 'C', 'G', 'T', 'A', 'C', 'G', 'T'}; + final byte[] quals = {30, 30, 30, 30, 30, 30, 30, 30}; + final String cigar = "8M"; + final int fragmentSize = 10; + final int mateStart = 1000; + final int BEFORE = mateStart - 2; + final int AFTER = mateStart + 2; + Integer myStart, boundary; + + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, cigar); + read.setMateAlignmentStart(mateStart); + read.setInferredInsertSize(fragmentSize); + + // Test case 1: positive strand, first read + myStart = BEFORE; + read.setAlignmentStart(myStart); + read.setReadNegativeStrandFlag(false); + boundary = ReadUtils.getAdaptorBoundary(read); + Assert.assertEquals(boundary.intValue(), myStart + fragmentSize + 1); + + // Test case 2: positive strand, 
second read + myStart = AFTER; + read.setAlignmentStart(myStart); + read.setReadNegativeStrandFlag(false); + boundary = ReadUtils.getAdaptorBoundary(read); + Assert.assertEquals(boundary.intValue(), myStart + fragmentSize + 1); + + // Test case 3: negative strand, second read + myStart = AFTER; + read.setAlignmentStart(myStart); + read.setReadNegativeStrandFlag(true); + boundary = ReadUtils.getAdaptorBoundary(read); + Assert.assertEquals(boundary.intValue(), mateStart - 1); + + // Test case 4: negative strand, first read + myStart = BEFORE; + read.setAlignmentStart(myStart); + read.setReadNegativeStrandFlag(true); + boundary = ReadUtils.getAdaptorBoundary(read); + Assert.assertEquals(boundary.intValue(), mateStart - 1); + + // Test case 5: mate is mapped to another chromosome (test both strands) + read.setInferredInsertSize(0); + read.setReadNegativeStrandFlag(true); + boundary = ReadUtils.getAdaptorBoundary(read); + Assert.assertNull(boundary); + read.setReadNegativeStrandFlag(false); + boundary = ReadUtils.getAdaptorBoundary(read); + Assert.assertNull(boundary); + + // Test case 6: read is unmapped + read.setReadUnmappedFlag(true); + boundary = ReadUtils.getAdaptorBoundary(read); + Assert.assertNull(boundary); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java index 9243588ab2..a66c78f3c4 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -29,10 +29,13 @@ // the imports for unit testing. +import org.broadinstitute.sting.utils.MathUtils; import org.testng.Assert; import org.testng.annotations.Test; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import java.util.EnumMap; + /** * Basic unit test for Genotype likelihoods objects @@ -69,6 +72,62 @@ public void testErrorBadFormat() { gl.getAsVector(); } + @Test + public void testGetAsMap(){ + GenotypeLikelihoods gl = new GenotypeLikelihoods(v); + //Log scale + EnumMap glMap = gl.getAsMap(false); + Assert.assertEquals(v[Genotype.Type.HOM_REF.ordinal()-1],glMap.get(Genotype.Type.HOM_REF)); + Assert.assertEquals(v[Genotype.Type.HET.ordinal()-1],glMap.get(Genotype.Type.HET)); + Assert.assertEquals(v[Genotype.Type.HOM_VAR.ordinal()-1],glMap.get(Genotype.Type.HOM_VAR)); + + //Linear scale + glMap = gl.getAsMap(true); + double [] vl = MathUtils.normalizeFromLog10(v); + Assert.assertEquals(vl[Genotype.Type.HOM_REF.ordinal()-1],glMap.get(Genotype.Type.HOM_REF)); + Assert.assertEquals(vl[Genotype.Type.HET.ordinal()-1],glMap.get(Genotype.Type.HET)); + Assert.assertEquals(vl[Genotype.Type.HOM_VAR.ordinal()-1],glMap.get(Genotype.Type.HOM_VAR)); + + //Test missing likelihoods + gl = new GenotypeLikelihoods("."); + glMap = gl.getAsMap(false); + Assert.assertNull(glMap); + + } + + @Test + public void testGetLog10GQ(){ + GenotypeLikelihoods gl = new GenotypeLikelihoods(vPLString); + + //GQ for the best guess genotype + Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HET),-3.9); + + double[] test = MathUtils.normalizeFromLog10(gl.getAsVector()); + + //GQ for the other genotypes + Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HOM_REF), Math.log10(1.0 - test[Genotype.Type.HOM_REF.ordinal()-1])); + Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HOM_VAR), Math.log10(1.0 - test[Genotype.Type.HOM_VAR.ordinal()-1])); + + //Test 
missing likelihoods + gl = new GenotypeLikelihoods("."); + Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HOM_REF),Double.NEGATIVE_INFINITY); + Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HET),Double.NEGATIVE_INFINITY); + Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HOM_VAR),Double.NEGATIVE_INFINITY); + + } + + @Test + public void testgetQualFromLikelihoods(){ + double[] likelihoods = new double[]{-1, 0, -2}; + // qual values we expect for each possible "best" genotype + double[] expectedQuals = new double[]{-0.04100161, -1, -0.003930294}; + + for ( int i = 0; i < likelihoods.length; i++ ) { + Assert.assertEquals(GenotypeLikelihoods.getQualFromLikelihoods(i, likelihoods), expectedQuals[i], 1e-6, + "GQ value for genotype " + i + " was not calculated correctly"); + } + } + private void assertDoubleArraysAreEqual(double[] v1, double[] v2) { Assert.assertEquals(v1.length, v2.length); for ( int i = 0; i < v1.length; i++ ) { diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeUnitTest.java index c4f1efd041..e0a037105d 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeUnitTest.java @@ -71,8 +71,8 @@ public void before() { // public boolean sameGenotype(Genotype other) // public boolean sameGenotype(Genotype other, boolean ignorePhase) // public String getSampleName() -// public boolean hasNegLog10PError() -// public double getNegLog10PError() +// public boolean hasLog10PError() +// public double getLog10PError() // public double getPhredScaledQual() // public boolean hasAttribute(String key) // public Object getAttribute(String key) diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypesContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypesContextUnitTest.java new file mode 100644 index 0000000000..ee0a5dfe03 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypesContextUnitTest.java @@ -0,0 +1,310 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +// our package +package org.broadinstitute.sting.utils.variantcontext; + + +// the imports for unit testing. 
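For orientation, here is a minimal sketch of the GenotypesContext API that the new test class below exercises. It uses only constructors and methods that appear in this patch (Allele.create, the Genotype(name, alleles) constructor, GenotypesContext.copy, iterateInSampleNameOrder, immutable); the standalone class and the sample names are illustrative and are not part of the patch itself:

    import java.util.Arrays;

    import org.broadinstitute.sting.utils.variantcontext.Allele;
    import org.broadinstitute.sting.utils.variantcontext.Genotype;
    import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;

    public class GenotypesContextSketch {
        public static void main(String[] args) {
            final Allele ref = Allele.create("A", true); // 'true' marks the reference allele
            final Allele alt = Allele.create("T");

            // two samples, deliberately added out of name order
            final Genotype na2 = new Genotype("NA2", Arrays.asList(ref, alt));
            final Genotype na1 = new Genotype("NA1", Arrays.asList(ref, ref));
            final GenotypesContext gc = GenotypesContext.copy(Arrays.asList(na2, na1));

            // iteration by sample name yields NA1 before NA2, regardless of insertion order
            for (final Genotype g : gc.iterateInSampleNameOrder())
                System.out.println(g.getSampleName() + " -> " + g.getAlleles());

            gc.immutable(); // after this, mutating calls such as add() are expected to throw
        }
    }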
+ + +import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + + +public class GenotypesContextUnitTest extends BaseTest { + Allele Aref, C, T; + Genotype AA, AT, TT, AC, CT, CC, MISSING; + List allGenotypes; + + @BeforeSuite + public void before() { + C = Allele.create("C"); + Aref = Allele.create("A", true); + T = Allele.create("T"); + AA = new Genotype("AA", Arrays.asList(Aref, Aref)); + AT = new Genotype("AT", Arrays.asList(Aref, T)); + TT = new Genotype("TT", Arrays.asList(T, T)); + AC = new Genotype("AC", Arrays.asList(Aref, C)); + CT = new Genotype("CT", Arrays.asList(C, T)); + CC = new Genotype("CC", Arrays.asList(C, C)); + MISSING = new Genotype("MISSING", Arrays.asList(C, C)); + + allGenotypes = Arrays.asList(AA, AT, TT, AC, CT, CC); + } + + // -------------------------------------------------------------------------------- + // + // Provider + // + // -------------------------------------------------------------------------------- + + private interface ContextMaker { + public GenotypesContext make(List initialSamples); + } + + private ContextMaker baseMaker = new ContextMaker() { + @Override + public GenotypesContext make(final List initialSamples) { + return GenotypesContext.copy(initialSamples); + } + + @Override + public String toString() { + return "GenotypesContext"; + } + }; + + private final class lazyMaker implements LazyGenotypesContext.LazyParser, ContextMaker { + @Override + public LazyGenotypesContext.LazyData parse(final Object data) { + GenotypesContext gc = GenotypesContext.copy((List)data); + gc.ensureSampleNameMap(); + gc.ensureSampleOrdering(); + return new LazyGenotypesContext.LazyData(gc.notToBeDirectlyAccessedGenotypes, gc.sampleNamesInOrder, gc.sampleNameToOffset); + } + + @Override + public GenotypesContext make(final List initialSamples) { + return new LazyGenotypesContext(this, initialSamples, initialSamples.size()); + } + + @Override + public String toString() { + return "LazyGenotypesContext"; + } + } + + private Collection allMakers = Arrays.asList(baseMaker, new lazyMaker()); + + private class GenotypesContextProvider extends TestDataProvider { + ContextMaker maker; + final List initialSamples; + + private GenotypesContextProvider(ContextMaker maker, List initialSamples) { + super(GenotypesContextProvider.class, String.format("%s with %d samples", maker.toString(), initialSamples.size())); + this.maker = maker; + this.initialSamples = initialSamples; + } + + public GenotypesContext makeContext() { + return maker.make(initialSamples); + } + } + + @DataProvider(name = "GenotypesContextProvider") + public Object[][] MakeSampleNamesTest() { + for ( ContextMaker maker : allMakers ) { + for ( int i = 0; i < allGenotypes.size(); i++ ) { + List samples = allGenotypes.subList(0, i); + // sorted + new GenotypesContextProvider(maker, samples); + // unsorted + new GenotypesContextProvider(maker, Utils.reverse(samples)); + } + } + + return GenotypesContextProvider.getTests(GenotypesContextProvider.class); + } + + private final static void testIterable(Iterable genotypeIterable, Set expectedNames) { + int count = 0; + for ( final Genotype g : genotypeIterable ) { + Assert.assertTrue(expectedNames.contains(g.getSampleName())); + count++; + } + Assert.assertEquals(count, 
expectedNames.size(), "Iterable returned unexpected number of genotypes"); + } + + @Test(dataProvider = "GenotypesContextProvider") + public void testInitialSamplesAreAsExpected(GenotypesContextProvider cfg) { + testGenotypesContextContainsExpectedSamples(cfg.makeContext(), cfg.initialSamples); + } + + private final void testGenotypesContextContainsExpectedSamples(GenotypesContext gc, List expectedSamples) { + Assert.assertEquals(gc.isEmpty(), expectedSamples.isEmpty()); + Assert.assertEquals(gc.size(), expectedSamples.size()); + + // get(index) is doing the right thing + for ( int i = 0; i < expectedSamples.size(); i++ ) { + Assert.assertEquals(gc.get(i), expectedSamples.get(i)); + } + Assert.assertFalse(gc.containsSample(MISSING.getSampleName())); + + // we can fetch samples by name + final Set genotypeNames = VariantContextUtils.genotypeNames(expectedSamples); + for ( final String name : genotypeNames ) { + Assert.assertTrue(gc.containsSample(name)); + } + Assert.assertFalse(gc.containsSample(MISSING.getSampleName())); + + // all of the iterators are working + testIterable(gc.iterateInSampleNameOrder(), genotypeNames); + testIterable(gc, genotypeNames); + testIterable(gc.iterateInSampleNameOrder(genotypeNames), genotypeNames); + if ( ! genotypeNames.isEmpty() ) { + Set first = Collections.singleton(genotypeNames.iterator().next()); + testIterable(gc.iterateInSampleNameOrder(first), first); + } + + // misc. utils are working as expected + Assert.assertEquals(gc.getSampleNames(), genotypeNames); + Assert.assertTrue(ParsingUtils.isSorted(gc.getSampleNamesOrderedByName())); + Assert.assertTrue(ParsingUtils.isSorted(gc.iterateInSampleNameOrder())); + Assert.assertTrue(gc.containsSamples(genotypeNames)); + + final Set withMissing = new HashSet(Arrays.asList(MISSING.getSampleName())); + withMissing.addAll(genotypeNames); + Assert.assertFalse(gc.containsSamples(withMissing)); + } + + @Test(enabled = true, dataProvider = "GenotypesContextProvider") + public void testImmutable(GenotypesContextProvider cfg) { + GenotypesContext gc = cfg.makeContext(); + Assert.assertEquals(gc.isMutable(), true); + gc.immutable(); + Assert.assertEquals(gc.isMutable(), false); + } + + @Test(enabled = true, dataProvider = "GenotypesContextProvider", expectedExceptions = Throwable.class ) + public void testImmutableCall1(GenotypesContextProvider cfg) { + GenotypesContext gc = cfg.makeContext(); + gc.immutable(); + gc.add(MISSING); + } + + @Test(enabled = true, dataProvider = "GenotypesContextProvider") + public void testClear(GenotypesContextProvider cfg) { + GenotypesContext gc = cfg.makeContext(); + gc.clear(); + testGenotypesContextContainsExpectedSamples(gc, Collections.emptyList()); + } + + private static final List with(List genotypes, Genotype ... add) { + List l = new ArrayList(genotypes); + l.addAll(Arrays.asList(add)); + return l; + } + + private static final List without(List genotypes, Genotype ... 
remove) { + List l = new ArrayList(genotypes); + l.removeAll(Arrays.asList(remove)); + return l; + } + + @Test(enabled = true, dataProvider = "GenotypesContextProvider") + public void testAdds(GenotypesContextProvider cfg) { + Genotype add1 = new Genotype("add1", Arrays.asList(Aref, Aref)); + Genotype add2 = new Genotype("add2", Arrays.asList(Aref, Aref)); + + GenotypesContext gc = cfg.makeContext(); + gc.add(add1); + testGenotypesContextContainsExpectedSamples(gc, with(cfg.initialSamples, add1)); + + gc = cfg.makeContext(); + gc.add(add1); + gc.add(add2); + testGenotypesContextContainsExpectedSamples(gc, with(cfg.initialSamples, add1, add2)); + + gc = cfg.makeContext(); + gc.addAll(Arrays.asList(add1, add2)); + testGenotypesContextContainsExpectedSamples(gc, with(cfg.initialSamples, add1, add2)); + } + + @Test(enabled = true, dataProvider = "GenotypesContextProvider") + public void testRemoves(GenotypesContextProvider cfg) { + Genotype rm1 = AA; + Genotype rm2 = AC; + + GenotypesContext gc = cfg.makeContext(); + if (gc.size() > 1) { + Genotype rm = gc.get(0); + gc.remove(rm); + testGenotypesContextContainsExpectedSamples(gc, without(cfg.initialSamples, rm)); + } + + gc = cfg.makeContext(); + gc.remove(rm1); + testGenotypesContextContainsExpectedSamples(gc, without(cfg.initialSamples, rm1)); + + gc = cfg.makeContext(); + gc.remove(rm1); + gc.remove(rm2); + testGenotypesContextContainsExpectedSamples(gc, without(cfg.initialSamples, rm1, rm2)); + + gc = cfg.makeContext(); + gc.removeAll(Arrays.asList(rm1, rm2)); + testGenotypesContextContainsExpectedSamples(gc, without(cfg.initialSamples, rm1, rm2)); + + gc = cfg.makeContext(); + HashSet expected = new HashSet(); + if ( gc.contains(rm1) ) expected.add(rm1); + if ( gc.contains(rm2) ) expected.add(rm2); + gc.retainAll(Arrays.asList(rm1, rm2)); + + // ensure that the two lists are the same + Assert.assertEquals(new HashSet(gc.getGenotypes()), expected); + // because the list order can change, we use the gc's list itself + testGenotypesContextContainsExpectedSamples(gc, gc.getGenotypes()); + } + + @Test(enabled = true, dataProvider = "GenotypesContextProvider") + public void testSet(GenotypesContextProvider cfg) { + Genotype set = new Genotype("replace", Arrays.asList(Aref, Aref)); + int n = cfg.makeContext().size(); + for ( int i = 0; i < n; i++ ) { + GenotypesContext gc = cfg.makeContext(); + Genotype setted = gc.set(i, set); + Assert.assertNotNull(setted); + ArrayList l = new ArrayList(cfg.initialSamples); + l.set(i, set); + testGenotypesContextContainsExpectedSamples(gc, l); + } + } + + @Test(enabled = true, dataProvider = "GenotypesContextProvider") + public void testReplace(GenotypesContextProvider cfg) { + int n = cfg.makeContext().size(); + for ( int i = 0; i < n; i++ ) { + GenotypesContext gc = cfg.makeContext(); + Genotype toReplace = gc.get(i); + Genotype replacement = new Genotype(toReplace.getSampleName(), Arrays.asList(Aref, Aref)); + gc.replace(replacement); + ArrayList l = new ArrayList(cfg.initialSamples); + l.set(i, replacement); + Assert.assertEquals(replacement, gc.get(i)); + testGenotypesContextContainsExpectedSamples(gc, l); + } + } + + // subset to samples tested in VariantContextUnitTest +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java new file mode 100644 index 0000000000..a71949369b --- /dev/null +++ 
b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.variantcontext; + +import com.google.caliper.Param; +import com.google.caliper.SimpleBenchmark; +import com.google.caliper.runner.CaliperMain; +import net.sf.picard.reference.ReferenceSequenceFile; +import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodec; +import org.broad.tribble.readers.AsciiLineReader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; + +import java.io.*; +import java.util.*; + +/** + * Caliper microbenchmark of parsing a VCF file + */ +public class VariantContextBenchmark extends SimpleBenchmark { + @Param({"/Users/depristo/Desktop/broadLocal/localData/ALL.chr20.merged_beagle_mach.20101123.snps_indels_svs.genotypes.vcf"}) + String vcfFile; + + @Param({"1000"}) + int linesToRead; // set automatically by framework + + @Param({"100"}) + int nSamplesToTake; // set automatically by framework + + @Param({"10"}) + int dupsToMerge; // set automatically by framework + + @Param + Operation operation; // set automatically by framework + + private String INPUT_STRING; + + public enum Operation { + READ, + SUBSET_TO_SAMPLES, + GET_TYPE, + GET_ID, + GET_GENOTYPES, + GET_ATTRIBUTE_STRING, + GET_ATTRIBUTE_INT, + GET_N_SAMPLES, + GET_GENOTYPES_FOR_SAMPLES, + GET_GENOTYPES_IN_ORDER_OF_NAME, + CALC_GENOTYPE_COUNTS, + MERGE + } + + private GenomeLocParser b37GenomeLocParser; + + @Override protected void setUp() { + try { + ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.b37KGReference)); + b37GenomeLocParser = new GenomeLocParser(seq); + } catch ( FileNotFoundException e) { + throw new RuntimeException(e); + } + + // read it into a String so that we don't try to benchmark IO issues + try { + FileInputStream s = new FileInputStream(new File(vcfFile)); + AsciiLineReader lineReader = new AsciiLineReader(s); + int counter = 0; + StringBuffer sb = new StringBuffer(); + while (counter++ < linesToRead ) { + String line = lineReader.readLine(); + if ( line == null ) + break; + sb.append(line + "\n"); + } + s.close(); + INPUT_STRING = sb.toString(); + } catch (IOException e) { + 
throw new RuntimeException(e); + } + } + + private interface FunctionToBenchmark { + public void run(T vc); + } + + private void runBenchmark(FeatureCodec codec, FunctionToBenchmark func) { + try { + InputStream is = new ByteArrayInputStream(INPUT_STRING.getBytes()); + AsciiLineReader lineReader = new AsciiLineReader(is); + codec.readHeader(lineReader); + + int counter = 0; + while (counter++ < linesToRead ) { + String line = lineReader.readLine(); + if ( line == null ) + break; + + T vc = codec.decode(line); + func.run(vc); + } + } catch (Exception e) { + System.out.println("Benchmarking run failure because of " + e.getMessage()); + } + } + + public void timeV14(int rep) { + for ( int i = 0; i < rep; i++ ) { + FunctionToBenchmark func = getV14FunctionToBenchmark(); + FeatureCodec codec = new VCFCodec(); + runBenchmark(codec, func); + } + } + + public FunctionToBenchmark getV14FunctionToBenchmark() { + switch ( operation ) { + case READ: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + ; // empty operation + } + }; + case SUBSET_TO_SAMPLES: + return new FunctionToBenchmark() { + Set samples; + public void run(final VariantContext vc) { + if ( samples == null ) + samples = new HashSet(new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake)); + VariantContext sub = vc.subContextFromSamples(samples); + sub.getNSamples(); + } + }; + case GET_TYPE: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getType(); + } + }; + case GET_ID: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getID(); + } + }; + case GET_GENOTYPES: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getGenotypes().size(); + } + }; + + case GET_GENOTYPES_FOR_SAMPLES: + return new FunctionToBenchmark() { + Set samples; + public void run(final VariantContext vc) { + if ( samples == null ) + samples = new HashSet(new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake)); + vc.getGenotypes(samples).size(); + } + }; + + case GET_ATTRIBUTE_STRING: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getAttribute("AN", null); + } + }; + + case GET_ATTRIBUTE_INT: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getAttributeAsInt("AC", 0); + } + }; + + case GET_N_SAMPLES: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getNSamples(); + } + }; + + case GET_GENOTYPES_IN_ORDER_OF_NAME: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + ; // TODO - TEST IS BROKEN +// int n = 0; +// for ( final Genotype g: vc.getGenotypesOrderedByName() ) n++; + } + }; + + case CALC_GENOTYPE_COUNTS: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + vc.getHetCount(); + } + }; + + case MERGE: + return new FunctionToBenchmark() { + public void run(final VariantContext vc) { + List toMerge = new ArrayList(); + + for ( int i = 0; i < dupsToMerge; i++ ) { + GenotypesContext gc = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype g : vc.getGenotypes() ) { + gc.add(new Genotype(g.getSampleName()+"_"+i, g)); + } + toMerge.add(new VariantContextBuilder(vc).genotypes(gc).make()); + } + + VariantContextUtils.simpleMerge(b37GenomeLocParser, toMerge, null, + VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + VariantContextUtils.GenotypeMergeType.UNSORTED, + true, false, "set", false, true); + } + }; + + 
default: throw new IllegalArgumentException("Unexpected operation " + operation); + } + } + + // -------------------------------------------------------------------------------- + // + // V13 + // + // In order to use this, you must move the v13 version from archive and uncomment + // + // git mv private/archive/java/src/org/broadinstitute/sting/utils/variantcontext/v13 public/java/test/org/broadinstitute/sting/utils/variantcontext/v13 + // + // -------------------------------------------------------------------------------- + +// public void timeV13(int rep) { +// for ( int i = 0; i < rep; i++ ) { +// FunctionToBenchmark func = getV13FunctionToBenchmark(); +// FeatureCodec codec = new org.broadinstitute.sting.utils.variantcontext.v13.VCFCodec(); +// runBenchmark(codec, func); +// } +// } +// +// public FunctionToBenchmark getV13FunctionToBenchmark() { +// switch ( operation ) { +// case READ: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// ; // empty operation +// } +// }; +// case SUBSET_TO_SAMPLES: +// return new FunctionToBenchmark() { +// List samples; +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// if ( samples == null ) +// samples = new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake); +// org.broadinstitute.sting.utils.variantcontext.v13.VariantContext sub = vc.subContextFromGenotypes(vc.getGenotypes(samples).values()); +// sub.getNSamples(); +// } +// }; +// +// case GET_TYPE: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// vc.getType(); +// } +// }; +// case GET_ID: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// vc.getID(); +// } +// }; +// case GET_GENOTYPES: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// vc.getGenotypes().size(); +// } +// }; +// +// case GET_GENOTYPES_FOR_SAMPLES: +// return new FunctionToBenchmark() { +// Set samples; +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// if ( samples == null ) +// samples = new HashSet(new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake)); +// vc.getGenotypes(samples).size(); +// } +// }; +// +// case GET_ATTRIBUTE_STRING: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// vc.getAttribute("AN", null); +// } +// }; +// +// case GET_ATTRIBUTE_INT: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// vc.getAttributeAsInt("AC", 0); +// } +// }; +// +// case GET_N_SAMPLES: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// vc.getNSamples(); +// } +// }; +// +// case GET_GENOTYPES_IN_ORDER_OF_NAME: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// ; // TODO - TEST IS BROKEN +// //vc.getGenotypesOrderedByName(); +// } +// }; +// +// case CALC_GENOTYPE_COUNTS: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// 
vc.getHetCount(); +// } +// }; +// +// case MERGE: +// return new FunctionToBenchmark() { +// public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) { +// List toMerge = new ArrayList(); +// +// for ( int i = 0; i < dupsToMerge; i++ ) { +// Map gc = new HashMap(); +// for ( final org.broadinstitute.sting.utils.variantcontext.v13.Genotype g : vc.getGenotypes().values() ) { +// String name = g.getSampleName()+"_"+i; +// gc.put(name, new org.broadinstitute.sting.utils.variantcontext.v13.Genotype(name, +// g.getAlleles(), g.getLog10PError(), g.getFilters(), g.getAttributes(), g.isPhased(), g.getLikelihoods().getAsVector())); +// toMerge.add(org.broadinstitute.sting.utils.variantcontext.v13.VariantContext.modifyGenotypes(vc, gc)); +// } +// } +// +// org.broadinstitute.sting.utils.variantcontext.v13.VariantContextUtils.simpleMerge(b37GenomeLocParser, +// toMerge, null, +// org.broadinstitute.sting.utils.variantcontext.v13.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, +// org.broadinstitute.sting.utils.variantcontext.v13.VariantContextUtils.GenotypeMergeType.UNSORTED, +// true, false, "set", false, true); +// } +// }; +// +// default: throw new IllegalArgumentException("Unexpected operation " + operation); +// } +// } + + public static void main(String[] args) { + CaliperMain.main(VariantContextBenchmark.class, args); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextIntegrationTest.java deleted file mode 100755 index 67fe7d012f..0000000000 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextIntegrationTest.java +++ /dev/null @@ -1,65 +0,0 @@ - - -package org.broadinstitute.sting.utils.variantcontext; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.HashMap; -import java.util.Map; -import java.util.Arrays; - -public class VariantContextIntegrationTest extends WalkerTest { - private static String cmdRoot = "-T TestVariantContext" + - " -R " + b36KGReference; - - private static String root = cmdRoot + - " -L 1:1-1,000,000 -V " + b36dbSNP129; - - private static final class VCITTest extends TestDataProvider { - String args, md5; - - private VCITTest(final String args, final String md5) { - super(VCITTest.class); - this.args = args; - this.md5 = md5; - } - } - - @DataProvider(name = "VCITTestData") - public Object[][] createVCITTestData() { - new VCITTest("--printPerLocus", "e9d0f1fe80659bb55b40aa6c3a2e921e"); - new VCITTest("--printPerLocus --onlyContextsOfType SNP", "0e620db3e45771df42c54a9c0ae4a29f"); - new VCITTest("--printPerLocus --onlyContextsOfType INDEL", "b725c204fefe3814644d50e7c20f9dfe"); - new VCITTest("--printPerLocus --onlyContextsOfType MIXED", "3ccc33f496a1718df55722d11cc14334"); - new VCITTest("--printPerLocus --onlyContextsOfType NO_VARIATION", "39335acdb34c8a2af433dc50d619bcbc"); - new VCITTest("--printPerLocus --takeFirstOnly", "3a45561da042b2b44b6a679744f16103"); - new VCITTest("--printPerLocus --onlyContextsOfType INDEL --onlyContextsStartinAtCurrentPosition", "4746f269ecc377103f83eb61cc162c39"); - new VCITTest("--printPerLocus --onlyContextsStartinAtCurrentPosition", "2749e3fae458650a85a2317e346dc44c"); - new VCITTest("--printPerLocus --takeFirstOnly --onlyContextsStartinAtCurrentPosition", "9bd48c2a40813023e29ffaa23d59d382"); - - return 
VCITTest.getTests(VCITTest.class); - } - - @Test(dataProvider = "VCITTestData") - public void testConversionSelection(VCITTest test) { - String extraArgs = test.args; - String md5 = test.md5; - - WalkerTestSpec spec = new WalkerTestSpec( root + " " + extraArgs + " -o %s", - 1, // just one output file - Arrays.asList(md5)); - executeTest("testSelectors", spec); - } - - @Test - public void testToVCF() { - // this really just tests that we are seeing the same number of objects over all of chr1 - - WalkerTestSpec spec = new WalkerTestSpec( cmdRoot + " -NO_HEADER -V:VCF3 " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.vcf -L 1:1-1000000 -o %s --outputVCF %s", - 2, // just one output file - Arrays.asList("e3c35d0c4b5d4935c84a270f9df0951f", "ff91731213fd0bbdc200ab6fd1c93e63")); - executeTest("testToVCF", spec); - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java index a4d78b6377..0e75eee143 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java @@ -6,15 +6,16 @@ import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.testng.annotations.BeforeSuite; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.BeforeMethod; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import org.testng.Assert; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; +import java.lang.reflect.Array; +import java.util.*; public class VariantContextUnitTest extends BaseTest { @@ -41,6 +42,8 @@ public class VariantContextUnitTest extends BaseTest { int mixedLocStart = 20; int mixedLocStop = 23; + VariantContextBuilder basicBuilder, snpBuilder, insBuilder; + @BeforeSuite public void before() { del = Allele.create("-"); @@ -56,6 +59,13 @@ public void before() { ATCref = Allele.create("ATC", true); } + @BeforeMethod + public void beforeTest() { + basicBuilder = new VariantContextBuilder("test", snpLoc,snpLocStart, snpLocStop, Arrays.asList(Aref, T)).referenceBaseForIndel((byte)'A'); + snpBuilder = new VariantContextBuilder("test", snpLoc,snpLocStart, snpLocStop, Arrays.asList(Aref, T)).referenceBaseForIndel((byte)'A'); + insBuilder = new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, ATC)).referenceBaseForIndel((byte)'A'); + } + @Test public void testDetermineTypes() { Allele ACref = Allele.create("AC", true); @@ -70,68 +80,68 @@ public void testDetermineTypes() { // test REF List alleles = Arrays.asList(Tref); - VariantContext vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles); + VariantContext vc = snpBuilder.alleles(alleles).stop(snpLocStop).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.NO_VARIATION); // test SNPs alleles = Arrays.asList(Tref, A); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles); + vc = snpBuilder.alleles(alleles).stop(snpLocStop).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.SNP); alleles = Arrays.asList(Tref, A, C); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles); + vc = snpBuilder.alleles(alleles).stop(snpLocStop).make(); Assert.assertEquals(vc.getType(), 
VariantContext.Type.SNP); // test MNPs alleles = Arrays.asList(ACref, TA); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles); + vc = snpBuilder.alleles(alleles).stop(snpLocStop+1).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.MNP); alleles = Arrays.asList(ATCref, CAT, Allele.create("GGG")); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles); + vc = basicBuilder.alleles(alleles).stop(snpLocStop+2).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.MNP); // test INDELs alleles = Arrays.asList(Aref, ATC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + vc = basicBuilder.alleles(alleles).stop(snpLocStop).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(ATCref, A); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + vc = basicBuilder.alleles(alleles).stop(snpLocStop+2).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(Tref, TA, TC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + vc = basicBuilder.alleles(alleles).stop(snpLocStop).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(ATCref, A, AC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + vc = basicBuilder.alleles(alleles).stop(snpLocStop+2).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(ATCref, A, Allele.create("ATCTC")); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + vc = basicBuilder.alleles(alleles).stop(snpLocStop+2).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); // test MIXED alleles = Arrays.asList(TAref, T, TC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + vc = basicBuilder.alleles(alleles).stop(snpLocStop+1).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); alleles = Arrays.asList(TAref, T, AC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + vc = basicBuilder.alleles(alleles).stop(snpLocStop+1).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); alleles = Arrays.asList(ACref, ATC, AT); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + vc = basicBuilder.alleles(alleles).stop(snpLocStop+1).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); alleles = Arrays.asList(Aref, T, symbolic); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + vc = basicBuilder.alleles(alleles).stop(snpLocStop).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); // test SYMBOLIC alleles = Arrays.asList(Tref, symbolic); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, 
null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + vc = basicBuilder.alleles(alleles).stop(snpLocStop).make(); Assert.assertEquals(vc.getType(), VariantContext.Type.SYMBOLIC); } @@ -139,8 +149,8 @@ public void testDetermineTypes() { public void testMultipleSNPAlleleOrdering() { final List allelesNaturalOrder = Arrays.asList(Aref, C, T); final List allelesUnnaturalOrder = Arrays.asList(Aref, T, C); - VariantContext naturalVC = new VariantContext("natural", snpLoc, snpLocStart, snpLocStop, allelesNaturalOrder); - VariantContext unnaturalVC = new VariantContext("unnatural", snpLoc, snpLocStart, snpLocStop, allelesUnnaturalOrder); + VariantContext naturalVC = snpBuilder.alleles(allelesNaturalOrder).make(); + VariantContext unnaturalVC = snpBuilder.alleles(allelesUnnaturalOrder).make(); Assert.assertEquals(new ArrayList(naturalVC.getAlleles()), allelesNaturalOrder); Assert.assertEquals(new ArrayList(unnaturalVC.getAlleles()), allelesUnnaturalOrder); } @@ -149,7 +159,7 @@ public void testMultipleSNPAlleleOrdering() { public void testCreatingSNPVariantContext() { List alleles = Arrays.asList(Aref, T); - VariantContext vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles); + VariantContext vc = snpBuilder.alleles(alleles).make(); Assert.assertEquals(vc.getChr(), snpLoc); Assert.assertEquals(vc.getStart(), snpLocStart); @@ -175,8 +185,8 @@ public void testCreatingSNPVariantContext() { @Test public void testCreatingRefVariantContext() { - List alleles = Arrays.asList(Aref); - VariantContext vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles); + List alleles = Arrays.asList(Aref); + VariantContext vc = snpBuilder.alleles(alleles).make(); Assert.assertEquals(vc.getChr(), snpLoc); Assert.assertEquals(vc.getStart(), snpLocStart); @@ -202,7 +212,7 @@ public void testCreatingRefVariantContext() { @Test public void testCreatingDeletionVariantContext() { List alleles = Arrays.asList(ATCref, del); - VariantContext vc = new VariantContext("test", delLoc, delLocStart, delLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + VariantContext vc = new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, alleles).referenceBaseForIndel((byte)'A').make(); Assert.assertEquals(vc.getChr(), delLoc); Assert.assertEquals(vc.getStart(), delLocStart); @@ -229,7 +239,7 @@ public void testCreatingDeletionVariantContext() { @Test public void testCreatingInsertionVariantContext() { List alleles = Arrays.asList(delRef, ATC); - VariantContext vc = new VariantContext("test", insLoc, insLocStart, insLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + VariantContext vc = insBuilder.alleles(alleles).make(); Assert.assertEquals(vc.getChr(), insLoc); Assert.assertEquals(vc.getStart(), insLocStart); @@ -255,18 +265,18 @@ public void testCreatingInsertionVariantContext() { @Test public void testCreatingPartiallyCalledGenotype() { List alleles = Arrays.asList(Aref, C); - Genotype g = new Genotype("foo", Arrays.asList(C, Allele.NO_CALL), 10); - VariantContext vc = new VariantContext("test", snpLoc, snpLocStart, snpLocStop, alleles, Arrays.asList(g)); + Genotype g = new Genotype("foo", Arrays.asList(C, Allele.NO_CALL)); + VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g).make(); Assert.assertTrue(vc.isSNP()); Assert.assertEquals(vc.getNAlleles(), 2); Assert.assertTrue(vc.hasGenotypes()); - Assert.assertFalse(vc.isMonomorphic()); 
-        Assert.assertTrue(vc.isPolymorphic());
+        Assert.assertFalse(vc.isMonomorphicInSamples());
+        Assert.assertTrue(vc.isPolymorphicInSamples());
         Assert.assertEquals(vc.getGenotype("foo"), g);
-        Assert.assertEquals(vc.getChromosomeCount(), 2); // we know that there are 2 chromosomes, even though one isn't called
-        Assert.assertEquals(vc.getChromosomeCount(Aref), 0);
-        Assert.assertEquals(vc.getChromosomeCount(C), 1);
+        Assert.assertEquals(vc.getCalledChrCount(), 1); // we only have 1 called chromosome; the NO_CALL allele is excluded from the count
+        Assert.assertEquals(vc.getCalledChrCount(Aref), 0);
+        Assert.assertEquals(vc.getCalledChrCount(C), 1);
         Assert.assertFalse(vc.getGenotype("foo").isHet());
         Assert.assertFalse(vc.getGenotype("foo").isHom());
         Assert.assertFalse(vc.getGenotype("foo").isNoCall());
@@ -275,55 +285,71 @@ public void testCreatingPartiallyCalledGenotype() {
         Assert.assertEquals(vc.getGenotype("foo").getType(), Genotype.Type.MIXED);
     }

-    @Test (expectedExceptions = IllegalArgumentException.class)
+    @Test (expectedExceptions = Exception.class)
     public void testBadConstructorArgs1() {
-        new VariantContext("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, ATCref));
+        new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, ATCref)).make();
     }

-    @Test (expectedExceptions = IllegalArgumentException.class)
+    @Test (expectedExceptions = Exception.class)
     public void testBadConstructorArgs2() {
-        new VariantContext("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, del));
+        new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, del)).make();
     }

-    @Test (expectedExceptions = IllegalArgumentException.class)
+    @Test (expectedExceptions = Exception.class)
     public void testBadConstructorArgs3() {
-        new VariantContext("test", insLoc, insLocStart, insLocStop, Arrays.asList(del));
+        new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(del)).make();
     }

-    @Test (expectedExceptions = IllegalArgumentException.class)
+    @Test (expectedExceptions = Throwable.class)
     public void testBadConstructorArgs4() {
-        new VariantContext("test", insLoc, insLocStart, insLocStop, Collections.<Allele>emptyList());
+        new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Collections.<Allele>emptyList()).make();
     }

-    @Test (expectedExceptions = IllegalArgumentException.class)
+    @Test (expectedExceptions = Exception.class)
     public void testBadConstructorArgsDuplicateAlleles1() {
-        new VariantContext("test", insLoc, insLocStart, insLocStop, Arrays.asList(Aref, T, T));
+        new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(Aref, T, T)).make();
     }

-    @Test (expectedExceptions = IllegalArgumentException.class)
+    @Test (expectedExceptions = Exception.class)
     public void testBadConstructorArgsDuplicateAlleles2() {
-        new VariantContext("test", insLoc, insLocStart, insLocStop, Arrays.asList(Aref, A));
+        new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(Aref, A)).make();
     }

-    @Test (expectedExceptions = IllegalStateException.class)
+    @Test (expectedExceptions = Throwable.class)
     public void testBadLoc1() {
         List<Allele> alleles = Arrays.asList(Aref, T, del);
-        new VariantContext("test", delLoc, delLocStart, delLocStop, alleles);
+        new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, alleles).make();
+    }
+
+    @Test (expectedExceptions = Throwable.class)
+    public void testBadID1() {
+        new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, Arrays.asList(Aref,
T)).id(null).make(); + } + + @Test (expectedExceptions = Exception.class) + public void testBadID2() { + new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, Arrays.asList(Aref, T)).id("").make(); + } + + @Test (expectedExceptions = Throwable.class) + public void testBadPError() { + new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, ATCref)).log10PError(0.5).make(); } @Test public void testAccessingSimpleSNPGenotypes() { List alleles = Arrays.asList(Aref, T); - Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref), 10); - Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T), 10); - Genotype g3 = new Genotype("TT", Arrays.asList(T, T), 10); + Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref)); + Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T)); + Genotype g3 = new Genotype("TT", Arrays.asList(T, T)); - VariantContext vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, Arrays.asList(g1, g2, g3)); + VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, alleles) + .genotypes(g1, g2, g3).make(); Assert.assertTrue(vc.hasGenotypes()); - Assert.assertFalse(vc.isMonomorphic()); - Assert.assertTrue(vc.isPolymorphic()); + Assert.assertFalse(vc.isMonomorphicInSamples()); + Assert.assertTrue(vc.isPolymorphicInSamples()); Assert.assertEquals(vc.getSampleNames().size(), 3); Assert.assertEquals(vc.getGenotypes().size(), 3); @@ -342,36 +368,37 @@ public void testAccessingSimpleSNPGenotypes() { Assert.assertFalse(vc.hasGenotype("at")); Assert.assertFalse(vc.hasGenotype("tt")); - Assert.assertEquals(vc.getChromosomeCount(), 6); - Assert.assertEquals(vc.getChromosomeCount(Aref), 3); - Assert.assertEquals(vc.getChromosomeCount(T), 3); + Assert.assertEquals(vc.getCalledChrCount(), 6); + Assert.assertEquals(vc.getCalledChrCount(Aref), 3); + Assert.assertEquals(vc.getCalledChrCount(T), 3); } @Test public void testAccessingCompleteGenotypes() { List alleles = Arrays.asList(Aref, T, del); - Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref), 10); - Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T), 10); - Genotype g3 = new Genotype("TT", Arrays.asList(T, T), 10); - Genotype g4 = new Genotype("Td", Arrays.asList(T, del), 10); - Genotype g5 = new Genotype("dd", Arrays.asList(del, del), 10); - Genotype g6 = new Genotype("..", Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 10); + Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref)); + Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T)); + Genotype g3 = new Genotype("TT", Arrays.asList(T, T)); + Genotype g4 = new Genotype("Td", Arrays.asList(T, del)); + Genotype g5 = new Genotype("dd", Arrays.asList(del, del)); + Genotype g6 = new Genotype("..", Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); - VariantContext vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, Arrays.asList(g1, g2, g3, g4, g5, g6)); + VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, alleles) + .genotypes(g1, g2, g3, g4, g5, g6).make(); Assert.assertTrue(vc.hasGenotypes()); - Assert.assertFalse(vc.isMonomorphic()); - Assert.assertTrue(vc.isPolymorphic()); + Assert.assertFalse(vc.isMonomorphicInSamples()); + Assert.assertTrue(vc.isPolymorphicInSamples()); Assert.assertEquals(vc.getGenotypes().size(), 6); Assert.assertEquals(3, vc.getGenotypes(Arrays.asList("AA", "Td", "dd")).size()); - Assert.assertEquals(10, vc.getChromosomeCount()); - Assert.assertEquals(3, vc.getChromosomeCount(Aref)); - 
Assert.assertEquals(4, vc.getChromosomeCount(T)); - Assert.assertEquals(3, vc.getChromosomeCount(del)); - Assert.assertEquals(2, vc.getChromosomeCount(Allele.NO_CALL)); + Assert.assertEquals(10, vc.getCalledChrCount()); + Assert.assertEquals(3, vc.getCalledChrCount(Aref)); + Assert.assertEquals(4, vc.getCalledChrCount(T)); + Assert.assertEquals(3, vc.getCalledChrCount(del)); + Assert.assertEquals(2, vc.getCalledChrCount(Allele.NO_CALL)); } @Test @@ -380,76 +407,79 @@ public void testAccessingRefGenotypes() { List alleles2 = Arrays.asList(Aref); List alleles3 = Arrays.asList(Aref, T, del); for ( List alleles : Arrays.asList(alleles1, alleles2, alleles3)) { - Genotype g1 = new Genotype("AA1", Arrays.asList(Aref, Aref), 10); - Genotype g2 = new Genotype("AA2", Arrays.asList(Aref, Aref), 10); - Genotype g3 = new Genotype("..", Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 10); - VariantContext vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, Arrays.asList(g1, g2, g3)); + Genotype g1 = new Genotype("AA1", Arrays.asList(Aref, Aref)); + Genotype g2 = new Genotype("AA2", Arrays.asList(Aref, Aref)); + Genotype g3 = new Genotype("..", Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); + VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, alleles) + .genotypes(g1, g2, g3).make(); Assert.assertTrue(vc.hasGenotypes()); - Assert.assertTrue(vc.isMonomorphic()); - Assert.assertFalse(vc.isPolymorphic()); + Assert.assertTrue(vc.isMonomorphicInSamples()); + Assert.assertFalse(vc.isPolymorphicInSamples()); Assert.assertEquals(vc.getGenotypes().size(), 3); - Assert.assertEquals(4, vc.getChromosomeCount()); - Assert.assertEquals(4, vc.getChromosomeCount(Aref)); - Assert.assertEquals(0, vc.getChromosomeCount(T)); - Assert.assertEquals(2, vc.getChromosomeCount(Allele.NO_CALL)); + Assert.assertEquals(4, vc.getCalledChrCount()); + Assert.assertEquals(4, vc.getCalledChrCount(Aref)); + Assert.assertEquals(0, vc.getCalledChrCount(T)); + Assert.assertEquals(2, vc.getCalledChrCount(Allele.NO_CALL)); } } @Test public void testFilters() { List alleles = Arrays.asList(Aref, T, del); - Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref), 10); - Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T), 10); - MutableVariantContext vc = new MutableVariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, Arrays.asList(g1,g2)); + Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref)); + Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T)); + + VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1, g2).make(); Assert.assertTrue(vc.isNotFiltered()); Assert.assertFalse(vc.isFiltered()); Assert.assertEquals(0, vc.getFilters().size()); + Assert.assertFalse(vc.filtersWereApplied()); + Assert.assertNull(vc.getFiltersMaybeNull()); - vc.addFilter("BAD_SNP_BAD!"); + vc = new VariantContextBuilder(vc).filters("BAD_SNP_BAD!").make(); Assert.assertFalse(vc.isNotFiltered()); Assert.assertTrue(vc.isFiltered()); Assert.assertEquals(1, vc.getFilters().size()); + Assert.assertTrue(vc.filtersWereApplied()); + Assert.assertNotNull(vc.getFiltersMaybeNull()); - vc.addFilters(Arrays.asList("REALLY_BAD_SNP", "CHRIST_THIS_IS_TERRIBLE")); + Set filters = new HashSet(Arrays.asList("BAD_SNP_BAD!", "REALLY_BAD_SNP", "CHRIST_THIS_IS_TERRIBLE")); + vc = new VariantContextBuilder(vc).filters(filters).make(); Assert.assertFalse(vc.isNotFiltered()); Assert.assertTrue(vc.isFiltered()); Assert.assertEquals(3, 
vc.getFilters().size()); - - vc.clearFilters(); - - Assert.assertTrue(vc.isNotFiltered()); - Assert.assertFalse(vc.isFiltered()); - Assert.assertEquals(0, vc.getFilters().size()); + Assert.assertTrue(vc.filtersWereApplied()); + Assert.assertNotNull(vc.getFiltersMaybeNull()); } @Test - public void testVCromGenotypes() { + public void testVCFfromGenotypes() { List alleles = Arrays.asList(Aref, T, del); - Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref), 10); - Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T), 10); - Genotype g3 = new Genotype("TT", Arrays.asList(T, T), 10); - Genotype g4 = new Genotype("..", Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 10); - Genotype g5 = new Genotype("--", Arrays.asList(del, del), 10); - VariantContext vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop , alleles, Arrays.asList(g1,g2,g3,g4,g5)); - - VariantContext vc12 = vc.subContextFromGenotypes(Arrays.asList(g1,g2)); - VariantContext vc1 = vc.subContextFromGenotypes(Arrays.asList(g1)); - VariantContext vc23 = vc.subContextFromGenotypes(Arrays.asList(g2, g3)); - VariantContext vc4 = vc.subContextFromGenotypes(Arrays.asList(g4)); - VariantContext vc14 = vc.subContextFromGenotypes(Arrays.asList(g1, g4)); - VariantContext vc5 = vc.subContextFromGenotypes(Arrays.asList(g5)); - - Assert.assertTrue(vc12.isPolymorphic()); - Assert.assertTrue(vc23.isPolymorphic()); - Assert.assertTrue(vc1.isMonomorphic()); - Assert.assertTrue(vc4.isMonomorphic()); - Assert.assertTrue(vc14.isMonomorphic()); - Assert.assertTrue(vc5.isPolymorphic()); + Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref)); + Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T)); + Genotype g3 = new Genotype("TT", Arrays.asList(T, T)); + Genotype g4 = new Genotype("..", Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); + Genotype g5 = new Genotype("--", Arrays.asList(del, del)); + VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1,g2,g3,g4,g5).make(); + + VariantContext vc12 = vc.subContextFromSamples(new HashSet(Arrays.asList(g1.getSampleName(), g2.getSampleName()))); + VariantContext vc1 = vc.subContextFromSamples(new HashSet(Arrays.asList(g1.getSampleName()))); + VariantContext vc23 = vc.subContextFromSamples(new HashSet(Arrays.asList(g2.getSampleName(), g3.getSampleName()))); + VariantContext vc4 = vc.subContextFromSamples(new HashSet(Arrays.asList(g4.getSampleName()))); + VariantContext vc14 = vc.subContextFromSamples(new HashSet(Arrays.asList(g1.getSampleName(), g4.getSampleName()))); + VariantContext vc5 = vc.subContextFromSamples(new HashSet(Arrays.asList(g5.getSampleName()))); + + Assert.assertTrue(vc12.isPolymorphicInSamples()); + Assert.assertTrue(vc23.isPolymorphicInSamples()); + Assert.assertTrue(vc1.isMonomorphicInSamples()); + Assert.assertTrue(vc4.isMonomorphicInSamples()); + Assert.assertTrue(vc14.isMonomorphicInSamples()); + Assert.assertTrue(vc5.isPolymorphicInSamples()); Assert.assertTrue(vc12.isSNP()); Assert.assertTrue(vc12.isVariant()); @@ -476,12 +506,35 @@ public void testVCromGenotypes() { Assert.assertTrue(vc5.isVariant()); Assert.assertTrue(vc5.isBiallelic()); - Assert.assertEquals(3, vc12.getChromosomeCount(Aref)); - Assert.assertEquals(1, vc23.getChromosomeCount(Aref)); - Assert.assertEquals(2, vc1.getChromosomeCount(Aref)); - Assert.assertEquals(0, vc4.getChromosomeCount(Aref)); - Assert.assertEquals(2, vc14.getChromosomeCount(Aref)); - Assert.assertEquals(0, vc5.getChromosomeCount(Aref)); + Assert.assertEquals(3, 
vc12.getCalledChrCount(Aref)); + Assert.assertEquals(1, vc23.getCalledChrCount(Aref)); + Assert.assertEquals(2, vc1.getCalledChrCount(Aref)); + Assert.assertEquals(0, vc4.getCalledChrCount(Aref)); + Assert.assertEquals(2, vc14.getCalledChrCount(Aref)); + Assert.assertEquals(0, vc5.getCalledChrCount(Aref)); + } + + public void testGetGenotypeMethods() { + Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref)); + Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T)); + Genotype g3 = new Genotype("TT", Arrays.asList(T, T)); + GenotypesContext gc = GenotypesContext.create(g1, g2, g3); + VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T)).genotypes(gc).make(); + + Assert.assertEquals(vc.getGenotype("AA"), g1); + Assert.assertEquals(vc.getGenotype("AT"), g2); + Assert.assertEquals(vc.getGenotype("TT"), g3); + Assert.assertEquals(vc.getGenotype("CC"), null); + + Assert.assertEquals(vc.getGenotypes(), gc); + Assert.assertEquals(vc.getGenotypes(Arrays.asList("AA", "AT")), Arrays.asList(g1, g2)); + Assert.assertEquals(vc.getGenotypes(Arrays.asList("AA", "TT")), Arrays.asList(g1, g3)); + Assert.assertEquals(vc.getGenotypes(Arrays.asList("AA", "AT", "TT")), Arrays.asList(g1, g2, g3)); + Assert.assertEquals(vc.getGenotypes(Arrays.asList("AA", "AT", "CC")), Arrays.asList(g1, g2)); + + Assert.assertEquals(vc.getGenotype(0), g1); + Assert.assertEquals(vc.getGenotype(1), g2); + Assert.assertEquals(vc.getGenotype(2), g3); } // -------------------------------------------------------------------------------- @@ -520,7 +573,7 @@ public Object[][] mergeAllelesData() { @Test(dataProvider = "getAlleles") public void testMergeAlleles(GetAllelesTest cfg) { final List altAlleles = cfg.alleles.subList(1, cfg.alleles.size()); - final VariantContext vc = new VariantContext("test", snpLoc, snpLocStart, snpLocStop, cfg.alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); + final VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, cfg.alleles).referenceBaseForIndel((byte)'A').make(); Assert.assertEquals(vc.getAlleles(), cfg.alleles, "VC alleles not the same as input alleles"); Assert.assertEquals(vc.getNAlleles(), cfg.alleles.size(), "VC getNAlleles not the same as input alleles size"); @@ -550,4 +603,270 @@ public void testMergeAlleles(GetAllelesTest cfg) { Assert.assertFalse(vc.hasAllele(missingAllele)); Assert.assertFalse(vc.hasAllele(missingAllele, true)); } -} + + private class SitesAndGenotypesVC extends TestDataProvider { + VariantContext vc, copy; + + private SitesAndGenotypesVC(String name, VariantContext original) { + super(SitesAndGenotypesVC.class, name); + this.vc = original; + this.copy = new VariantContextBuilder(original).make(); + } + + public String toString() { + return String.format("%s input=%s", super.toString(), vc); + } + } + + @DataProvider(name = "SitesAndGenotypesVC") + public Object[][] MakeSitesAndGenotypesVCs() { + Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref)); + Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T)); + Genotype g3 = new Genotype("TT", Arrays.asList(T, T)); + + VariantContext sites = new VariantContextBuilder("sites", snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T)).make(); + VariantContext genotypes = new VariantContextBuilder(sites).source("genotypes").genotypes(g1, g2, g3).make(); + + new SitesAndGenotypesVC("sites", sites); + new SitesAndGenotypesVC("genotypes", genotypes); + + return 
SitesAndGenotypesVC.getTests(SitesAndGenotypesVC.class);
+    }
+
+    // --------------------------------------------------------------------------------
+    //
+    // Test modifying routines
+    //
+    // --------------------------------------------------------------------------------
+    @Test(dataProvider = "SitesAndGenotypesVC")
+    public void runModifyVCTests(SitesAndGenotypesVC cfg) {
+        VariantContext modified = new VariantContextBuilder(cfg.vc).loc("chr2", 123, 123).make();
+        Assert.assertEquals(modified.getChr(), "chr2");
+        Assert.assertEquals(modified.getStart(), 123);
+        Assert.assertEquals(modified.getEnd(), 123);
+
+        modified = new VariantContextBuilder(cfg.vc).id("newID").make();
+        Assert.assertEquals(modified.getID(), "newID");
+
+        Set<String> newFilters = Collections.singleton("newFilter");
+        modified = new VariantContextBuilder(cfg.vc).filters(newFilters).make();
+        Assert.assertEquals(modified.getFilters(), newFilters);
+
+        modified = new VariantContextBuilder(cfg.vc).attribute("AC", 1).make();
+        Assert.assertEquals(modified.getAttribute("AC"), 1);
+        modified = new VariantContextBuilder(modified).attribute("AC", 2).make();
+        Assert.assertEquals(modified.getAttribute("AC"), 2);
+        modified = new VariantContextBuilder(modified).attributes(null).make();
+        Assert.assertTrue(modified.getAttributes().isEmpty());
+
+        Genotype g1 = new Genotype("AA2", Arrays.asList(Aref, Aref));
+        Genotype g2 = new Genotype("AT2", Arrays.asList(Aref, T));
+        Genotype g3 = new Genotype("TT2", Arrays.asList(T, T));
+        GenotypesContext gc = GenotypesContext.create(g1,g2,g3);
+        modified = new VariantContextBuilder(cfg.vc).genotypes(gc).make();
+        Assert.assertEquals(modified.getGenotypes(), gc);
+        modified = new VariantContextBuilder(cfg.vc).noGenotypes().make();
+        Assert.assertTrue(modified.getGenotypes().isEmpty());
+
+        // test that original hasn't changed
+        Assert.assertEquals(cfg.vc.getChr(), cfg.copy.getChr());
+        Assert.assertEquals(cfg.vc.getStart(), cfg.copy.getStart());
+        Assert.assertEquals(cfg.vc.getEnd(), cfg.copy.getEnd());
+        Assert.assertEquals(cfg.vc.getAlleles(), cfg.copy.getAlleles());
+        Assert.assertEquals(cfg.vc.getAttributes(), cfg.copy.getAttributes());
+        Assert.assertEquals(cfg.vc.getID(), cfg.copy.getID());
+        Assert.assertEquals(cfg.vc.getGenotypes(), cfg.copy.getGenotypes());
+        Assert.assertEquals(cfg.vc.getLog10PError(), cfg.copy.getLog10PError());
+        Assert.assertEquals(cfg.vc.getFilters(), cfg.copy.getFilters());
+    }
+
+    // --------------------------------------------------------------------------------
+    //
+    // Test subcontext
+    //
+    // --------------------------------------------------------------------------------
+    private class SubContextTest extends TestDataProvider {
+        Set<String> samples;
+        boolean updateAlleles;
+
+        private SubContextTest(Collection<String> samples, boolean updateAlleles) {
+            super(SubContextTest.class);
+            this.samples = new HashSet<String>(samples);
+            this.updateAlleles = updateAlleles;
+        }
+
+        public String toString() {
+            return String.format("%s samples=%s updateAlleles=%b", super.toString(), samples, updateAlleles);
+        }
+    }
+
+    @DataProvider(name = "SubContextTest")
+    public Object[][] MakeSubContextTest() {
+        for ( boolean updateAlleles : Arrays.asList(true, false)) {
+            new SubContextTest(Collections.<String>emptySet(), updateAlleles);
+            new SubContextTest(Collections.singleton("MISSING"), updateAlleles);
+            new SubContextTest(Collections.singleton("AA"), updateAlleles);
+            new SubContextTest(Collections.singleton("AT"), updateAlleles);
+            new SubContextTest(Collections.singleton("TT"), updateAlleles);
+            new SubContextTest(Arrays.asList("AA", "AT"), updateAlleles);
+            new SubContextTest(Arrays.asList("AA", "AT", "TT"), updateAlleles);
+            new SubContextTest(Arrays.asList("AA", "AT", "MISSING"), updateAlleles);
+            new SubContextTest(Arrays.asList("AA", "AT", "TT", "MISSING"), updateAlleles);
+        }
+
+        return SubContextTest.getTests(SubContextTest.class);
+    }
+
+    @Test(dataProvider = "SubContextTest")
+    public void runSubContextTest(SubContextTest cfg) {
+        Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref));
+        Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T));
+        Genotype g3 = new Genotype("TT", Arrays.asList(T, T));
+
+        GenotypesContext gc = GenotypesContext.create(g1, g2, g3);
+        VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T)).genotypes(gc).make();
+        VariantContext sub = cfg.updateAlleles ? vc.subContextFromSamples(cfg.samples) : vc.subContextFromSamples(cfg.samples, vc.getAlleles());
+
+        // unchanged attributes should be the same
+        Assert.assertEquals(sub.getChr(), vc.getChr());
+        Assert.assertEquals(sub.getStart(), vc.getStart());
+        Assert.assertEquals(sub.getEnd(), vc.getEnd());
+        Assert.assertEquals(sub.getLog10PError(), vc.getLog10PError());
+        Assert.assertEquals(sub.getFilters(), vc.getFilters());
+        Assert.assertEquals(sub.getID(), vc.getID());
+        Assert.assertEquals(sub.getReferenceBaseForIndel(), vc.getReferenceBaseForIndel());
+        Assert.assertEquals(sub.getAttributes(), vc.getAttributes());
+
+        Set<Genotype> expectedGenotypes = new HashSet<Genotype>();
+        if ( cfg.samples.contains(g1.getSampleName()) ) expectedGenotypes.add(g1);
+        if ( cfg.samples.contains(g2.getSampleName()) ) expectedGenotypes.add(g2);
+        if ( cfg.samples.contains(g3.getSampleName()) ) expectedGenotypes.add(g3);
+        GenotypesContext expectedGC = GenotypesContext.copy(expectedGenotypes);
+
+        // these values depend on the results of sub
+        if ( cfg.updateAlleles ) {
+            // do the work to see what alleles should be here, and which not
+            Set<Allele> alleles = new HashSet<Allele>();
+            for ( final Genotype g : expectedGC ) alleles.addAll(g.getAlleles());
+            if ( ! alleles.contains(Aref) ) alleles.add(Aref); // always have the reference
+            Assert.assertEquals(new HashSet<Allele>(sub.getAlleles()), alleles);
+        } else {
+            // not updating alleles -- should be the same
+            Assert.assertEquals(sub.getAlleles(), vc.getAlleles());
+        }
+
+        // same sample names => success
+        Assert.assertEquals(sub.getGenotypes().getSampleNames(), expectedGC.getSampleNames());
+    }
+
+    // --------------------------------------------------------------------------------
+    //
+    // Test sample name functions
+    //
+    // --------------------------------------------------------------------------------
+    private class SampleNamesTest extends TestDataProvider {
+        List<String> sampleNames;
+        List<String> sampleNamesInOrder;
+
+        private SampleNamesTest(List<String> sampleNames, List<String> sampleNamesInOrder) {
+            super(SampleNamesTest.class);
+            this.sampleNamesInOrder = sampleNamesInOrder;
+            this.sampleNames = sampleNames;
+        }
+
+        public String toString() {
+            return String.format("%s samples=%s order=%s", super.toString(), sampleNames, sampleNamesInOrder);
+        }
+    }
+
+    @DataProvider(name = "SampleNamesTest")
+    public Object[][] MakeSampleNamesTest() {
+        new SampleNamesTest(Arrays.asList("1"), Arrays.asList("1"));
+        new SampleNamesTest(Arrays.asList("2", "1"), Arrays.asList("1", "2"));
+        new SampleNamesTest(Arrays.asList("1", "2"), Arrays.asList("1", "2"));
+        new SampleNamesTest(Arrays.asList("1", "2", "3"), Arrays.asList("1", "2", "3"));
+        new SampleNamesTest(Arrays.asList("2", "1", "3"), Arrays.asList("1", "2", "3"));
+        new SampleNamesTest(Arrays.asList("2", "3", "1"), Arrays.asList("1", "2", "3"));
+        new SampleNamesTest(Arrays.asList("3", "1", "2"), Arrays.asList("1", "2", "3"));
+        new SampleNamesTest(Arrays.asList("3", "2", "1"), Arrays.asList("1", "2", "3"));
+        new SampleNamesTest(Arrays.asList("NA2", "NA1"), Arrays.asList("NA1", "NA2"));
+        return SampleNamesTest.getTests(SampleNamesTest.class);
+    }
+
+    private final static void assertGenotypesAreInOrder(Iterable<Genotype> gIt, List<String> names) {
+        int i = 0;
+        for ( final Genotype g : gIt ) {
+            Assert.assertEquals(g.getSampleName(), names.get(i), "Unexpected genotype ordering");
+            i++;
+        }
+    }
+
+
+    @Test(dataProvider = "SampleNamesTest")
+    public void runSampleNamesTest(SampleNamesTest cfg) {
+        GenotypesContext gc = GenotypesContext.create(cfg.sampleNames.size());
+        for ( final String name : cfg.sampleNames ) {
+            gc.add(new Genotype(name, Arrays.asList(Aref, T)));
+        }
+
+        VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T)).genotypes(gc).make();
+
+        // same sample names => success
+        Assert.assertEquals(vc.getSampleNames(), new HashSet<String>(cfg.sampleNames), "vc.getSampleNames() = " + vc.getSampleNames());
+        Assert.assertEquals(vc.getSampleNamesOrderedByName(), cfg.sampleNamesInOrder, "vc.getSampleNamesOrderedByName() = " + vc.getSampleNamesOrderedByName());
+
+        assertGenotypesAreInOrder(vc.getGenotypesOrderedByName(), cfg.sampleNamesInOrder);
+        assertGenotypesAreInOrder(vc.getGenotypesOrderedBy(cfg.sampleNames), cfg.sampleNames);
+    }
+
+    @Test
+    public void testGenotypeCounting() {
+        Genotype noCall = new Genotype("nocall", Arrays.asList(Allele.NO_CALL));
+        Genotype mixed = new Genotype("mixed", Arrays.asList(Aref, Allele.NO_CALL));
+        Genotype homRef = new Genotype("homRef", Arrays.asList(Aref, Aref));
+        Genotype het = new Genotype("het", Arrays.asList(Aref, T));
+        Genotype homVar = new Genotype("homVar", Arrays.asList(T, T));
+
+        List<Genotype> allGenotypes = Arrays.asList(noCall, mixed, homRef, het, homVar);
+        final int nCycles =
allGenotypes.size() * 10; + + for ( int i = 0; i < nCycles; i++ ) { + int nNoCall = 0, nNoCallAlleles = 0, nA = 0, nT = 0, nMixed = 0, nHomRef = 0, nHet = 0, nHomVar = 0; + int nSamples = 0; + GenotypesContext gc = GenotypesContext.create(); + for ( int j = 0; j < i; j++ ) { + nSamples++; + Genotype g = allGenotypes.get(j % allGenotypes.size()); + final String name = String.format("%s_%d%d", g.getSampleName(), i, j); + gc.add(new Genotype(name, g.getAlleles())); + switch ( g.getType() ) { + case NO_CALL: nNoCall++; nNoCallAlleles++; break; + case HOM_REF: nA += 2; nHomRef++; break; + case HET: nA++; nT++; nHet++; break; + case HOM_VAR: nT += 2; nHomVar++; break; + case MIXED: nA++; nNoCallAlleles++; nMixed++; break; + default: throw new RuntimeException("Unexpected genotype type " + g.getType()); + } + + } + + VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T)).genotypes(gc).make(); + Assert.assertEquals(vc.getNSamples(), nSamples); + if ( nSamples > 0 ) { + Assert.assertEquals(vc.isPolymorphicInSamples(), nT > 0); + Assert.assertEquals(vc.isMonomorphicInSamples(), nT == 0); + } + Assert.assertEquals(vc.getCalledChrCount(), nA + nT); + + Assert.assertEquals(vc.getCalledChrCount(Allele.NO_CALL), nNoCallAlleles); + Assert.assertEquals(vc.getCalledChrCount(Aref), nA); + Assert.assertEquals(vc.getCalledChrCount(T), nT); + + Assert.assertEquals(vc.getNoCallCount(), nNoCall); + Assert.assertEquals(vc.getHomRefCount(), nHomRef); + Assert.assertEquals(vc.getHetCount(), nHet); + Assert.assertEquals(vc.getHomVarCount(), nHomVar); + Assert.assertEquals(vc.getMixedCount(), nMixed); + } + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java index 845d9c216d..ccf560f831 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java @@ -26,6 +26,7 @@ import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.testng.Assert; @@ -98,9 +99,7 @@ private VariantContext makeVC(String source, List alleles, Collection alleles, Collection genotypes, Set filters) { int start = 10; int stop = start; // alleles.contains(ATC) ? start + 3 : start; - return new VariantContext(source, "1", start, stop, alleles, - genotypes == null ? 
null : VariantContext.genotypeCollectionToMap(new TreeMap(), genotypes), - 1.0, filters, null, Cref.getBases()[0]); + return new VariantContextBuilder(source, "1", start, stop, alleles).genotypes(genotypes).filters(filters).referenceBaseForIndel(Cref.getBases()[0]).make(); } // -------------------------------------------------------------------------------- @@ -246,20 +245,18 @@ public Object[][] createSimpleMergeRSIDData() { @Test(dataProvider = "simplemergersiddata") public void testRSIDMerge(SimpleMergeRSIDTest cfg) { - final VariantContext snpVC1 = makeVC("snpvc1", Arrays.asList(Aref, T)); + VariantContext snpVC1 = makeVC("snpvc1", Arrays.asList(Aref, T)); final List inputs = new ArrayList(); for ( final String id : cfg.inputs ) { - MutableVariantContext vc = new MutableVariantContext(snpVC1); - if ( ! id.equals(".") ) vc.setID(id); - inputs.add(vc); + inputs.add(new VariantContextBuilder(snpVC1).id(id).make()); } final VariantContext merged = VariantContextUtils.simpleMerge(genomeLocParser, inputs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false); - Assert.assertEquals(merged.getID(), cfg.expected.equals(".") ? null : cfg.expected); + Assert.assertEquals(merged.getID(), cfg.expected); } // -------------------------------------------------------------------------------- @@ -412,44 +409,44 @@ public String toString() { @DataProvider(name = "mergeGenotypes") public Object[][] mergeGenotypesData() { new MergeGenotypesTest("TakeGenotypeByPriority-1,2", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1))); + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1))); new MergeGenotypesTest("TakeGenotypeByPriority-1,2-nocall", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, 1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, 1))); + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1))); new MergeGenotypesTest("TakeGenotypeByPriority-2,1", "2,1", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2))); + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2))); new MergeGenotypesTest("NonOverlappingGenotypes", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s2", Aref, T, 2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1), makeG("s2", Aref, T, 2))); + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s2", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1), makeG("s2", Aref, T, -2))); new MergeGenotypesTest("PreserveNoCall", "1,2", - makeVC("1", 
Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, 1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s2", Aref, T, 2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, 1), makeG("s2", Aref, T, 2))); + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s2", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1), makeG("s2", Aref, T, -2))); new MergeGenotypesTest("PerserveAlleles", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1)), - makeVC("2", Arrays.asList(Aref, C), makeG("s2", Aref, C, 2)), - makeVC("3", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, 1), makeG("s2", Aref, C, 2))); + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, C), makeG("s2", Aref, C, -2)), + makeVC("3", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1), makeG("s2", Aref, C, -2))); new MergeGenotypesTest("TakeGenotypePartialOverlap-1,2", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2), makeG("s3", Aref, T, 3)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1), makeG("s3", Aref, T, 3))); + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1), makeG("s3", Aref, T, -3))); new MergeGenotypesTest("TakeGenotypePartialOverlap-2,1", "2,1", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2), makeG("s3", Aref, T, 3)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2), makeG("s3", Aref, T, 3))); + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3))); // // merging genothpes with PLs @@ -457,41 +454,41 @@ public Object[][] mergeGenotypesData() { // first, do no harm new MergeGenotypesTest("OrderedPLs", "1", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1, 1, 2, 3)), - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1, 1, 2, 3))); + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1, 1, 2, 3)), + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1, 1, 2, 3))); // first, do no harm new MergeGenotypesTest("OrderedPLs-3Alleles", "1", - makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6)), - makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6))); + makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6))); // first, do no harm new MergeGenotypesTest("OrderedPLs-3Alleles-2", "1", - makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6)), - makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6))); + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6))); // first, do no harm new MergeGenotypesTest("OrderedPLs-3Alleles-2", "1", - makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, 1, 1, 
2, 3, 4, 5, 6)), - makeVC("1", Arrays.asList(Aref, T, C), makeG("s2", Aref, C, 1, 1, 2, 3, 4, 5, 6)), - makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, 1, 1, 2, 3, 4, 5, 6), makeG("s2", Aref, C, 1, 1, 2, 3, 4, 5, 6))); + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, T, C), makeG("s2", Aref, C, -1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6), makeG("s2", Aref, C, -1, 1, 2, 3, 4, 5, 6))); new MergeGenotypesTest("TakeGenotypePartialOverlapWithPLs-2,1", "2,1", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1,5,0,3)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2,4,0,2), makeG("s3", Aref, T, 3,3,0,2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2,4,0,2), makeG("s3", Aref, T, 3,3,0,2))); + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1,5,0,3)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2))); new MergeGenotypesTest("TakeGenotypePartialOverlapWithPLs-1,2", "1,2", - makeVC("1", Arrays.asList(Aref,ATC), makeG("s1", Aref, ATC, 1,5,0,3)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2,4,0,2), makeG("s3", Aref, T, 3,3,0,2)), + makeVC("1", Arrays.asList(Aref,ATC), makeG("s1", Aref, ATC, -1,5,0,3)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2)), // no likelihoods on result since type changes to mixed multiallelic - makeVC("3", Arrays.asList(Aref, ATC, T), makeG("s1", Aref, ATC, 1), makeG("s3", Aref, T, 3))); + makeVC("3", Arrays.asList(Aref, ATC, T), makeG("s1", Aref, ATC, -1), makeG("s3", Aref, T, -3))); new MergeGenotypesTest("MultipleSamplePLsDifferentOrder", "1,2", - makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, 1, 1, 2, 3, 4, 5, 6)), - makeVC("2", Arrays.asList(Aref, T, C), makeG("s2", Aref, T, 2, 6, 5, 4, 3, 2, 1)), + makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, -1, 1, 2, 3, 4, 5, 6)), + makeVC("2", Arrays.asList(Aref, T, C), makeG("s2", Aref, T, -2, 6, 5, 4, 3, 2, 1)), // no likelihoods on result since type changes to mixed multiallelic - makeVC("3", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, 1), makeG("s2", Aref, T, 2))); + makeVC("3", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, -1), makeG("s2", Aref, T, -2))); return MergeGenotypesTest.getTests(MergeGenotypesTest.class); } @@ -510,7 +507,7 @@ public void testMergeGenotypes(MergeGenotypesTest cfg) { } // necessary to not overload equals for genotypes - private void assertGenotypesAreMostlyEqual(Map actual, Map expected) { + private void assertGenotypesAreMostlyEqual(GenotypesContext actual, GenotypesContext expected) { if (actual == expected) { return; } @@ -523,13 +520,11 @@ private void assertGenotypesAreMostlyEqual(Map actual, Map actual, Map(Arrays.asList("s1.1", "s1.2"))); + Assert.assertEquals(merged.getSampleNames(), new HashSet(Arrays.asList("s1.1", "s1.2"))); } @Test(expectedExceptions = UserException.class) public void testMergeGenotypesRequireUnique() { - final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1)); - final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2)); + final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)); + final VariantContext vc2 = makeVC("2", 
Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)); final VariantContext merged = VariantContextUtils.simpleMerge(genomeLocParser, Arrays.asList(vc1, vc2), null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContextUnitTest.java index b5f6b1b1ac..6f5756bdc1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContextUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContextUnitTest.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.utils.variantcontext; import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.testng.Assert; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLoc; @@ -143,7 +144,7 @@ public void testClear() { private JEXLMap getVarContext() { List alleles = Arrays.asList(Aref, T); - VariantContext vc = new VariantContext("test", snpLoc.getContig(), snpLoc.getStart(), snpLoc.getStop(), alleles); + VariantContext vc = new VariantContextBuilder("test", snpLoc.getContig(), snpLoc.getStart(), snpLoc.getStop(), alleles).make(); return new JEXLMap(Arrays.asList(exp),vc); } diff --git a/public/packages/PicardPrivate.xml b/public/packages/PicardPrivate.xml index 581c479792..a800294d66 100644 --- a/public/packages/PicardPrivate.xml +++ b/public/packages/PicardPrivate.xml @@ -12,15 +12,12 @@ - - - diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index ccbe648d64..621afe8170 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -83,6 +83,10 @@ class DataProcessingPipeline extends QScript { @Input(doc="Define the default platform for Count Covariates -- useful for techdev purposes only.", fullName="default_platform", shortName="dp", required=false) var defaultPlatform: String = "" + @Hidden + @Input(doc="Run the pipeline in test mode only", fullName = "test_mode", shortName = "test", required=false) + var testMode: Boolean = false + /**************************************************************************** * Global Variables @@ -335,6 +339,7 @@ class DataProcessingPipeline extends QScript { this.known ++= qscript.indels this.consensusDeterminationModel = cleanModelEnum this.compress = 0 + this.noPGTag = qscript.testMode; this.scatterCount = nContigs this.analysisName = queueLogDir + outBam + ".clean" this.jobName = queueLogDir + outBam + ".clean" @@ -360,6 +365,7 @@ class DataProcessingPipeline extends QScript { this.out = outBam if (!qscript.intervalString.isEmpty()) this.intervalsString ++= List(qscript.intervalString) else if (qscript.intervals != null) this.intervals :+= qscript.intervals + this.no_pg_tag = qscript.testMode this.scatterCount = nContigs this.isIntermediate = false this.analysisName = queueLogDir + outBam + ".recalibration" diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 036a77b580..8c9063c293 100755 --- 
a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -314,7 +314,7 @@ class GATKResourcesBundle extends QScript { class MakeDBSNP129(@Input dbsnp: File, @Input ref: File, @Output dbsnp129: File) extends SelectVariants with UNIVERSAL_GATK_ARGS { this.variant = dbsnp - this.select ++= List("\"dbSNPBuildID <= 129\"") + this.select ++= List("dbSNPBuildID <= 129") this.reference_sequence = ref this.out = dbsnp129 } diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala index da02c8ac5e..c06601a2d2 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala @@ -248,10 +248,10 @@ class MethodsDevelopmentCallingPipeline extends QScript { this.V = t.rawIndelVCF this.out = t.filteredIndelVCF this.filterName ++= List("IndelQD", "IndelReadPosRankSum", "IndelFS") - this.filterExpression ++= List("\"QD < 2.0\"", "\"ReadPosRankSum < -20.0\"", "\"FS > 200.0\"") + this.filterExpression ++= List("QD < 2.0", "ReadPosRankSum < -20.0", "FS > 200.0") if (t.nSamples >= 10) { this.filterName ++= List("IndelInbreedingCoeff") - this.filterExpression ++= List("\"InbreedingCoeff < -0.8\"") + this.filterExpression ++= List("InbreedingCoeff < -0.8") } this.analysisName = t.name + "_VF" this.jobName = queueLogDir + t.name + ".indelfilter" diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala index 1d3fb26229..4896eaed3c 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala @@ -47,6 +47,10 @@ class PacbioProcessingPipeline extends QScript { @Input(shortName="bwastring", required=false) var bwastring: String = "" + @Hidden + @Input(shortName = "test", fullName = "test_mode", required = false) + var testMode: Boolean = false + val queueLogDir: String = ".qlog/" def script = { @@ -170,6 +174,7 @@ class PacbioProcessingPipeline extends QScript { this.input_file :+= inBam this.recal_file = inRecalFile this.out = outBam + this.no_pg_tag = testMode this.isIntermediate = false this.analysisName = queueLogDir + outBam + ".recalibration" this.jobName = queueLogDir + outBam + ".recalibration" diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountLoci.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountLoci.scala index 4ca3cbb894..1493760189 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountLoci.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountLoci.scala @@ -19,7 +19,7 @@ class ExampleCountLoci extends QScript { @Output var out: File = _ - def script = { + def script() { val countLoci = new CountLoci countLoci.reference_sequence = referenceFile countLoci.input_file = bamFiles diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountReads.scala 
b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountReads.scala index 9fdd1ba4c1..7f9d3f87a2 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountReads.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountReads.scala @@ -24,7 +24,7 @@ class ExampleCountReads extends QScript { /** * In script, you create and then add() functions to the pipeline. */ - def script = { + def script() { // Run CountReads for all bams jointly. @@ -41,6 +41,9 @@ class ExampleCountReads extends QScript { // matches the full form of the argument, but will actually be a scala List[] jointCountReads.input_file = bamFiles + // Set the memory limit. Also acts as a memory request on LSF and GridEngine. + jointCountReads.memoryLimit = 1 + // Add the newly created function to the pipeline. add(jointCountReads) @@ -51,6 +54,7 @@ class ExampleCountReads extends QScript { singleCountReads.reference_sequence = referenceFile // ':+' is the scala List append operator singleCountReads.input_file :+= bamFile + singleCountReads.memoryLimit = 1 add(singleCountReads) } } diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCustomWalker.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCustomWalker.scala index d3796d350e..d30668c193 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCustomWalker.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCustomWalker.scala @@ -24,7 +24,7 @@ class ExampleCustomWalker extends QScript { /** * In script, you create and then add() functions to the pipeline. */ - def script = { + def script() { val customWalker = new CommandLineGATK { // Set the name of your walker, for example this will be passed as -T MyCustomWalker this.analysis_type = "MyCustomWalker" diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala index 9bddfd97c2..8cb86db0b2 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala @@ -33,7 +33,6 @@ class ExampleUnifiedGenotyper extends QScript { @Argument(doc="An optional list of filter expressions.", shortName="filterExpression", required=false) var filterExpressions: List[String] = Nil - // This trait allows us set the variables below in one place, // and then reuse this trait on each CommandLineGATK function below. 
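Since several scripts in this patch lean on the trait-mixin idiom described in the comment above, a minimal standalone sketch may be useful. The sketch is illustrative only: SharedArgs and referenceFile are hypothetical names, while reference_sequence, memoryLimit, and add() are the real Queue/GATK members used elsewhere in this patch.

    // Inside a QScript: a trait carries the settings shared by every GATK function...
    trait SharedArgs extends CommandLineGATK {
      this.reference_sequence = qscript.referenceFile // hypothetical @Input on the enclosing QScript
      this.memoryLimit = 2                            // gigabytes; doubles as the LSF/GridEngine memory request
    }

    // ...and script() mixes it into each concrete function before add()ing it.
    val genotyper = new UnifiedGenotyper with SharedArgs
    val evaluator = new VariantEval with SharedArgs
    add(genotyper, evaluator)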
trait UnifiedGenotyperArguments extends CommandLineGATK { @@ -62,7 +61,7 @@ class ExampleUnifiedGenotyper extends QScript { variantFilter.variant = genotyper.out variantFilter.out = swapExt(qscript.bamFile, "bam", "filtered.vcf") variantFilter.filterName = filterNames - variantFilter.filterExpression = filterExpressions.map("\"" + _ + "\"") + variantFilter.filterExpression = filterExpressions evalFiltered.eval :+= variantFilter.out evalFiltered.out = swapExt(variantFilter.out, "vcf", "eval") diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index e8091cde7d..32913deb47 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -26,7 +26,6 @@ package org.broadinstitute.sting.queue import function.QFunction import java.io.File -import java.util.Arrays import org.broadinstitute.sting.commandline._ import org.broadinstitute.sting.queue.util._ import org.broadinstitute.sting.queue.engine.{QGraphSettings, QGraph} @@ -34,6 +33,9 @@ import collection.JavaConversions._ import org.broadinstitute.sting.utils.classloader.PluginManager import org.broadinstitute.sting.utils.exceptions.UserException import org.broadinstitute.sting.utils.io.IOUtils +import org.broadinstitute.sting.utils.help.ApplicationDetails +import java.util.{ResourceBundle, Arrays} +import org.broadinstitute.sting.utils.text.TextFormattingUtils /** * Entry point of Queue. Compiles and runs QScripts passed in to the command line. @@ -87,7 +89,7 @@ class QCommandLine extends CommandLineProgram with Logging { private var shuttingDown = false private lazy val pluginManager = { - qScriptClasses = IOUtils.tempDir("Q-Classes", "", settings.qSettings.tempDirectory) + qScriptClasses = IOUtils.tempDir("Q-Classes-", "", settings.qSettings.tempDirectory) qScriptManager.loadScripts(scripts, qScriptClasses) new PluginManager[QScript](classOf[QScript], List(qScriptClasses.toURI.toURL)) } @@ -129,9 +131,11 @@ class QCommandLine extends CommandLineProgram with Logging { logger.info("Writing JobLogging GATKReport to file " + reportFile) QJobReport.printReport(qGraph.getFunctionsAndStatus(script.functions), reportFile) - val pdfFile = new File(jobStringName + ".pdf") - logger.info("Plotting JobLogging GATKReport to file " + pdfFile) - QJobReport.plotReport(reportFile, pdfFile) + if ( settings.run ) { + val pdfFile = new File(jobStringName + ".pdf") + logger.info("Plotting JobLogging GATKReport to file " + pdfFile) + QJobReport.plotReport(reportFile, pdfFile) + } } } } @@ -173,6 +177,42 @@ class QCommandLine extends CommandLineProgram with Logging { override def getArgumentTypeDescriptors = Arrays.asList(new ScalaCompoundArgumentTypeDescriptor) + override def getApplicationDetails : ApplicationDetails = { + new ApplicationDetails(createQueueHeader(), + List.empty[String], + ApplicationDetails.createDefaultRunningInstructions(getClass.asInstanceOf[Class[CommandLineProgram]]), + "") + } + + private def createQueueHeader() : List[String] = { + List(String.format("Queue v%s, Compiled %s", getQueueVersion, getBuildTimestamp), + "Copyright (c) 2011 The Broad Institute", + "Please view our documentation at http://www.broadinstitute.org/gsa/wiki", + "For support, please view our support site at http://getsatisfaction.com/gsa") + } + + private def getQueueVersion : String = { + var stingResources : ResourceBundle = TextFormattingUtils.loadResourceBundle("StingText") + + 
+    if ( stingResources.containsKey("org.broadinstitute.sting.queue.QueueVersion.version") ) {
+      stingResources.getString("org.broadinstitute.sting.queue.QueueVersion.version")
+    }
+    else {
+      ""
+    }
+  }
+
+  private def getBuildTimestamp : String = {
+    var stingResources : ResourceBundle = TextFormattingUtils.loadResourceBundle("StingText")
+
+    if ( stingResources.containsKey("build.timestamp") ) {
+      stingResources.getString("build.timestamp")
+    }
+    else {
+      ""
+    }
+  }
+
   def shutdown() = {
     shuttingDown = true
     qGraph.shutdown()
diff --git a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala
index 648f9ffef5..e8ac26a574 100644
--- a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala
+++ b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala
@@ -62,6 +62,13 @@ class QSettings {
   @Argument(fullName="resident_memory_request", shortName="resMemReq", doc="Default resident memory request for jobs, in gigabytes.", required=false)
   var residentRequest: Option[Double] = None
 
+  /** The name of the parallel environment (required for SGE, for example) */
+  @Argument(fullName="job_parallel_env", shortName="jobParaEnv", doc="An SGE style parallel environment to use for jobs requesting more than 1 core. Equivalent to submitting jobs with -pe ARG nt for jobs with nt > 1", required=false)
+  var parallelEnvironmentName: String = "smp_pe" // Broad default
+
+  @Argument(fullName="dontRequestMultipleCores", shortName="multiCoreJerk", doc="If provided, Queue will not request multiple processors for jobs using multiple processors. Sometimes you eat the bear, sometimes the bear eats you.", required=false)
+  var dontRequestMultipleCores: Boolean = false
+
   @Argument(fullName="run_directory", shortName="runDir", doc="Root directory to run functions from.", required=false)
   var runDirectory = new File(".")
diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala
index 2272619120..2aae2fc6bf 100644
--- a/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala
+++ b/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala
@@ -28,8 +28,8 @@ import org.broadinstitute.sting.queue.QException
 import org.broadinstitute.sting.queue.util.{Logging,Retry}
 import org.broadinstitute.sting.queue.function.CommandLineFunction
 import org.broadinstitute.sting.queue.engine.{RunnerStatus, CommandLineJobRunner}
-import java.util.Collections
 import org.ggf.drmaa._
+import java.util.{Date, Collections}
 
 /**
  * Runs jobs using DRMAA.
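The hunk below is what populates Queue's job-report fields (start time, end time, execution host) from DRMAA. DRMAA hands resource usage back as a map of string-encoded numbers, and the * 1000 scaling in the patch implies start_time and end_time arrive as seconds since the epoch. A standalone sketch of just that conversion (the method name and sample value are hypothetical):

    import java.util.Date

    // Convert one DRMAA resource-usage entry, e.g. "1324400000.0000",
    // from string-encoded epoch seconds to a java.util.Date in milliseconds.
    def drmaaTimeToDate(raw: AnyRef): Date =
      if (raw != null) new Date(raw.toString.toDouble.toLong * 1000) else null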
@@ -103,6 +103,18 @@ class DrmaaJobRunner(val session: Session, val function: CommandLineFunction) ex case Session.QUEUED_ACTIVE => returnStatus = RunnerStatus.RUNNING case Session.DONE => val jobInfo: JobInfo = session.wait(jobId, Session.TIMEOUT_NO_WAIT) + + // Update jobInfo + def convertDRMAATime(key: String): Date = { + val v = jobInfo.getResourceUsage.get(key) + if ( v != null ) new Date(v.toString.toDouble.toLong * 1000) else null; + } + if ( jobInfo.getResourceUsage != null ) { + getRunInfo.startTime = convertDRMAATime("start_time") + getRunInfo.doneTime = convertDRMAATime("end_time") + getRunInfo.exechosts = "unknown" + } + if ((jobInfo.hasExited && jobInfo.getExitStatus != 0) || jobInfo.hasSignaled || jobInfo.wasAborted) diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala index 96e3ffd950..fca92a7a17 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala @@ -52,13 +52,28 @@ class GridEngineJobRunner(session: Session, function: CommandLineFunction) exten nativeSpec += " -q " + function.jobQueue // If the resident set size is requested pass on the memory request - if (function.residentRequest.isDefined) - nativeSpec += " -l mem_free=%dM".format(function.residentRequest.map(_ * 1024).get.ceil.toInt) + // NOTE: 12/20/11: depristo commented this out because mem_free isn't + // such a standard feature in SGE (gsa-engineering queue doesn't support it) + // requiring it can make SGE not so usable. It's dangerous to not enforce + // that we have enough memory to run our jobs, but I'd rather be dangerous + // than not be able to run my jobs at all. 
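To make the net behavior of this GridEngine change concrete, here is a condensed sketch of how the surviving pieces compose into a native spec. All values are hypothetical; "smp_pe" is simply the Broad default parallelEnvironmentName introduced in QSettings above.

    val residentLimitGb: Option[Double] = Some(6.0) // stands in for function.residentLimit
    val nCores: Option[Int]             = Some(4)   // stands in for function.nCoresRequest
    var nativeSpec = ""
    // mem_free is gone (see the note above); only the hard limit h_rss remains
    residentLimitGb.foreach(gb => nativeSpec += " -l h_rss=%dM".format((gb * 1024).ceil.toInt))
    // multicore jobs ask SGE for slots in a parallel environment
    if (nCores.getOrElse(1) > 1)
      nativeSpec += " -pe %s %d".format("smp_pe", nCores.get)
    // nativeSpec is now " -l h_rss=6144M -pe smp_pe 4"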
+// if (function.residentRequest.isDefined) +// nativeSpec += " -l mem_free=%dM".format(function.residentRequest.map(_ * 1024).get.ceil.toInt) // If the resident set size limit is defined specify the memory limit if (function.residentLimit.isDefined) nativeSpec += " -l h_rss=%dM".format(function.residentLimit.map(_ * 1024).get.ceil.toInt) + // If more than 1 core is requested, set the proper request + // if we aren't being jerks and just stealing cores (previous behavior) + if ( function.nCoresRequest.getOrElse(1) > 1 ) { + if ( function.qSettings.dontRequestMultipleCores ) + logger.warn("Sending multicore job %s to farm without requesting appropriate number of cores (%d)".format( + function.jobName, function.nCoresRequest.get)) + else + nativeSpec += " -pe %s %d".format(function.qSettings.parallelEnvironmentName, function.nCoresRequest.get) + } + // Pass on any job resource requests nativeSpec += function.jobResourceRequests.map(" -l " + _).mkString @@ -70,6 +85,7 @@ class GridEngineJobRunner(session: Session, function: CommandLineFunction) exten if (priority.isDefined) nativeSpec += " -p " + priority.get + logger.debug("Native spec is: %s".format(nativeSpec)) (nativeSpec + " " + super.functionNativeSpec).trim() } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala index 323cc63ffe..5ef78500c8 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala @@ -56,6 +56,7 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR private val selectString = new StringBuffer() private val usageString = new StringBuffer() private val requestString = new StringBuffer() + private val spanString = new StringBuffer() /** * Dispatches the function on the LSF cluster. @@ -100,6 +101,23 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR appendRequest("rusage", usageString, ",", "mem=%d".format(memInUnits)) } + // + // Request multiple cores on the same host. 
If nCoresRequest > 1, and we + // aren't being jerks and stealing cores, set numProcessors and maxNumProcessors + // and the span[host=1] parameters to get us exactly the right number of + // cores on a single host + // + if ( function.nCoresRequest.getOrElse(1) > 1 ) { + if ( function.qSettings.dontRequestMultipleCores ) + logger.warn("Sending multicore job %s to farm without requesting appropriate number of cores (%d)".format( + function.jobName, function.nCoresRequest.get)) + else { + request.numProcessors = function.nCoresRequest.get + request.maxNumProcessors = request.numProcessors + appendRequest("span", spanString, ",", "hosts=1") + } + } + val resReq = getResourceRequest if (resReq.length > 0) { request.resReq = resReq @@ -167,10 +185,12 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR requestString.setLength(0) selectString.setLength(0) usageString.setLength(0) + spanString.setLength(0) requestString.append(function.jobResourceRequests.mkString(" ")) extractSection(requestString, "select", selectString) extractSection(requestString, "rusage", usageString) + extractSection(requestString, "span", spanString) } private def extractSection(requestString: StringBuffer, section: String, sectionString: StringBuffer) { @@ -196,7 +216,7 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR sectionString.insert(sectionString.length() - 1, separator + request) } - private def getResourceRequest = "%s %s %s".format(selectString, usageString, requestString).trim() + private def getResourceRequest = "%s %s %s %s".format(selectString, usageString, spanString, requestString).trim() } object Lsf706JobRunner extends Logging { diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala index 9af4d9bcf1..deb83bf5a2 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/RodBind.scala @@ -28,21 +28,14 @@ object RodBind { def apply(trackName: String, trackType: String, file: File, tag: String) = new RodBind(trackName, trackType, file, tag) def apply(trackName: String, trackType: String, file: File) = new RodBind(trackName, trackType, file, null) - /** - * Formats the rod binding on the command line. - * Used for optional and repeat. - * @param cmdLineParam command line parameter, ex: -B - * @param prefix unused - * @param value RodBind to add. - * @param suffix unused - * @return The command line addition. 
- */ - def formatCommandLine(cmdLineParam: String)(prefix: String, value: Any, suffix: String) = { + def formatCommandLineParameter( cmdLineParam: String, value: Any ) = { value match { case rodBind: RodBind if (rodBind.tag != null) => - " %s:%s,%s,%s %s".format(cmdLineParam, rodBind.trackName, rodBind.trackType, rodBind.tag, rodBind.getPath) + "%s:%s,%s,%s".format(cmdLineParam, rodBind.trackName, rodBind.trackType, rodBind.tag) case rodBind: RodBind => - " %s:%s,%s %s".format(cmdLineParam, rodBind.trackName, rodBind.trackType, rodBind.getPath) + "%s:%s,%s".format(cmdLineParam, rodBind.trackName, rodBind.trackType) + case x => + "" } } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala index 2951999938..940985f92e 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/TaggedFile.scala @@ -2,6 +2,7 @@ package org.broadinstitute.sting.queue.extensions.gatk import java.io.File import org.broadinstitute.sting.utils.io.FileExtension +import org.broadinstitute.sting.queue.util.ShellUtils /** * Used to provide tagged -I input_file arguments to the GATK. @@ -19,21 +20,14 @@ object TaggedFile { def apply(path: String, tag: String) = new TaggedFile(path, tag) def apply(file: File, tag: String) = new TaggedFile(file, tag) - /** - * Formats the rod binding on the command line. - * Used for optional and repeat. - * @param cmdLineParam command line parameter, ex: -I - * @param prefix unused - * @param value TaggedFile to add. - * @param suffix unused - * @return The command line addition. - */ - def formatCommandLine(cmdLineParam: String)(prefix: String, value: Any, suffix: String) = { + def formatCommandLineParameter(cmdLineParam: String, value: Any) = { value match { case taggedFile: TaggedFile if (taggedFile.tag != null) => - " %s:%s %s".format(cmdLineParam, taggedFile.tag, taggedFile.getPath) + "%s:%s".format(cmdLineParam, taggedFile.tag) case file: File => - " %s %s".format(cmdLineParam, file.getPath) + cmdLineParam + case x => + "" } } } diff --git a/public/java/src/net/sf/samtools/GATKBinList.java b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala similarity index 57% rename from public/java/src/net/sf/samtools/GATKBinList.java rename to public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala index b53062aaff..d90db0de40 100644 --- a/public/java/src/net/sf/samtools/GATKBinList.java +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala @@ -22,30 +22,27 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package net.sf.samtools; +package org.broadinstitute.sting.queue.extensions.gatk -import java.util.BitSet; +import org.broadinstitute.sting.queue.function.InProcessFunction +import org.broadinstitute.sting.commandline.{Output, Argument, Input} +import java.io.File +import org.broadinstitute.sting.utils.interval.IntervalUtils -/** - * A temporary solution to work around Java access rights issues: - * override chunk and make it public. - * TODO: Eliminate once we determine the final fate of the BAM index reading code. - */ -public class GATKBinList extends BinList { - /** - * Create a new BinList over sequenceCount sequences, consisting of the given bins. 
- * @param referenceSequence Reference sequence to which these bins are relevant. - * @param bins The given bins to include. - */ - public GATKBinList(final int referenceSequence, final BitSet bins) { - super(referenceSequence,bins); - } +class WriteFlankingIntervalsFunction extends InProcessFunction { + @Input(doc="The reference sequence") + var reference : File = _ + + @Input(doc="The interval list to flank") + var inputIntervals : File = _ + + @Output(doc="The output intervals file to write to") + var outputIntervals: File = _ + + @Argument(doc="Number of base pair to flank the input intervals") + var flankSize : Int = _ - /** - * Retrieves the bins stored in this list. - * @return A bitset where a bin is present in the list if the bit is true. - */ - public BitSet getBins() { - return super.getBins(); - } + def run() { + IntervalUtils.writeFlankingIntervals(reference, inputIntervals, outputIntervals, flankSize) + } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala index 5456ed02c0..93735e4ac2 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala @@ -55,11 +55,11 @@ class AddOrReplaceReadGroups extends org.broadinstitute.sting.queue.function.Jav override def outputBam = output this.createIndex = Some(true) override def commandLine = super.commandLine + - " RGID=" + RGID + - " RGLB=" + RGLB + - " RGPL=" + RGPL + - " RGPU=" + RGPU + - " RGSM=" + RGSM + - conditionalParameter(RGCN != null && !RGCN.isEmpty, " RGCN=" + RGCN) + - conditionalParameter(RGDS != null && !RGDS.isEmpty, " RGDS=" + RGDS) + required("RGID=" + RGID) + + required("RGLB=" + RGLB) + + required("RGPL=" + RGPL) + + required("RGPU=" + RGPU) + + required("RGSM=" + RGSM) + + conditional(RGCN != null && !RGCN.isEmpty, "RGCN=" + RGCN) + + conditional(RGDS != null && !RGDS.isEmpty, "RGDS=" + RGDS) } \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala index d44d5e004a..d73c556af7 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala @@ -47,10 +47,8 @@ class MarkDuplicates extends org.broadinstitute.sting.queue.function.JavaCommand this.sortOrder = null this.createIndex = Some(true) override def commandLine = super.commandLine + - " M=" + metrics + - conditionalParameter(REMOVE_DUPLICATES, " REMOVE_DUPLICATES=true") + - conditionalParameter(MAX_FILE_HANDLES_FOR_READ_ENDS_MAP > 0, " MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=" + MAX_FILE_HANDLES_FOR_READ_ENDS_MAP.toString) + - conditionalParameter(SORTING_COLLECTION_SIZE_RATIO > 0, " SORTING_COLLECTION_SIZE_RATIO=" + SORTING_COLLECTION_SIZE_RATIO.toString) - - + required("M=" + metrics) + + conditional(REMOVE_DUPLICATES, "REMOVE_DUPLICATES=true") + + conditional(MAX_FILE_HANDLES_FOR_READ_ENDS_MAP > 0, "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=" + MAX_FILE_HANDLES_FOR_READ_ENDS_MAP.toString) + + conditional(SORTING_COLLECTION_SIZE_RATIO > 0, "SORTING_COLLECTION_SIZE_RATIO=" + SORTING_COLLECTION_SIZE_RATIO.toString) } \ No newline at end of file diff --git 
a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala index fd107890e1..036932cc68 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala @@ -44,9 +44,7 @@ class MergeSamFiles extends org.broadinstitute.sting.queue.function.JavaCommandL override def outputBam = output this.createIndex = Some(true) override def commandLine = super.commandLine + - conditionalParameter(MERGE_SEQUENCE_DICTIONARIES, " MERGE_SEQUENCE_DICTIONARIES=true") + - conditionalParameter(USE_THREADING, " USE_THREADING=true") + - conditionalParameter(COMMENT != null && !COMMENT.isEmpty, " COMMENT=" + COMMENT) - - + conditional(MERGE_SEQUENCE_DICTIONARIES, "MERGE_SEQUENCE_DICTIONARIES=true") + + conditional(USE_THREADING, "USE_THREADING=true") + + conditional(COMMENT != null && !COMMENT.isEmpty, "COMMENT=" + COMMENT) } \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamFunction.scala index 427c09f827..76856dc366 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamFunction.scala @@ -48,14 +48,13 @@ trait PicardBamFunction extends JavaCommandLineFunction { protected def outputBam: File abstract override def commandLine = super.commandLine + - Array( - repeat(" INPUT=", inputBams), - " TMP_DIR=" + jobTempDir, - optional(" OUTPUT=", outputBam), - optional(" COMPRESSION_LEVEL=", compressionLevel), - optional(" VALIDATION_STRINGENCY=", validationStringency), - optional(" SO=", sortOrder), - optional(" MAX_RECORDS_IN_RAM=", maxRecordsInRam), - optional(" ASSUME_SORTED=", assumeSorted), - optional(" CREATE_INDEX=", createIndex)).mkString + repeat("INPUT=", inputBams, spaceSeparated=false) + + required("TMP_DIR=" + jobTempDir) + + optional("OUTPUT=", outputBam, spaceSeparated=false) + + optional("COMPRESSION_LEVEL=", compressionLevel, spaceSeparated=false) + + optional("VALIDATION_STRINGENCY=", validationStringency, spaceSeparated=false) + + optional("SO=", sortOrder, spaceSeparated=false) + + optional("MAX_RECORDS_IN_RAM=", maxRecordsInRam, spaceSeparated=false) + + optional("ASSUME_SORTED=", assumeSorted, spaceSeparated=false) + + optional("CREATE_INDEX=", createIndex, spaceSeparated=false) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala index 72489dc87a..b1968bee5a 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala @@ -42,7 +42,7 @@ class ReorderSam extends org.broadinstitute.sting.queue.function.JavaCommandLine this.createIndex = Some(true) this.sortOrder = null override def commandLine = super.commandLine + - " REFERENCE=" + sortReference + - optional(" ALLOW_INCOMPLETE_DICT_CONCORDANCE=", ALLOW_INCOMPLETE_DICT_CONCORDANCE) - optional(" ALLOW_CONTIG_LENGTH_DISCORDANCE=", ALLOW_CONTIG_LENGTH_DISCORDANCE) + required("REFERENCE=" + sortReference) + + optional("ALLOW_INCOMPLETE_DICT_CONCORDANCE=", ALLOW_INCOMPLETE_DICT_CONCORDANCE, 
spaceSeparated=false) + optional("ALLOW_CONTIG_LENGTH_DISCORDANCE=", ALLOW_CONTIG_LENGTH_DISCORDANCE, spaceSeparated=false) } \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/RevertSam.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/RevertSam.scala index 746ce609e7..60d8bfaf81 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/RevertSam.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/RevertSam.scala @@ -52,10 +52,10 @@ class RevertSam extends org.broadinstitute.sting.queue.function.JavaCommandLineF override def outputBam = output this.createIndex = Some(true) override def commandLine = super.commandLine + - conditionalParameter(!restoreOriginalQualities, " RESTORE_ORIGINAL_QUALITIES=false") + - conditionalParameter(!removeDuplicateInformation, " REMOVE_DUPLICATE_INFORMATION=false") + - conditionalParameter(!removeAlignmentInformation, " REMOVE_ALIGNMENT_INFORMATION=false") + - conditionalParameter(!attributesToClear.isEmpty, repeat(" ATTRIBUTE_TO_CLEAR=", attributesToClear)) + - conditionalParameter(sampleAlias != null, " SAMPLE_ALIAS=" + sampleAlias) + - conditionalParameter(libraryName != null, " LIBRARY_NAME=" + libraryName) + conditional(!restoreOriginalQualities, "RESTORE_ORIGINAL_QUALITIES=false") + + conditional(!removeDuplicateInformation, "REMOVE_DUPLICATE_INFORMATION=false") + + conditional(!removeAlignmentInformation, "REMOVE_ALIGNMENT_INFORMATION=false") + + repeat("ATTRIBUTE_TO_CLEAR=", attributesToClear, spaceSeparated=false) + // repeat() returns "" for null/empty list + conditional(sampleAlias != null, "SAMPLE_ALIAS=" + sampleAlias) + + conditional(libraryName != null, "LIBRARY_NAME=" + libraryName) } \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SamToFastq.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SamToFastq.scala index 3a4217e605..3eb4e8e064 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SamToFastq.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SamToFastq.scala @@ -61,17 +61,17 @@ class SamToFastq extends org.broadinstitute.sting.queue.function.JavaCommandLine this.sortOrder = null override def commandLine = super.commandLine + - " FASTQ=" + fastq + - optional(" SECOND_END_FASTQ=", secondEndFastQ) + - conditionalParameter(outputPerReadGroup, optional(" OUTPUT_PER_RG=", outputPerReadGroup)) + - optional(" OUTPUT_DIR=", outputDir) + - conditionalParameter(!reReverse, optional(" RE_REVERSE=", reReverse)) + - conditionalParameter(includeNonPFReads, optional(" INCLUDE_NON_PF_READS=", includeNonPFReads)) + - optional(" CLIPPING_ATTRIBUTE=", clippingAttribute) + - optional(" CLIPPING_ACTION=", clippingAction) + - conditionalParameter (readOneTrim >= 0, optional(" READ1_TRIM=", readOneTrim)) + - conditionalParameter (readOneMaxBasesToWrite >= 0, optional(" READ1_MAX_BASES_TO_WRITE=", readOneMaxBasesToWrite)) + - conditionalParameter (readTwoTrim >= 0, optional(" READ2_TRIM=", readTwoTrim)) + - conditionalParameter (readTwoMaxBasesToWrite >=0, optional(" READ2_MAX_BASES_TO_WRITE=", readTwoMaxBasesToWrite)) + - conditionalParameter (includeNonPrimaryAlignments, optional(" INCLUDE_NON_PRIMARY_ALIGNMENTS=", includeNonPrimaryAlignments)) + required("FASTQ=" + fastq) + + optional("SECOND_END_FASTQ=", secondEndFastQ, spaceSeparated=false) + + conditional(outputPerReadGroup, "OUTPUT_PER_RG=" + 
outputPerReadGroup) + + optional("OUTPUT_DIR=", outputDir, spaceSeparated=false) + + conditional(!reReverse, "RE_REVERSE=" + reReverse) + + conditional(includeNonPFReads, "INCLUDE_NON_PF_READS=" + includeNonPFReads) + + optional("CLIPPING_ATTRIBUTE=", clippingAttribute, spaceSeparated=false) + + optional("CLIPPING_ACTION=", clippingAction, spaceSeparated=false) + + conditional(readOneTrim >= 0, "READ1_TRIM=" + readOneTrim) + + conditional(readOneMaxBasesToWrite >= 0, "READ1_MAX_BASES_TO_WRITE=" + readOneMaxBasesToWrite) + + conditional(readTwoTrim >= 0, "READ2_TRIM=" + readTwoTrim) + + conditional(readTwoMaxBasesToWrite >= 0, "READ2_MAX_BASES_TO_WRITE=" + readTwoMaxBasesToWrite) + + conditional(includeNonPrimaryAlignments, "INCLUDE_NON_PRIMARY_ALIGNMENTS=" + includeNonPrimaryAlignments) } \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala index 2c8fbc6d95..030e4b07d3 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala @@ -50,11 +50,11 @@ class ValidateSamFile extends org.broadinstitute.sting.queue.function.JavaComman override def inputBams = input override def outputBam = output override def commandLine = super.commandLine + - " MODE=" + MODE + - " MAX_OUTPUT=" + MAX_OUTPUT + - " MAX_OPEN_TEMP_FILES=" + MAX_OPEN_TEMP_FILES + - conditionalParameter(!VALIDATE_INDEX, " VALIDATE_INDEX=false") + - conditionalParameter(IGNORE_WARNINGS, " IGNORE_WARNINGS=true") + - conditionalParameter(IS_BISULFITE_SEQUENCED, " IS_BISULFITE_SEQUENCED=true") + - conditionalParameter(IGNORE != null && !IGNORE.isEmpty, repeat(" IGNORE=", IGNORE)) + required("MODE=" + MODE) + + required("MAX_OUTPUT=" + MAX_OUTPUT) + + required("MAX_OPEN_TEMP_FILES=" + MAX_OPEN_TEMP_FILES) + + conditional(!VALIDATE_INDEX, "VALIDATE_INDEX=false") + + conditional(IGNORE_WARNINGS, "IGNORE_WARNINGS=true") + + conditional(IS_BISULFITE_SEQUENCED, "IS_BISULFITE_SEQUENCED=true") + + repeat("IGNORE=", IGNORE, spaceSeparated=false) // repeat() returns "" for null/empty list } \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsIndexFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsIndexFunction.scala index 801a152ec3..83a03b904e 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsIndexFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsIndexFunction.scala @@ -48,7 +48,10 @@ class SamtoolsIndexFunction extends SamtoolsCommandLineFunction { bamFileIndex = new File(bamFile.getPath + ".bai") } - def commandLine = "%s index %s %s".format(samtools, bamFile, bamFileIndex) + def commandLine = required(samtools) + + required("index") + + required(bamFile) + + required(bamFileIndex) override def dotString = "Index: %s".format(bamFile.getName) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsMergeFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsMergeFunction.scala index 2b864def64..aff9a25c0d 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsMergeFunction.scala +++ 
b/public/scala/src/org/broadinstitute/sting/queue/extensions/samtools/SamtoolsMergeFunction.scala @@ -55,7 +55,9 @@ class SamtoolsMergeFunction extends SamtoolsCommandLineFunction { )) } - def commandLine = "%s merge%s %s%s".format( - samtools, optional(" -R ", region), - outputBam, repeat(" ", inputBams)) + def commandLine = required(samtools) + + required("merge") + + optional("-R", region) + + required(outputBam) + + repeat(inputBams) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/snpeff/SnpEff.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/snpeff/SnpEff.scala index 62f66ec066..259856c178 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/snpeff/SnpEff.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/snpeff/SnpEff.scala @@ -50,11 +50,14 @@ class SnpEff extends JavaCommandLineFunction { @Output(doc="snp eff output") var outVcf: File = _ - override def commandLine = Array( - super.commandLine, - " eff", - if (verbose) " -v" else "", - optional(" -c ", config), - " -i vcf -o vcf %s %s > %s".format(genomeVersion, inVcf, outVcf) - ).mkString + override def commandLine = super.commandLine + + required("eff") + + conditional(verbose, "-v") + + optional("-c", config) + + required("-i", "vcf") + + required("-o", "vcf") + + required(genomeVersion) + + required(inVcf) + + required(">", escape=false) + + required(outVcf) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala index ff77503ac2..167dcb593f 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala @@ -17,6 +17,9 @@ trait CommandLineFunction extends QFunction with Logging { /** Resident memory request */ var residentRequest: Option[Double] = None + /** the number of SMP cores this job wants */ + var nCoresRequest: Option[Int] = None + /** Job project to run the command */ var jobProject: String = _ @@ -45,6 +48,9 @@ trait CommandLineFunction extends QFunction with Logging { if (commandLineFunction.residentRequest.isEmpty) commandLineFunction.residentRequest = this.residentRequest + if (commandLineFunction.nCoresRequest.isEmpty) + commandLineFunction.nCoresRequest = this.nCoresRequest + if (commandLineFunction.jobProject == null) commandLineFunction.jobProject = this.jobProject @@ -100,6 +106,10 @@ trait CommandLineFunction extends QFunction with Logging { if (residentRequest.isEmpty) residentRequest = qSettings.residentRequest + // the default value is 1 core + if (nCoresRequest.isEmpty) + nCoresRequest = Some(1) + if (residentRequest.isEmpty) residentRequest = memoryLimit @@ -110,61 +120,178 @@ trait CommandLineFunction extends QFunction with Logging { } /** - * Repeats parameters with a prefix/suffix if they are set otherwise returns "". - * Skips null, Nil, None. Unwraps Some(x) to x. Everything else is called with x.toString. - * @param prefix Command line prefix per parameter. - * @param params Traversable parameters. - * @param suffix Optional suffix per parameter. - * @param separator Optional separator per parameter. - * @param format Format function if the value has a value - * @return The generated string + * Safely construct a full required command-line argument with consistent quoting, whitespace separation, etc. 
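+ * + * For example (argument values purely illustrative): required("-f", "my file.bam") would yield + * " '-f' 'my file.bam' ", with the embedded space protected by the shell quoting.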
+ * + * @param prefix Prefix to insert before the argument value (eg., "-f") + * @param param The argument value itself + * @param suffix Suffix to append after the argument value + * @param spaceSeparated If true, insert a space between the prefix, param, and suffix + * @param escape If true, quote the generated argument to avoid interpretation by the shell + * @param format Format String used to convert param to a String + * @return The combined and formatted argument, surrounded by whitespace */ - protected def repeat(prefix: String, params: Traversable[_], suffix: String = "", separator: String = "", - format: (String, Any, String) => String = formatValue("%s")) = - if (params == null) - "" - else - params.filter(param => hasValue(param)).map(param => format(prefix, param, suffix)).mkString(separator) + protected def required( prefix: String, param: Any, suffix: String = "", spaceSeparated: Boolean = true, + escape: Boolean = true, format: String = "%s" ): String = { + " %s ".format(formatArgument(prefix, param, suffix, spaceSeparated, escape, format)) + } + + /** + * Safely construct a one-token required command-line argument with quoting + * + * @param param The command-line argument value + * @return The argument value quoted and surrounded by whitespace + */ + protected def required( param: Any ): String = { + required("", param) + } + + /** + * Safely construct a one-token required command-line argument, and specify whether you want quoting + * + * @param param The command-line argument value + * @param escape If true, quote the generated argument to avoid interpretation by the shell + * @return The argument value, quoted if quoting was requested, and surrounded by whitespace + */ + protected def required( param: Any, escape: Boolean ): String = { + required("", param, escape=escape) + } + + /** + * Safely construct a full optional command-line argument with consistent quoting, whitespace separation, etc. + * If the argument has no value, returns an empty String. + * + * @param prefix Prefix to insert before the argument value (eg., "-f") + * @param param The argument value itself (if null/empty, the method returns empty String) + * @param suffix Suffix to append after the argument value + * @param spaceSeparated If true, insert a space between the prefix, param, and suffix + * @param escape If true, quote the generated argument to avoid interpretation by the shell + * @param format Format String used to convert param to a String + * @return The combined and formatted argument, surrounded by whitespace, or an empty String + * if the argument has no value + */ + protected def optional( prefix: String, param: Any, suffix: String = "", spaceSeparated: Boolean = true, + escape: Boolean = true, format: String = "%s" ): String = { + if ( hasValue(param) ) " %s ".format(formatArgument(prefix, param, suffix, spaceSeparated, escape, format)) else "" + } + + /** + * Safely construct a one-token optional command-line argument with quoting. + * If the argument has no value, returns an empty String. + * + * @param param The command-line argument value + * @return The argument value quoted and surrounded by whitespace, or an empty String + * if the argument has no value + */ + protected def optional( param: Any ): String = { + optional("", param) + } /** - * Returns parameter with a prefix/suffix if it is set otherwise returns "". - * Does not output null, Nil, None. Unwraps Some(x) to x. Everything else is called with x.toString. - * @param prefix Command line prefix per parameter. 
- * @param param Parameter to check for a value. - * @param suffix Optional suffix per parameter. - * @param format Format function if the value has a value - * @return The generated string + * Safely construct a one-token conditional command-line argument. If the provided condition + * is false, an empty String is returned. + * + * @param condition The condition to check + * @param param The command-line argument value + * @param escape If true, quote the generated argument to avoid interpretation by the shell + * @param format Format String used to convert param to a String + * @return The command-line argument value, quoted if quoting was requested and surrounded + * by whitespace, or an empty String if the condition is false. */ - protected def optional(prefix: String, param: Any, suffix: String = "", - format: (String, Any, String) => String = formatValue("%s")) = - if (hasValue(param)) format(prefix, param, suffix) else "" + protected def conditional( condition: Boolean, param: Any, escape: Boolean = true, format: String = "%s" ): String = { + if ( condition ) { + " %s ".format(formatArgument("", param, "", false, escape, format)) + } + else { + "" + } + } /** - * Returns "" if the value is null or an empty collection, otherwise return the value.toString. - * @param format Format string if the value has a value - * @param prefix Command line prefix per parameter. - * @param param Parameter to check for a value. - * @param suffix Optional suffix per parameter. - * @return "" if the value is null, or "" if the collection is empty, otherwise the value.toString. + * Safely construct a series of full command-line arguments with consistent quoting, whitespace separation, etc. + * + * Each argument value is preceded by a prefix/suffix if they are set. A function can be provided to vary + * each prefix for each argument value (eg., -f:tag1 file1 -f:tag2 file2) -- the default is to use + * the same prefix for all arguments. + * + * @param prefix Prefix to insert before each argument value (eg., "-f") + * @param params The collection of argument values + * @param suffix Suffix to append after each argument value + * @param separator Specifies how to separate the various arguments from each other + * (eg., what should go between '-f' 'file1' and '-f' 'file2'?) + * Default is one space character. + * @param spaceSeparated If true, insert a space between each individual prefix, param, and suffix + * @param escape If true, quote the generated argument to avoid interpretation by the shell + * @param format Format String used to convert each individual param within params to a String + * @param formatPrefix Function mapping (prefix, argumentValue) pairs to prefixes. Can be used to + * vary each prefix depending on the argument value (useful for tags, etc.). + * Default is to use the same prefix for all argument values. + * @return The series of command-line arguments, quoted and whitespace-delimited as requested, + * or an empty String if params was null/Nil/None.
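+ * + * Example (mirroring the unit tests later in this patch): with the default separator, space + * separation and escaping, repeat("-f", List("file1", "file2")) would yield " '-f' 'file1' '-f' 'file2' ".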
*/ - protected def formatValue(format: String)(prefix: String, param: Any, suffix: String): String = - if (CollectionUtils.isNullOrEmpty(param)) + protected def repeat(prefix: String, params: Traversable[_], suffix: String = "", separator: String = " ", + spaceSeparated: Boolean = true, escape: Boolean = true, format: String = "%s", + formatPrefix: (String, Any) => String = (prefix, value) => prefix): String = { + if (CollectionUtils.isNullOrEmpty(params)) "" else - prefix + (param match { - case Some(x) => format.format(x) - case x => format.format(x) - }) + suffix + " %s ".format(params.filter(param => hasValue(param)).map(param => formatArgument(formatPrefix(prefix, param), param, suffix, spaceSeparated, escape, format)).mkString(separator)) + } /** - * Returns the parameter if the condition is true. Useful for long string of parameters - * @param condition the condition to validate - * @param param the string to be returned in case condition is true - * @return param if condition is true, "" otherwise + * Safely construct a series of one-token command-line arguments with quoting and space separation. + * + * @param params The collection of argument values + * @return The argument values quoted and space-delimited, or an empty String if params was null/Nil/None */ - protected def conditionalParameter(condition: Boolean, param: String): String = - if (condition == true) - param - else - "" + protected def repeat( params: Traversable[_] ): String = { + repeat("", params) + } + + /** + * Given an (optional) prefix, an argument value, and an (optional) suffix, formats a command-line + * argument with the specified level of quoting and space-separation. + * + * Helper method for required(), optional(), conditional(), and repeat() -- do not use this + * method directly! + * + * @param prefix Prefix to insert before the argument value (eg., "-f"). Ignored if empty/null. + * @param param The argument value itself. If this is Some(x), it is unwrapped to x before processing. + * @param suffix Suffix to append after the argument value. Ignored if empty/null. + * @param spaceSeparated If true, insert a space between the prefix, param, and suffix + * @param escape If true, quote the generated argument to avoid interpretation by the shell + * @param paramFormat Format string used to convert param to a String + * @return The combined and formatted argument, NOT surrounded by any whitespace. + * Returns an empty String if param was null/empty. 
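+ * + * Examples (taken from the unit tests later in this patch): + * formatArgument("-Xmx", "4", "G", false, true, "%s") yields "'-Xmx4G'", and + * formatArgument("ARGNAME=", "ARGVALUE", "", false, true, "%s") yields "'ARGNAME=ARGVALUE'" -- + * the spaceSeparated=false form that the Picard extensions above rely on.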
+ */ + protected def formatArgument( prefix: String, param: Any, suffix: String, spaceSeparated: Boolean, escape: Boolean, + paramFormat: String ): String = { + if (CollectionUtils.isNullOrEmpty(param)) { + return "" + } + + // Trim leading and trailing whitespace off our three tokens, and unwrap Some(x) to x for the param + val trimmedValues : List[String] = List((if ( prefix != null ) prefix.trim else ""), + (param match { + case Some(x) => paramFormat.format(x).trim + case x => paramFormat.format(x).trim + }), + (if ( suffix != null ) suffix.trim else "")) + var joinedArgument : String = null + + // If the user requested space-separation, join the tokens with a space, and escape individual + // NON-EMPTY tokens if escaping was requested (eg., ("-f", "foo", "") -> "'-f' 'foo'") + if ( spaceSeparated ) { + joinedArgument = trimmedValues.map(x => if ( x.length > 0 && escape ) ShellUtils.escapeShellArgument(x) else x).mkString(" ").trim() + } + + // Otherwise join the tokens without any intervening whitespace, and if quoting was requested + // quote the entire concatenated value (eg., ("-Xmx", "4", "G") -> "'-Xmx4G'") + else { + joinedArgument = if ( escape ) ShellUtils.escapeShellArgument(trimmedValues.mkString("")) else trimmedValues.mkString("") + } + + // If the user requested escaping and we ended up with an empty String after joining, quote the empty + // String to preserve the command line token. Otherwise just return the joined argument + if ( joinedArgument.length == 0 && escape ) ShellUtils.escapeShellArgument(joinedArgument) else joinedArgument + } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala index 4a50a72ac1..5b19cf9b66 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala @@ -49,18 +49,6 @@ trait JavaCommandLineFunction extends CommandLineFunction { */ var javaMemoryLimit: Option[Double] = None - /** - * Returns the java executable to run. - */ - def javaExecutable: String = { - if (jarFile != null) - "-jar " + jarFile - else if (javaMainClass != null) - "-cp \"%s\" %s".format(javaClasspath.mkString(File.pathSeparator), javaMainClass) - else - null - } - override def freezeFieldValues() { super.freezeFieldValues() @@ -71,11 +59,25 @@ trait JavaCommandLineFunction extends CommandLineFunction { javaClasspath = JavaCommandLineFunction.currentClasspath } - def javaOpts = "%s -Djava.io.tmpdir=%s" - .format(optional(" -Xmx", javaMemoryLimit.map(gb => (gb * 1024).ceil.toInt), "m"), jobTempDir) + /** + * Returns the java executable to run. 
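+ * For example (jar path purely illustrative): with jarFile set, this returns + * required("-jar", jarFile), producing a fragment like " '-jar' 'Queue.jar' ".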
+ */ + def javaExecutable: String = { + if (jarFile != null) + required("-jar", jarFile) + else if (javaMainClass != null) + required("-cp", javaClasspath.mkString(File.pathSeparator)) + + required(javaMainClass) + else + null + } + + def javaOpts = optional("-Xmx", javaMemoryLimit.map(gb => (gb * 1024).ceil.toInt), "m", spaceSeparated=false) + + required("-Djava.io.tmpdir=", jobTempDir, spaceSeparated=false) - def commandLine = "java%s %s" - .format(javaOpts, javaExecutable) + def commandLine = required("java") + + javaOpts + + javaExecutable } object JavaCommandLineFunction { diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala deleted file mode 100755 index 77eb3ccbc4..0000000000 --- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala +++ /dev/null @@ -1,135 +0,0 @@ -package org.broadinstitute.sting.queue.library.ipf.intervals - -import org.broadinstitute.sting.queue.function.InProcessFunction -import org.broadinstitute.sting.commandline._ -import java.io.{PrintStream, File} -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.text.XReadLines -import net.sf.picard.reference.FastaSequenceFile -import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser} -import collection.immutable.TreeSet - -// todo -- this is unsafe. Need to use a reference dictionary to ensure no off-contig targets are created -class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, ipType: String, opType: String) extends InProcessFunction { - @Input(doc="The interval list to expand") val inList : File = in - @Input(doc="The reference sequence") val refDict : File = ref - @Argument(doc="Number of basepair to start the expanded interval") val startInt : Int = start - @Argument(doc="Number of baispair to stop the expanded interval") val sizeInt : Int = size - @Output(doc="The output intervals file to write to") val outList : File = out - @Argument(doc="The output format for the intervals") val outTypeStr = opType - @Argument(doc="The input format for the intervals") val inTypeStr = ipType - - var output : PrintStream = _ - var parser : GenomeLocParser = _ - var xrl : XReadLines = _ - val outType = IntervalFormatType.convert(outTypeStr) - val inType = IntervalFormatType.convert(inTypeStr) - - var offsetIn : Int = 0 - var offsetOut : Int = 0 - - var first : Boolean = true - var lastTwo : (GenomeLoc,GenomeLoc) = _ - - var intervalCache : TreeSet[GenomeLoc] = _ - val LINES_TO_CACHE : Int = 1000 - - def run = { - output = new PrintStream(outList) - intervalCache = new TreeSet[GenomeLoc]()(new Ordering[GenomeLoc]{ - def compare(o1: GenomeLoc, o2: GenomeLoc) : Int = { o1.compareTo(o2) } - }) - parser = new GenomeLocParser(new FastaSequenceFile(ref,true)) - xrl = new XReadLines(inList) - offsetIn = if (isBed(inType)) 1 else 0 - offsetOut = if( isBed(outType)) 1 else 0 - var line : String = xrl.next - while ( line.startsWith("@") ) { - line = xrl.next - } - var prevLoc: GenomeLoc = null - var curLoc: GenomeLoc = null - var nextLoc : GenomeLoc = parseGenomeInterval(line) - var linesProcessed : Int = 1 - while ( prevLoc != null || curLoc != null || nextLoc != null ) { - prevLoc = curLoc - curLoc = nextLoc - nextLoc = if ( xrl.hasNext ) parseGenomeInterval(xrl.next) else null - if ( curLoc != null ) { - val left: GenomeLoc = refine(expandLeft(curLoc),prevLoc) - val right: GenomeLoc = 
refine(expandRight(curLoc),nextLoc) - if ( left != null ) { - intervalCache += left - } - if ( right != null ) { - intervalCache += right - } - } - linesProcessed += 1 - if ( linesProcessed % LINES_TO_CACHE == 0 ) { - val toPrint = intervalCache.filter( u => (u.isBefore(prevLoc) && u.distance(prevLoc) > startInt+sizeInt)) - intervalCache = intervalCache -- toPrint - toPrint.foreach(u => output.print("%s%n".format(repr(u)))) - } - //System.out.printf("%s".format(if ( curLoc == null ) "null" else repr(curLoc))) - } - - intervalCache.foreach(u => output.print("%s%n".format(repr(u)))) - - output.close() - } - - def expandLeft(g: GenomeLoc) : GenomeLoc = { - parser.createGenomeLoc(g.getContig,g.getStart-startInt-sizeInt,g.getStart-startInt) - } - - def expandRight(g: GenomeLoc) : GenomeLoc = { - parser.createGenomeLoc(g.getContig,g.getStop+startInt,g.getStop+startInt+sizeInt) - } - - def refine(newG: GenomeLoc, borderG: GenomeLoc) : GenomeLoc = { - if ( borderG == null || ! newG.overlapsP(borderG) ) { - return newG - } else { - if ( newG.getStart < borderG.getStart ) { - if ( borderG.getStart - startInt > newG.getStart ) { - return parser.createGenomeLoc(newG.getContig,newG.getStart,borderG.getStart-startInt) - } - } else { - if ( borderG.getStop + startInt < newG.getStop ){ - return parser.createGenomeLoc(newG.getContig,borderG.getStop+startInt,newG.getStop) - } - } - } - - null - } - - def repr(loc : GenomeLoc) : String = { - if ( loc == null ) return "null" - if ( outType == IntervalFormatType.INTERVALS ) { - return "%s:%d-%d".format(loc.getContig,loc.getStart,loc.getStop) - } else { - return "%s\t%d\t%d".format(loc.getContig,loc.getStart-offsetOut,loc.getStop+offsetOut) - } - } - - def isBed(t: IntervalFormatType.IntervalFormatType) : Boolean = { - t == IntervalFormatType.BED - } - - def parseGenomeInterval( s : String ) : GenomeLoc = { - val sp = s.split("\\s+") - // todo -- maybe specify whether the bed format [0,6) --> (1,2,3,4,5) is what's wanted - if ( s.contains(":") ) parser.parseGenomeLoc(s) else parser.createGenomeLoc(sp(0),sp(1).toInt+offsetIn,sp(2).toInt-offsetIn) - } - - object IntervalFormatType extends Enumeration("INTERVALS","BED","TDF") { - type IntervalFormatType = Value - val INTERVALS,BED,TDF = Value - - def convert(s : String) : IntervalFormatType = { - if ( s.equals("INTERVALS") ) INTERVALS else { if (s.equals("BED") ) BED else TDF} - } - } -} \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala deleted file mode 100755 index e929477a1e..0000000000 --- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala +++ /dev/null @@ -1,70 +0,0 @@ -package org.broadinstitute.sting.queue.library.ipf.intervals - -import org.broadinstitute.sting.queue.function.InProcessFunction -import collection.JavaConversions._ -import org.broadinstitute.sting.commandline._ -import java.io.{PrintStream, File} -import net.sf.samtools.{SAMSequenceRecord, SAMFileHeader, SAMSequenceDictionary} -import org.broadinstitute.sting.utils.text.XReadLines -import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser} - -class IntersectIntervals(iVals: List[File], outFile: File, bed: Boolean) extends InProcessFunction { - @Input(doc="List of interval files to find the intersection of") val intervals : List[File] = iVals - @Output(doc="Output interval file to which to write") val output : 
File = outFile - @Argument(doc="Assume the input interval lists are sorted in the proper order") var assumeSorted = false - @Argument(doc="Is the tdf in bed file (0-based clopen: 0 5 for {1,2,3,4}?") var isBed = bed - - - var outStream : PrintStream = _ - var contigs : List[String] = Nil - var dict : SAMSequenceDictionary = _ - var parser : GenomeLocParser = _ - - def run = { - outStream = new PrintStream(output) - dict = new SAMSequenceDictionary - // note: memory hog - val sources : List[(List[(String,Int,Int)],Int)] = intervals.map(g => asScalaIterator(new XReadLines(g)).map(u => parse(u)).toList).zipWithIndex - sources.map(u => u._1).flatten.map(u => u._1).distinct.foreach(u => dict.addSequence(new SAMSequenceRecord(u,Integer.MAX_VALUE))) - parser = new GenomeLocParser(dict) - sources.map( (u: (List[(String,Int,Int)],Int)) => u._1.map(g => (newGenomeLoc(g),u._2))).flatten.sortWith( (a,b) => (a._1 compareTo b._1) < 0 ).foldLeft[List[List[(GenomeLoc,Int)]]](Nil)( (a,b) => overlapFold(a,b)).map(u => mapIntersect(u)).filter(h => h != null && h.size > 0).foreach(h => writeOut(h)) - outStream.close() - } - - def writeOut(g : GenomeLoc) : Unit = { - outStream.print("%s%n".format(g.toString)) - } - - def parse(s : String) : (String,Int,Int) = { - if ( s.contains(":") ) { - val split1 = s.split(":") - val split2 = split1(1).split("-") - return (split1(0),split2(0).toInt,split2(1).toInt) - } else { - val split = s.split("\\s+") - return (split(0),split(1).toInt + (if(isBed) 1 else 0) ,split(2).toInt - (if(isBed) 1 else 0) ) - } - } - - def newGenomeLoc(coords : (String,Int,Int) ) : GenomeLoc = { - parser.createGenomeLoc(coords._1,coords._2,coords._3) - } - - def overlapFold( a: List[List[(GenomeLoc,Int)]], b: (GenomeLoc,Int) ) : List[List[(GenomeLoc,Int)]] = { - if ( a.last.forall(u => u._1.overlapsP(b._1)) ) { - a.init :+ (a.last :+ b) - } else { - a :+ ( a.last.dropWhile(u => ! u._1.overlapsP(b._1)) :+ b) - } - } - - def mapIntersect( u: List[(GenomeLoc,Int)]) : GenomeLoc = { - if ( u.map(h => h._2).distinct.sum != range(1,intervals.size).sum ) { // if all sources not accounted for - null - } - u.map(h => h._1).reduceLeft[GenomeLoc]( (a,b) => a.intersect(b) ) - } - - def range(a: Int, b: Int) : Range = new Range(a,b+1,1) - -} \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala index bb14bb6e6b..73d1c028a2 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala @@ -81,6 +81,10 @@ trait QJobReport extends Logging { this.reportFeatures = features.mapValues(_.toString) } + def addJobReportBinding(key: String, value: Any) { + this.reportFeatures += (key -> value.toString) + } + // copy the QJobReport information -- todo : what's the best way to do this? override def copySettingsTo(function: QFunction) { self.copySettingsTo(function) diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ShellUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/ShellUtils.scala new file mode 100644 index 0000000000..3cb1a705a8 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/util/ShellUtils.scala @@ -0,0 +1,36 @@ +package org.broadinstitute.sting.queue.util + +import java.lang.IllegalArgumentException + +object ShellUtils { + + /** + * Escapes the String it's passed so that it will be interpreted literally when + * parsed by sh/bash. 
Can correctly escape all characters except \0, \r, and \n + * + * Replaces all instances of ' with '\'', and then surrounds the resulting String + * with single quotes. + * + * Examples: + * ab -> 'ab' + * a'b -> 'a'\''b' + * '' -> ''\'''\''' + * + * Since \' is not supported inside single quotes in the shell (ie., '\'' does not work), + * whenever we encounter a single quote we need to terminate the existing single-quoted + * string, place the \' outside of single quotes, and then start a new single-quoted + * string. As long as we don't insert spaces between the separate strings, the shell will + * concatenate them together into a single argument value for us. + * + * @param str the String to escape + * @return the same String quoted so that it will be interpreted literally when + * parsed by sh/bash + */ + def escapeShellArgument ( str : String ) : String = { + if ( str == null ) { + throw new IllegalArgumentException("escapeShellArgument() was passed a null String") + } + + "'" + str.replaceAll("'", "'\\\\''") + "'" + } +} \ No newline at end of file diff --git a/public/scala/test/org/broadinstitute/sting/queue/function/CommandLineFunctionUnitTest.scala b/public/scala/test/org/broadinstitute/sting/queue/function/CommandLineFunctionUnitTest.scala new file mode 100644 index 0000000000..eb50c3a2ef --- /dev/null +++ b/public/scala/test/org/broadinstitute/sting/queue/function/CommandLineFunctionUnitTest.scala @@ -0,0 +1,171 @@ +package org.broadinstitute.sting.queue.function + +import org.testng.Assert +import org.testng.annotations.{DataProvider, Test} + +// Since "protected" in Scala is subclass-only and doesn't allow package-level access, we need to +// extend a class if we want to test it +class CommandLineFunctionUnitTest extends CommandLineFunction { + def commandLine = "" + + @DataProvider( name="formatArgumentTestData" ) + def formatArgumentDataProvider = { + Array(Array("", "argvalue", "", true, true, "'argvalue'"), + Array("", "argvalue", "", true, false, "argvalue"), + Array("", "argvalue", "", false, true, "'argvalue'"), + Array("", "argvalue", "", false, false, "argvalue"), + Array("-arg", "argvalue", "", true, true, "'-arg' 'argvalue'"), + Array("-arg", "argvalue", "", true, false, "-arg argvalue"), + Array("ARGNAME=", "ARGVALUE", "", false, true, "'ARGNAME=ARGVALUE'"), + Array("ARGNAME=", "ARGVALUE", "", false, false, "ARGNAME=ARGVALUE"), + Array("-Xmx", "4", "G", true, true, "'-Xmx' '4' 'G'"), + Array("-Xmx", "4", "G", true, false, "-Xmx 4 G"), + Array("-Xmx", "4", "G", false, true, "'-Xmx4G'"), + Array("-Xmx", "4", "G", false, false, "-Xmx4G"), + Array("", "", "", true, true, "''"), + Array("", "", "", true, false, ""), + Array("", "", "", false, true, "''"), + Array("", "", "", false, false, ""), + Array("", null, "", true, true, ""), + Array("", Nil, "", true, true, ""), + Array("", None, "", true, true, ""), + Array(null, null, null, true, true, ""), + Array("", Some("argvalue"), "", true, true, "'argvalue'") + ) + } + + @Test( dataProvider="formatArgumentTestData" ) + def testFormatArgument( prefix: String, param: Any, suffix: String, spaceSeparated: Boolean, escape: Boolean, expectedReturnValue: String ) { + Assert.assertEquals(formatArgument(prefix, param, suffix, spaceSeparated, escape, "%s"), + expectedReturnValue) + } + + @Test + def testFormatArgumentCustomFormatString() { + Assert.assertEquals(formatArgument("", "argvalue", "", true, true, "%.3s"), "'arg'") + } + + @DataProvider( name = "requiredTestData" ) + def requiredDataProvider = { + Array(Array("", 
"argvalue", "", true, true, " 'argvalue' "), + Array("", "argvalue", "", true, false, " argvalue "), + Array("", "argvalue", "", false, true, " 'argvalue' "), + Array("", "argvalue", "", false, false, " argvalue "), + Array("-arg", "argvalue", "", true, true, " '-arg' 'argvalue' "), + Array("-arg", "argvalue", "", true, false, " -arg argvalue "), + Array("ARGNAME=", "ARGVALUE", "", false, true, " 'ARGNAME=ARGVALUE' "), + Array("ARGNAME=", "ARGVALUE", "", false, false, " ARGNAME=ARGVALUE "), + Array("-Xmx", "4", "G", true, true, " '-Xmx' '4' 'G' "), + Array("-Xmx", "4", "G", true, false, " -Xmx 4 G "), + Array("-Xmx", "4", "G", false, true, " '-Xmx4G' "), + Array("-Xmx", "4", "G", false, false, " -Xmx4G "), + Array("", "", "", true, true, " '' "), + Array("", "", "", true, false, " "), + Array("", "", "", false, true, " '' "), + Array("", "", "", false, false, " "), + Array("", null, "", true, true, " "), + Array("", Nil, "", true, true, " "), + Array("", None, "", true, true, " ") + ) + } + + @Test( dataProvider="requiredTestData" ) + def testRequired( prefix: String, param: Any, suffix: String, spaceSeparated: Boolean, escape: Boolean, expectedReturnValue: String ) { + Assert.assertEquals(required(prefix, param, suffix, spaceSeparated, escape), + expectedReturnValue) + } + + @DataProvider( name = "optionalTestData" ) + def optionalDataProvider = { + Array(Array("-arg", "argvalue", "", true, true, " '-arg' 'argvalue' "), + Array("-arg", null, "", true, true, ""), + Array("-arg", Nil, "", true, true, ""), + Array("-arg", None, "", true, true, ""), + Array("-arg", "", "", true, true, " '-arg' ") + ) + } + + @Test( dataProvider="optionalTestData" ) + def testOptional( prefix: String, param: Any, suffix: String, spaceSeparated: Boolean, escape: Boolean, expectedReturnValue: String ) { + Assert.assertEquals(optional(prefix, param, suffix, spaceSeparated, escape), + expectedReturnValue) + } + + @DataProvider( name = "conditionalTestData" ) + def conditionalDataProvider = { + Array(Array(true, "-FLAG", true, " '-FLAG' "), + Array(true, "-FLAG", false, " -FLAG "), + Array(false, "-FLAG", true, ""), + Array(false, "-FLAG", false, ""), + Array(true, null, true, " "), + Array(true, Nil, true, " "), + Array(true, None, true, " "), + Array(false, null, true, ""), + Array(false, Nil, true, ""), + Array(false, None, true, "") + ) + } + + @Test( dataProvider="conditionalTestData" ) + def testConditional( condition: Boolean, param: Any, escape: Boolean, expectedReturnValue: String ) { + Assert.assertEquals(conditional(condition, param, escape), + expectedReturnValue) + } + + @DataProvider( name = "repeatTestData" ) + def repeatDataProvider = { + Array(Array("", List("a", "bc", "d"), "", " ", true, true, " 'a' 'bc' 'd' "), + Array("", List("a", "bc", "d"), "", " ", true, false, " a bc d "), + Array("", List("a", "bc", "d"), "", "", true, true, " 'a''bc''d' "), + Array("", List("a", "bc", "d"), "", "", true, false, " abcd "), + Array("-f", List("file1", "file2", "file3"), "", " ", true, true, " '-f' 'file1' '-f' 'file2' '-f' 'file3' "), + Array("-f", List("file1", "file2", "file3"), "", " ", true, false, " -f file1 -f file2 -f file3 "), + Array("-f", List("file1", "file2", "file3"), "", " ", false, true, " '-ffile1' '-ffile2' '-ffile3' "), + Array("-f", List("file1", "file2", "file3"), "", " ", false, false, " -ffile1 -ffile2 -ffile3 "), + Array("-f", List("file1", "file2", "file3"), "", "", false, true, " '-ffile1''-ffile2''-ffile3' "), + Array("-f", List("file1", "file2", "file3"), "", "", false, false, " 
-ffile1-ffile2-ffile3 "), + Array("-f", List("file1", "file2", "file3"), "suffix", " ", true, true, " '-f' 'file1' 'suffix' '-f' 'file2' 'suffix' '-f' 'file3' 'suffix' "), + Array("-f", List("file1", "file2", "file3"), "suffix", " ", true, false, " -f file1 suffix -f file2 suffix -f file3 suffix "), + Array("-f", List("file1", "file2", "file3"), "suffix", " ", false, true, " '-ffile1suffix' '-ffile2suffix' '-ffile3suffix' "), + Array("-f", List("file1", "file2", "file3"), "suffix", " ", false, false, " -ffile1suffix -ffile2suffix -ffile3suffix "), + Array("-f", null, "", " ", true, true, ""), + Array("-f", Nil, "", " ", true, true, "") + ) + } + + @Test( dataProvider="repeatTestData" ) + def testRepeat( prefix: String, params: Traversable[_], suffix: String, separator: String, + spaceSeparated: Boolean, escape: Boolean, expectedReturnValue: String ) { + Assert.assertEquals(repeat(prefix, params, suffix, separator, spaceSeparated, escape), + expectedReturnValue) + } + + // Need to test None separately due to implicit conversion issues when using None in a TestNG data provider + @Test + def testRepeatNone() { + testRepeat("", None, "", " ", true, true, "") + } + + @DataProvider( name = "repeatWithPrefixFormattingTestData" ) + def repeatWithPrefixFormattingDataProvider = { + Array(Array("-f", List("file1", "file2", "file3"), "", " ", true, true, (prefix: String, value: Any) => "%s:tag%s".format(prefix, value), + " '-f:tagfile1' 'file1' '-f:tagfile2' 'file2' '-f:tagfile3' 'file3' "), + Array("-f", List("file1", "file2", "file3"), "", " ", true, false, (prefix: String, value: Any) => "%s:tag%s".format(prefix, value), + " -f:tagfile1 file1 -f:tagfile2 file2 -f:tagfile3 file3 "), + Array("", List("file1", "file2", "file3"), "", " ", true, true, (prefix: String, value: Any) => "-%s".format(value), + " '-file1' 'file1' '-file2' 'file2' '-file3' 'file3' "), + Array("-f", null, "", " ", true, true, (prefix: String, value: Any) => "%s:tag%s".format(prefix, value), + ""), + Array("-f", Nil, "", " ", true, true, (prefix: String, value: Any) => "%s:tag%s".format(prefix, value), + "") + ) + } + + @Test( dataProvider = "repeatWithPrefixFormattingTestData" ) + def testRepeatWithPrefixFormatting( prefix: String, params: Traversable[_], suffix: String, separator: String, + spaceSeparated: Boolean, escape: Boolean, formatPrefix: (String, Any) => String, + expectedReturnValue: String ) { + Assert.assertEquals(repeat(prefix, params, suffix, separator, spaceSeparated, escape, "%s", formatPrefix), + expectedReturnValue) + } +} \ No newline at end of file diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala new file mode 100644 index 0000000000..7e1d09b700 --- /dev/null +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala @@ -0,0 +1,69 @@ +package org.broadinstitute.sting.queue.pipeline + +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice 
and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +import org.testng.annotations.Test +import org.broadinstitute.sting.BaseTest + +class DataProcessingPipelineTest { + @Test + def testSimpleBAM { + val projectName = "test1" + val testOut = projectName + ".exampleBAM.bam.clean.dedup.recal.bam" + val spec = new PipelineTestSpec + spec.name = "DataProcessingPipeline" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala", + " -R " + BaseTest.testDir + "exampleFASTA.fasta", + " -i " + BaseTest.testDir + "exampleBAM.bam", + " -D " + BaseTest.testDir + "exampleDBSNP.vcf", + " -nv ", + " -test ", + " -p " + projectName).mkString + spec.fileMD5s += testOut -> "1f85e76de760167a77ed1d9ab4da2936" + PipelineTest.executeTest(spec) + } + + @Test + def testBWAPEBAM { + val projectName = "test2" + val testOut = projectName + ".exampleBAM.bam.clean.dedup.recal.bam" + val spec = new PipelineTestSpec + spec.name = "DataProcessingPipeline" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala", + " -R " + BaseTest.testDir + "exampleFASTA.fasta", + " -i " + BaseTest.testDir + "exampleBAM.bam", + " -D " + BaseTest.testDir + "exampleDBSNP.vcf", + " -nv ", + " -test ", + " -bwa /home/unix/carneiro/bin/bwa", + " -bwape ", + " -p " + projectName).mkString + spec.fileMD5s += testOut -> "57416a0abdf9524bc92834d466529708" + PipelineTest.executeTest(spec) + } + +} diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala new file mode 100644 index 0000000000..1278b6e163 --- /dev/null +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala @@ -0,0 +1,46 @@ +package org.broadinstitute.sting.queue.pipeline + +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +import org.testng.annotations.Test +import org.broadinstitute.sting.BaseTest + +class PacbioProcessingPipelineTest { + @Test + def testPacbioProcessingPipeline { + val testOut = "exampleBAM.recal.bam" + val spec = new PipelineTestSpec + spec.name = "pacbioProcessingPipeline" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala", + " -R " + BaseTest.testDir + "exampleFASTA.fasta", + " -i " + BaseTest.testDir + "exampleBAM.bam", + " -blasr ", + " -test ", + " -D " + BaseTest.testDir + "exampleDBSNP.vcf").mkString + spec.fileMD5s += testOut -> "cf147e7f56806598371f8d5d6794b852" + PipelineTest.executeTest(spec) + } +} diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala index 5901cab467..e737e52ea0 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala @@ -30,7 +30,7 @@ import org.broadinstitute.sting.BaseTest class ExampleCountLociPipelineTest { @Test - def testCountLoci { + def testCountLoci() { val testOut = "count.out" val spec = new PipelineTestSpec spec.name = "countloci" @@ -39,7 +39,7 @@ class ExampleCountLociPipelineTest { " -R " + BaseTest.testDir + "exampleFASTA.fasta", " -I " + BaseTest.testDir + "exampleBAM.bam", " -o " + testOut).mkString - spec.fileMD5s += testOut -> "67823e4722495eb10a5e4c42c267b3a6" + spec.fileMD5s += testOut -> "ade93df31a6150321c1067e749cae9be" PipelineTest.executeTest(spec) } -} +} \ No newline at end of file diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountReadsPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountReadsPipelineTest.scala new file mode 100644 index 0000000000..8b286f0901 --- /dev/null +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountReadsPipelineTest.scala @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.pipeline.examples + +import org.testng.annotations.Test +import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} +import org.broadinstitute.sting.BaseTest + +class ExampleCountReadsPipelineTest { + @Test + def testCountReads() { + val spec = new PipelineTestSpec + spec.name = "countreads" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountReads.scala", + " -R " + BaseTest.testDir + "exampleFASTA.fasta", + " -I " + BaseTest.testDir + "exampleBAM.bam").mkString + PipelineTest.executeTest(spec) + } +} diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala new file mode 100644 index 0000000000..d50673a1a9 --- /dev/null +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.queue.pipeline.examples + +import org.testng.annotations.Test +import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} +import org.broadinstitute.sting.BaseTest + +class ExampleUnifiedGenotyperPipelineTest { + @Test + def testUnifiedGenotyper() { + val spec = new PipelineTestSpec + spec.name = "unifiedgenotyper" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala", + " -R " + BaseTest.testDir + "exampleFASTA.fasta", + " -I " + BaseTest.testDir + "exampleBAM.bam", + " -filter QD", + " -filterExpression 'QD < 2.0'").mkString + PipelineTest.executeTest(spec) + } +} diff --git a/public/scala/test/org/broadinstitute/sting/queue/util/ShellUtilsUnitTest.scala b/public/scala/test/org/broadinstitute/sting/queue/util/ShellUtilsUnitTest.scala new file mode 100644 index 0000000000..76585d207f --- /dev/null +++ b/public/scala/test/org/broadinstitute/sting/queue/util/ShellUtilsUnitTest.scala @@ -0,0 +1,57 @@ +package org.broadinstitute.sting.queue.util + +import org.testng.annotations.Test +import org.testng.Assert +import java.io.{InputStreamReader, BufferedReader} + +class ShellUtilsUnitTest { + + @Test + def testEscapeShellArgumentOneCharSequences() { + // Test all ASCII characters except \0, \n, and \r, which we do not support escaping + for ( asciiCode <- 1 to 127 if asciiCode != 10 && asciiCode != 13 ) { + val originalString: String = "%c".format(asciiCode.toChar) + val quotedString: String = ShellUtils.escapeShellArgument(originalString) + + val child : Process = new ProcessBuilder("/bin/sh", "-c", "printf \"%s\" " + quotedString).start() + val childReader : BufferedReader = new BufferedReader(new InputStreamReader(child.getInputStream)) + val childOutputBuffer : StringBuilder = new StringBuilder + + val childReaderThread : Thread = new Thread(new Runnable() { + def run() { + var line : String = childReader.readLine() + + while ( line != null ) { + childOutputBuffer.append(line) + line = childReader.readLine() + } + } + }) + childReaderThread.start() + + val childReturnValue = child.waitFor() + childReaderThread.join() + + childReader.close() + val childOutput = childOutputBuffer.toString() + + if ( childReturnValue != 0 ) { + Assert.fail("With character ASCII %d, sh child process returned: %d".format(asciiCode, childReturnValue)) + } + else if ( ! 
originalString.equals(childOutput) ) { + Assert.fail("With character ASCII %d, sh child process output \"%s\" instead of the expected \"%s\"".format( + asciiCode, childOutput, originalString)) + } + } + } + + @Test(expectedExceptions = Array(classOf[IllegalArgumentException])) + def testEscapeShellArgumentNullString() { + ShellUtils.escapeShellArgument(null) + } + + @Test + def testEscapeShellArgumentEmptyString() { + Assert.assertEquals(ShellUtils.escapeShellArgument(""), "''") + } +} \ No newline at end of file diff --git a/public/testdata/exampleBAM.bam b/public/testdata/exampleBAM.bam index a6ebb6fd1e..319dd1a72d 100644 Binary files a/public/testdata/exampleBAM.bam and b/public/testdata/exampleBAM.bam differ diff --git a/public/testdata/exampleBAM.bam.bai b/public/testdata/exampleBAM.bam.bai index cc6e1a1456..052ac614bd 100644 Binary files a/public/testdata/exampleBAM.bam.bai and b/public/testdata/exampleBAM.bam.bai differ diff --git a/public/testdata/exampleDBSNP.vcf b/public/testdata/exampleDBSNP.vcf new file mode 100644 index 0000000000..9e7e96f512 --- /dev/null +++ b/public/testdata/exampleDBSNP.vcf @@ -0,0 +1,282 @@ +##fileformat=VCFv4.1 +##FILTER= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO=5% minor allele frequency in 1+ populations"> +##INFO=5% minor allele frequency in each and all populations"> +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO=SubSNP->Batch.link_out"> +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##LeftAlignVariants="analysis_type=LeftAlignVariants input_file=[] read_buffer_size=null phone_home=STANDARD read_filter=[] intervals=null excludeIntervals=null interval_set_rule=UNION interval_merging=ALL reference_sequence=/humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37.fasta rodBind=[] nonDeterministicRandomSeed=false downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false defaultBaseQualities=-1 validation_strictness=SILENT unsafe=null num_threads=1 read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false disable_experimental_low_memory_sharding=false logging_level=INFO log_to_file=null help=false variant=(RodBinding name=variant source=00-All.vcf) out=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub NO_HEADER=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub filter_mismatching_base_and_quals=false" +##contig= +##phasing=partial +##reference=GRCh37.3 +##reference=file:///humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37.fasta +##source=dbSNP +##variationPropertyDocumentationUrl=ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 10144 rs144773400 TA T . PASS ASP;RSPOS=10145;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134 +chr1 10228 rs143255646 TA T . PASS ASP;RSPOS=10229;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134 +chr1 10234 rs145599635 C T . 
PASS ASP;RSPOS=10234;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
+chr1 10248 rs148908337 A T . PASS ASP;RSPOS=10248;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
+chr1 10254 rs140194106 TA T . PASS ASP;RSPOS=10255;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134
+chr1 10291 rs145427775 C T . PASS ASP;RSPOS=10291;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
+chr1 10327 rs112750067 T C . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10327;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=132
+chr1 10329 rs150969722 AC A . PASS ASP;RSPOS=10330;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134
+chr1 10351 rs145072688 CTA C,CA . PASS ASP;RSPOS=10352;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134
+chr1 10382 rs147093981 AAC A,AC . PASS ASP;RSPOS=10383;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134
+chr1 10433 rs56289060 A AC . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10433;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
+chr1 10439 rs112766696 AC A . PASS ASP;GENEINFO=LOC100652771:100652771;GNO;RSPOS=10440;SAO=0;SLO;SSR=0;VC=DIV;VP=050100000004000100000200;WGT=0;dbSNPBuildID=132
+chr1 10439 rs138941843 AC A . PASS ASP;RSPOS=10440;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134
+chr1 10440 rs112155239 C A . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10440;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=132
+chr1 10492 rs55998931 C T . PASS ASP;GENEINFO=LOC100652771:100652771;GMAF=0.0617001828153565;RSPOS=10492;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040000000100;WGT=0;dbSNPBuildID=129
+chr1 10519 rs62636508 G C . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10519;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=129
+chr1 10583 rs58108140 G A . PASS ASP;GENEINFO=LOC100652771:100652771;GMAF=0.270566727605119;KGPilot123;RSPOS=10583;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040010000100;WGT=0;dbSNPBuildID=129
+chr1 10611 rs189107123 C G . PASS KGPilot123;RSPOS=10611;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 10828 rs10218492 G A . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10828;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=119
+chr1 10904 rs10218493 G A . PASS ASP;GENEINFO=LOC100652771:100652771;GNO;RSPOS=10904;SAO=0;SSR=0;VC=SNV;VP=050000000004000100000100;WGT=0;dbSNPBuildID=119
+chr1 10927 rs10218527 A G . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10927;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=119
+chr1 10938 rs28853987 G A . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10938;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=125
+chr1 11014 rs28484712 G A . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=11014;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=125
+chr1 11022 rs28775022 G A . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=11022;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=125
+chr1 11081 rs10218495 G T . PASS CFL;GENEINFO=LOC100652771:100652771;GNO;RSPOS=11081;SAO=0;SSR=0;VC=SNV;VP=050000000008000100000100;WGT=0;dbSNPBuildID=119
+chr1 11863 rs187669455 C A . PASS RSPOS=11863;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=135
+chr1 13302 rs180734498 C T . PASS KGPilot123;RSPOS=13302;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 13327 rs144762171 G C . PASS ASP;KGPilot123;RSPOS=13327;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
+chr1 13684 rs71260404 C T . PASS GENEINFO=LOC100652771:100652771;GNO;RSPOS=13684;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000000000100000100;WGT=0;dbSNPBuildID=130
+chr1 13980 rs151276478 T C . PASS ASP;KGPilot123;RSPOS=13980;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
+chr1 14889 rs142444908 G A . PASS ASP;RSPOS=14889;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
+chr1 14907 rs79585140 A G . PASS GNO;RSPOS=14907;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000000040100000100;WGT=0;dbSNPBuildID=131
+chr1 14930 rs75454623 A G . PASS GNO;RSPOS=14930;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000000040100000100;WGT=0;dbSNPBuildID=131
+chr1 14976 rs71252251 G A . PASS ASP;GNO;RSPOS=14976;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000100000100;WGT=0;dbSNPBuildID=130
+chr1 15061 rs71268703 T TG . PASS ASP;GNO;RSPOS=15061;RV;SAO=0;SLO;SSR=0;VC=DIV;VP=050100000004000100000200;WGT=0;dbSNPBuildID=130
+chr1 15118 rs71252250 A G . PASS ASP;GNO;RSPOS=15118;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000100000100;WGT=0;dbSNPBuildID=130
+chr1 15211 rs144718396 T G . PASS ASP;RSPOS=15211;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
+chr1 15211 rs78601809 T G . PASS ASP;GNO;RSPOS=15211;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040100000100;WGT=0;dbSNPBuildID=131
+chr1 16257 rs78588380 G C . PASS ASP;GNO;RSPOS=16257;SAO=0;SSR=0;VC=SNV;VP=050000000004000100000100;WGT=0;dbSNPBuildID=131
+chr1 16378 rs148220436 T C . PASS ASP;RSPOS=16378;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
+chr1 16495 rs141130360 G C . PASS ASP;RSPOS=16495;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
+chr1 16497 rs150723783 A G . PASS ASP;RSPOS=16497;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
+chr1 17519 rs192890528 G T . PASS RSPOS=17519;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=135
+chr1 19226 rs138930629 T A . PASS ASP;RSPOS=19226;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
+chr1 20141 rs56336884 G A . PASS HD;RSPOS=20141;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000000000400000100;WGT=0;dbSNPBuildID=129
+chr1 20144 rs143346096 G A . PASS ASP;RSPOS=20144;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
+chr1 20206 rs71262675 C T . PASS GNO;RSPOS=20206;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000000000100000100;WGT=0;dbSNPBuildID=130
+chr1 20245 rs71262674 G A . PASS GMAF=0.256398537477148;GNO;RSPOS=20245;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000000000100000100;WGT=0;dbSNPBuildID=130
+chr1 20304 rs71262673 G C . PASS GMAF=0.338208409506399;GNO;RSPOS=20304;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000000000100000100;WGT=0;dbSNPBuildID=130
+chr1 26999 rs147506580 A G . PASS ASP;RSPOS=26999;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
+chr1 29436 rs2462493 G A . PASS GNO;RSPOS=29436;SAO=0;SSR=0;VC=SNV;VP=050000000000000100000100;WGT=0;dbSNPBuildID=100
+chr1 30923 rs140337953 G T . PASS ASP;KGPilot123;RSPOS=30923;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
+chr1 33487 rs77459554 C T . PASS ASP;GNO;RSPOS=33487;SAO=0;SSR=0;VC=SNV;VP=050000000004000100000100;WGT=0;dbSNPBuildID=131
+chr1 33495 rs75468675 C T . PASS ASP;GNO;RSPOS=33495;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040100000100;WGT=0;dbSNPBuildID=131
+chr1 33505 rs75627161 T C . PASS ASP;GNO;RSPOS=33505;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040100000100;WGT=0;dbSNPBuildID=131
+chr1 33508 rs75609629 A T . PASS ASP;GNO;RSPOS=33508;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040100000100;WGT=0;dbSNPBuildID=131
+chr1 33521 rs76098219 T A . PASS GNO;RSPOS=33521;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000000040100000100;WGT=0;dbSNPBuildID=131
+chr1 33593 rs557585 G A . PASS RSPOS=33593;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=83
+chr1 33648 rs62028204 G T . PASS RSPOS=33648;RV;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=129
+chr1 33656 rs113821789 T C . PASS RSPOS=33656;RV;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=132
+chr1 51476 rs187298206 T C . PASS KGPilot123;RSPOS=51476;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 51479 rs116400033 T A . PASS ASP;G5;G5A;GMAF=0.113802559414991;KGPilot123;RSPOS=51479;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004070010000100;WGT=0;dbSNPBuildID=132
+chr1 51803 rs62637812 T C . PASS GMAF=0.468921389396709;RSPOS=51803;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000000040000000100;WGT=0;dbSNPBuildID=129
+chr1 51898 rs76402894 C A . PASS GMAF=0.0731261425959781;GNO;RSPOS=51898;SAO=0;SSR=0;VC=SNV;VP=050000000000000100000100;WGT=0;dbSNPBuildID=131
+chr1 51914 rs190452223 T G . PASS KGPilot123;RSPOS=51914;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 51928 rs78732933 G A . PASS GNO;RSPOS=51928;SAO=0;SSR=0;VC=SNV;VP=050000000000000100000100;WGT=0;dbSNPBuildID=131
+chr1 51935 rs181754315 C T . PASS KGPilot123;RSPOS=51935;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 51954 rs185832753 G C . PASS KGPilot123;RSPOS=51954;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 52058 rs62637813 G C . PASS GMAF=0.0342778793418647;KGPilot123;RSPOS=52058;SAO=0;SSR=1;VC=SNV;VLD;VP=050000000000040010000140;WGT=0;dbSNPBuildID=129
+chr1 52144 rs190291950 T A . PASS KGPilot123;RSPOS=52144;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 52238 rs150021059 T G . PASS ASP;KGPilot123;RSPOS=52238;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
+chr1 54353 rs140052487 C A . PASS ASP;KGPilot123;RSPOS=54353;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
+chr1 54421 rs146477069 A G . PASS ASP;KGPilot123;RSPOS=54421;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
+chr1 54490 rs141149254 G A . PASS ASP;KGPilot123;RSPOS=54490;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
+chr1 54676 rs2462492 C T . PASS ASP;GMAF=0.191956124314442;GNO;HD;KGPilot123;RSPOS=54676;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040510000100;WGT=0;dbSNPBuildID=100
+chr1 54753 rs143174675 T G . PASS ASP;KGPilot123;RSPOS=54753;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
+chr1 54788 rs59861892 CC C,CCT . PASS ASP;RSPOS=54789;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
+chr1 54795 rs58014817 T A . PASS ASP;RSPOS=54795;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=129
+chr1 55164 rs3091274 C A . PASS G5;G5A;GMAF=0.145338208409506;GNO;KGPilot123;RSPOS=55164;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000000030110000100;WGT=0;dbSNPBuildID=103
+chr1 55299 rs10399749 C T . PASS G5;G5A;GMAF=0.278793418647166;GNO;KGPilot123;PH2;RSPOS=55299;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000000030112000100;WGT=0;dbSNPBuildID=119
+chr1 55302 rs3091273 C T . PASS RSPOS=55302;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=103
+chr1 55313 rs182462964 A T . PASS KGPilot123;RSPOS=55313;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 55322 rs3107974 C T . PASS RSPOS=55322;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=103
+chr1 55326 rs3107975 T C . PASS GNO;HD;KGPilot123;RSPOS=55326;SAO=0;SSR=0;VC=SNV;VP=050000000000000510000100;WGT=0;dbSNPBuildID=103
+chr1 55330 rs185215913 G A . PASS KGPilot123;RSPOS=55330;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 55367 rs190850374 G A . PASS KGPilot123;RSPOS=55367;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 55388 rs182711216 C T . PASS KGPilot123;RSPOS=55388;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 55394 rs2949420 T A . PASS GNO;KGPilot123;PH2;RSPOS=55394;SAO=0;SSR=0;VC=SNV;VP=050000000000000112000100;WGT=0;dbSNPBuildID=101
+chr1 55416 rs193242050 G A . PASS KGPilot123;RSPOS=55416;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 55427 rs183189405 T C . PASS KGPilot123;RSPOS=55427;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 55545 rs28396308 C T . PASS GNO;RSPOS=55545;SAO=0;SSR=0;VC=SNV;VP=050000000000000100000100;WGT=0;dbSNPBuildID=125
+chr1 55816 rs187434873 G A . PASS KGPilot123;RSPOS=55816;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 55850 rs191890754 C G . PASS KGPilot123;RSPOS=55850;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 55852 rs184233019 G C . PASS KGPilot123;RSPOS=55852;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 56644 rs143342222 A C . PASS ASP;KGPilot123;RSPOS=56644;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
+chr1 57952 rs189727433 A C . PASS KGPilot123;RSPOS=57952;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 58771 rs140128481 T C . PASS ASP;RSPOS=58771;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
+chr1 58814 rs114420996 G A . PASS ASP;G5;GMAF=0.0982632541133455;KGPilot123;RSPOS=58814;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004050010000100;WGT=0;dbSNPBuildID=132
+chr1 59040 rs149755937 T C . PASS ASP;KGPilot123;RSPOS=59040;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
+chr1 60718 rs78395614 G A . PASS CFL;GNO;RSPOS=60718;SAO=0;SSR=0;VC=SNV;VP=050000000008000100000100;WGT=0;dbSNPBuildID=131
+chr1 60726 rs192328835 C A . PASS KGPilot123;RSPOS=60726;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 60791 rs76199781 A G . PASS CFL;GNO;RSPOS=60791;SAO=0;SSR=0;VC=SNV;VP=050000000008000100000100;WGT=0;dbSNPBuildID=131
+chr1 61442 rs74970982 A G . PASS CFL;GMAF=0.076782449725777;GNO;KGPilot123;RSPOS=61442;SAO=0;SSR=0;VC=SNV;VP=050000000008000110000100;WGT=0;dbSNPBuildID=131
+chr1 61462 rs56992750 T A . PASS CFL;G5;G5A;GMAF=0.0383912248628885;GNO;KGPilot123;RSPOS=61462;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000008030110000100;WGT=0;dbSNPBuildID=129
+chr1 61480 rs75526266 G C . PASS CFL;GNO;RSPOS=61480;SAO=0;SSR=0;VC=SNV;VP=050000000008000100000100;WGT=0;dbSNPBuildID=131
+chr1 61499 rs75719746 G A . PASS CFL;GNO;RSPOS=61499;SAO=0;SSR=0;VC=SNV;VP=050000000008000100000100;WGT=0;dbSNPBuildID=131
+chr1 61743 rs184286948 G C . PASS KGPilot123;RSPOS=61743;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 61920 rs62637820 G A . PASS CFL;GMAF=0.0255941499085923;RSPOS=61920;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000008040000000100;WGT=0;dbSNPBuildID=129
+chr1 61987 rs76735897 A G . PASS CFL;GMAF=0.292961608775137;GNO;KGPilot123;RSPOS=61987;SAO=0;SSR=0;VC=SNV;VP=050000000008000110000100;WGT=0;dbSNPBuildID=131
+chr1 61989 rs77573425 G C . PASS CFL;GMAF=0.309414990859232;GNO;KGPilot123;RSPOS=61989;SAO=0;SSR=0;VC=SNV;VP=050000000008000110000100;WGT=0;dbSNPBuildID=131
+chr1 61993 rs190553843 C T . PASS KGPilot123;RSPOS=61993;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 62156 rs181864839 C T . PASS KGPilot123;RSPOS=62156;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 62157 rs10399597 G A . PASS CFL;GMAF=0.00228519195612431;KGPilot123;RSPOS=62157;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000008040010000100;WGT=0;dbSNPBuildID=119
+chr1 62162 rs140556834 G A . PASS ASP;KGPilot123;RSPOS=62162;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
+chr1 62203 rs28402963 T C . PASS CFL;KGPilot123;RSPOS=62203;SAO=0;SSR=0;VC=SNV;VP=050000000008000010000100;WGT=0;dbSNPBuildID=125
+chr1 62271 rs28599927 A G . PASS CFL;GMAF=0.138482632541133;RSPOS=62271;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000008040000000100;WGT=0;dbSNPBuildID=125
+chr1 63268 rs75478250 T C . PASS CFL;GNO;RSPOS=63268;SAO=0;SSR=0;VC=SNV;VP=050000000008000100000100;WGT=0;dbSNPBuildID=131
+chr1 63276 rs185977555 G A . PASS KGPilot123;RSPOS=63276;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 63297 rs188886746 G A . PASS KGPilot123;RSPOS=63297;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 63671 rs116440577 G A . PASS ASP;G5;GMAF=0.170018281535649;KGPilot123;RSPOS=63671;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004050010000100;WGT=0;dbSNPBuildID=132
+chr1 63737 rs77426996 TACT T,TCTA . PASS CFL;RSPOS=63738;SAO=0;SSR=0;VC=DIV;VP=050000000008000000000200;WGT=0;dbSNPBuildID=131
+chr1 64649 rs181431124 A C . PASS KGPilot123;RSPOS=64649;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 66008 rs2691286 C G . PASS CFL;GNO;RSPOS=66008;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000008000100000100;WGT=0;dbSNPBuildID=100
+chr1 66162 rs62639105 A T . PASS CFL;GMAF=0.320383912248629;GNO;KGPilot123;RSPOS=66162;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000008000110000100;WGT=0;dbSNPBuildID=129
+chr1 66176 rs28552463 T A . PASS CFL;GMAF=0.0484460694698355;KGPilot123;RSPOS=66176;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000008040010000100;WGT=0;dbSNPBuildID=125
+chr1 66219 rs181028663 A T . PASS KGPilot123;RSPOS=66219;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 66238 rs113961546 T A . PASS CFL;GNO;RSPOS=66238;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000008000100000100;WGT=0;dbSNPBuildID=132
+chr1 66314 rs28534012 T A . PASS CFL;RSPOS=66314;SAO=0;SSR=0;VC=SNV;VP=050000000008000000000100;WGT=0;dbSNPBuildID=125
+chr1 66331 rs186063952 A C . PASS KGPilot123;RSPOS=66331;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 66334 rs28464214 T A . PASS CFL;RSPOS=66334;SAO=0;SSR=0;VC=SNV;VP=050000000008000000000100;WGT=0;dbSNPBuildID=125
+chr1 66442 rs192044252 T A . PASS KGPilot123;RSPOS=66442;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 66457 rs13328655 T A . PASS CFL;GMAF=0.0795246800731261;KGPilot123;RSPOS=66457;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000008040010000100;WGT=0;dbSNPBuildID=121
+chr1 66503 rs112350669 T A . PASS CFL;RSPOS=66503;SAO=0;SSR=0;VC=SNV;VP=050000000008000000000100;WGT=0;dbSNPBuildID=132
+chr1 66507 rs12401368 T A . PASS CFL;GMAF=0.479890310786106;KGPilot123;RSPOS=66507;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000008040010000100;WGT=0;dbSNPBuildID=120
+chr1 66651 rs2257270 A T . PASS CFL;GNO;RSPOS=66651;SAO=0;SSR=0;VC=SNV;VP=050000000008000100000100;WGT=0;dbSNPBuildID=100
+chr1 67179 rs149952626 C G . PASS ASP;KGPilot123;RSPOS=67179;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
+chr1 67181 rs77662731 A G . PASS ASP;G5;G5A;GENEINFO=OR4F5:79501;GMAF=0.0470749542961609;GNO;KGPilot123;RSPOS=67181;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004070110000100;WGT=0;dbSNPBuildID=131
+chr1 67223 rs78676975 C A . PASS ASP;GENEINFO=OR4F5:79501;GNO;RSPOS=67223;SAO=0;SSR=0;VC=SNV;VP=050000000004000100000100;WGT=0;dbSNPBuildID=131
+chr1 69428 rs140739101 T G . PASS ASP;RSPOS=69428;S3D;SAO=0;SSR=0;VC=SNV;VLD;VP=050200000004040000000100;WGT=0;dbSNPBuildID=134
+chr1 69453 rs142004627 G A . PASS ASP;RSPOS=69453;S3D;SAO=0;SSR=0;VC=SNV;VP=050200000004000000000100;WGT=0;dbSNPBuildID=134
+chr1 69476 rs148502021 T C . PASS ASP;RSPOS=69476;S3D;SAO=0;SSR=0;VC=SNV;VLD;VP=050200000004040000000100;WGT=0;dbSNPBuildID=134
+chr1 69496 rs150690004 G A . PASS ASP;RSPOS=69496;S3D;SAO=0;SSR=0;VC=SNV;VLD;VP=050200000004040000000100;WGT=0;dbSNPBuildID=134
+chr1 69511 rs75062661 A G . PASS GENEINFO=OR4F5:79501;GMAF=0.193784277879342;GNO;KGPilot123;RSPOS=69511;S3D;SAO=0;SSR=0;VC=SNV;VP=050200000000000110000100;WGT=0;dbSNPBuildID=131
+chr1 69534 rs190717287 T C . PASS KGPilot123;RSPOS=69534;S3D;SAO=0;SSR=0;VC=SNV;VP=050200000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 69552 rs55874132 G C . PASS GENEINFO=OR4F5:79501;HD;RSPOS=69552;S3D;SAO=0;SLO;SSR=0;VC=SNV;VLD;VP=050300000000040400000100;WGT=0;dbSNPBuildID=129
+chr1 69590 rs141776804 T A . PASS ASP;RSPOS=69590;S3D;SAO=0;SSR=0;VC=SNV;VP=050200000004000000000100;WGT=0;dbSNPBuildID=134
+chr1 69594 rs144967600 T C . PASS ASP;RSPOS=69594;S3D;SAO=0;SSR=0;VC=SNV;VP=050200000004000000000100;WGT=0;dbSNPBuildID=134
+chr1 72148 rs182862337 C T . PASS KGPilot123;RSPOS=72148;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 73841 rs143773730 C T . PASS ASP;KGPilot123;RSPOS=73841;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
+chr1 74651 rs62641291 G A . PASS RSPOS=74651;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=129
+chr1 74681 rs13328683 G T . PASS CFL;GMAF=0.286106032906764;RSPOS=74681;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000008040000000100;WGT=0;dbSNPBuildID=121
+chr1 74709 rs62641292 T A . PASS CFL;RSPOS=74709;SAO=0;SSR=0;VC=SNV;VP=050000000008000000000100;WGT=0;dbSNPBuildID=129
+chr1 74771 rs13328675 A G . PASS CFL;RSPOS=74771;SAO=0;SSR=0;VC=SNV;VP=050000000008000000000100;WGT=0;dbSNPBuildID=121
+chr1 74790 rs13328700 C G . PASS CFL;RSPOS=74790;SAO=0;SSR=0;VC=SNV;VP=050000000008000000000100;WGT=0;dbSNPBuildID=121
+chr1 74792 rs13328684 G A . PASS CFL;RSPOS=74792;SAO=0;SSR=0;VC=SNV;VP=050000000008000000000100;WGT=0;dbSNPBuildID=121
+chr1 77462 rs188023513 G A . PASS KGPilot123;RSPOS=77462;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 77470 rs192898053 T C . PASS KGPilot123;RSPOS=77470;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 77874 rs184538873 G A . PASS KGPilot123;RSPOS=77874;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 77961 rs78385339 G A . PASS GMAF=0.125685557586837;KGPilot123;RSPOS=77961;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000000040010000100;WGT=0;dbSNPBuildID=131
+chr1 79033 rs62641298 A G . PASS GMAF=0.438299817184644;GNO;HD;KGPilot123;RSPOS=79033;SAO=0;SSR=0;VC=SNV;VP=050000000000000510000100;WGT=0;dbSNPBuildID=129
+chr1 79050 rs62641299 G T . PASS GMAF=0.224405850091408;GNO;KGPilot123;RSPOS=79050;SAO=0;SSR=0;VC=SNV;VP=050000000000000110000100;WGT=0;dbSNPBuildID=129
+chr1 79137 rs143777184 A T . PASS ASP;KGPilot123;RSPOS=79137;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
+chr1 79417 rs184768190 C T . PASS KGPilot123;RSPOS=79417;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 79418 rs2691296 G C . PASS GMAF=0.0178244972577697;RSPOS=79418;RV;SAO=0;SLO;SSR=0;VC=SNV;VLD;VP=050100000000040000000100;WGT=0;dbSNPBuildID=100
+chr1 79538 rs2691295 C T . PASS RSPOS=79538;RV;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=100
+chr1 79772 rs147215883 C G . PASS ASP;KGPilot123;RSPOS=79772;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
+chr1 79872 rs189224661 T G . PASS KGPilot123;RSPOS=79872;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 80323 rs3942603 G C . PASS CFL;GNO;RSPOS=80323;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000008000100000100;WGT=0;dbSNPBuildID=108
+chr1 80386 rs3878915 C A . PASS GMAF=0.0118829981718464;RSPOS=80386;RV;SAO=0;SLO;SSR=0;VC=SNV;VLD;VP=050100000000040000000100;WGT=0;dbSNPBuildID=108
+chr1 80454 rs144226842 G C . PASS ASP;KGPilot123;RSPOS=80454;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
+chr1 81836 rs2259560 A T . PASS ASP;GNO;RSPOS=81836;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000100000100;WGT=0;dbSNPBuildID=100
+chr1 81949 rs181567186 T C . PASS KGPilot123;RSPOS=81949;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 81962 rs4030308 T TAA . PASS ASP;RSPOS=81962;RV;SAO=0;SLO;SSR=0;VC=DIV;VP=050100000004000000000200;WGT=0;dbSNPBuildID=108
+chr1 82102 rs4030307 C T . PASS ASP;RSPOS=82102;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000000000100;WGT=0;dbSNPBuildID=108
+chr1 82103 rs2020400 T C . PASS ASP;RSPOS=82103;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000000000100;WGT=0;dbSNPBuildID=92
+chr1 82126 rs1815133 C T . PASS ASP;RSPOS=82126;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000000000100;WGT=0;dbSNPBuildID=92
+chr1 82133 rs4030306 CA C,CAAAAAAAAAAAAAAA . PASS ASP;RSPOS=82136;RV;SAO=0;SLO;SSR=0;VC=DIV;VP=050100000004000000000200;WGT=0;dbSNPBuildID=108
+chr1 82154 rs4477212 A G . PASS ASP;HD;RSPOS=82154;SAO=0;SSR=0;VC=SNV;VP=050000000004000400000100;WGT=0;dbSNPBuildID=111
+chr1 82162 rs1815132 C A . PASS ASP;GMAF=0.0351919561243144;GNO;RSPOS=82162;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000100000100;WGT=0;dbSNPBuildID=92
+chr1 82163 rs139113303 G A . PASS ASP;KGPilot123;RSPOS=82163;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
+chr1 82196 rs112844054 A T . PASS ASP;RSPOS=82196;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=132
+chr1 82249 rs1851945 A G . PASS ASP;GMAF=0.0452468007312614;KGPilot123;RSPOS=82249;RV;SAO=0;SLO;SSR=0;VC=SNV;VLD;VP=050100000004040010000100;WGT=0;dbSNPBuildID=92
+chr1 82282 rs3871775 G A . PASS ASP;RSPOS=82282;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000000000100;WGT=0;dbSNPBuildID=108
+chr1 82303 rs3871776 T C . PASS ASP;RSPOS=82303;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000000000100;WGT=0;dbSNPBuildID=108
+chr1 82316 rs4030305 A C . PASS ASP;GNO;RSPOS=82316;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000100000100;WGT=0;dbSNPBuildID=108
+chr1 82609 rs149189449 C G . PASS ASP;KGPilot123;RSPOS=82609;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
+chr1 82676 rs185237834 T G . PASS KGPilot123;RSPOS=82676;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 82734 rs4030331 T C . PASS ASP;GMAF=0.261882998171846;KGPilot123;RSPOS=82734;RV;SAO=0;SLO;SSR=0;VC=SNV;VLD;VP=050100000004040010000100;WGT=0;dbSNPBuildID=108
+chr1 82957 rs189774606 C T . PASS KGPilot123;RSPOS=82957;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 83084 rs181193408 T A . PASS KGPilot123;RSPOS=83084;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 83088 rs186081601 G C . PASS KGPilot123;RSPOS=83088;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 83107 rs4405097 G C . PASS ASP;RSPOS=83107;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=111
+chr1 83119 rs4030324 AA A,ATAAC . PASS ASP;RSPOS=83120;RV;SAO=0;SLO;SSR=0;VC=DIV;VP=050100000004000000000200;WGT=0;dbSNPBuildID=108
+chr1 83771 rs189906733 T G . PASS KGPilot123;RSPOS=83771;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 83786 rs58520670 T TA . PASS ASP;RSPOS=83794;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
+chr1 83815 rs58857344 GAGAA G . PASS ASP;RSPOS=83827;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
+chr1 83826 rs71281475 AAAGA A,AAA . PASS ASP;GNO;RSPOS=83827;RV;SAO=0;SLO;SSR=0;VC=DIV;VP=050100000004000100000200;WGT=0;dbSNPBuildID=130
+chr1 83855 rs59596480 GAA G . PASS ASP;RSPOS=83857;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
+chr1 83872 rs59556914 AA A,AAGA . PASS ASP;RSPOS=83873;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
+chr1 83884 rs59586754 GAAA G . PASS ASP;RSPOS=83885;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
+chr1 83897 rs61330047 GAA G . PASS ASP;RSPOS=83899;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
+chr1 83901 rs58254183 GAAAGAA G . PASS ASP;RSPOS=83903;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
+chr1 83921 rs61338823 GAA G . PASS ASP;RSPOS=83923;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
+chr1 83930 rs71281474 AG A,AGA . PASS ASP;GNO;RSPOS=83931;RV;SAO=0;SLO;SSR=0;VC=DIV;VP=050100000004000100000200;WGT=0;dbSNPBuildID=130
+chr1 83934 rs59235392 AG A,AGAAA . PASS ASP;RSPOS=83935;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
+chr1 83977 rs180759811 A G . PASS KGPilot123;RSPOS=83977;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 84002 rs28850140 G A . PASS ASP;GMAF=0.138939670932358;KGPilot123;RSPOS=84002;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040010000100;WGT=0;dbSNPBuildID=125
+chr1 84010 rs186443818 G A . PASS KGPilot123;RSPOS=84010;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 84018 rs61352176 GAA G . PASS ASP;RSPOS=84020;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
+chr1 84079 rs190867312 T C . PASS KGPilot123;RSPOS=84079;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 84139 rs183605470 A T . PASS KGPilot123;RSPOS=84139;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 84156 rs188652299 A C . PASS KGPilot123;RSPOS=84156;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 84244 rs191297051 A C . PASS KGPilot123;RSPOS=84244;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 84295 rs183209871 G A . PASS KGPilot123;RSPOS=84295;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 84346 rs187855973 T C . PASS KGPilot123;RSPOS=84346;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 84453 rs191379015 C G . PASS KGPilot123;RSPOS=84453;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
+chr1 84705 rs183470350 T G . PASS KGPilot123;RSPOS=84705;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
diff --git a/public/testdata/exampleFASTA.fasta.amb b/public/testdata/exampleFASTA.fasta.amb
new file mode 100644
index 0000000000..986e6d6034
--- /dev/null
+++ b/public/testdata/exampleFASTA.fasta.amb
@@ -0,0 +1 @@
+100000 1 0
diff --git a/public/testdata/exampleFASTA.fasta.ann b/public/testdata/exampleFASTA.fasta.ann
new file mode 100644
index 0000000000..642ddb6d75
--- /dev/null
+++ b/public/testdata/exampleFASTA.fasta.ann
@@ -0,0 +1,3 @@
+100000 1 11
+0 chr1 (null)
+0 100000 0
diff --git a/public/testdata/exampleFASTA.fasta.bwt b/public/testdata/exampleFASTA.fasta.bwt
new file mode 100644
index 0000000000..fe74222804
Binary files /dev/null and b/public/testdata/exampleFASTA.fasta.bwt differ
diff --git a/public/testdata/exampleFASTA.fasta.pac b/public/testdata/exampleFASTA.fasta.pac
new file mode 100644
index 0000000000..b0f55c0c4d
Binary files /dev/null and b/public/testdata/exampleFASTA.fasta.pac differ
diff --git a/public/testdata/exampleFASTA.fasta.rbwt b/public/testdata/exampleFASTA.fasta.rbwt
new file mode 100644
index 0000000000..f623b8c394
Binary files /dev/null and b/public/testdata/exampleFASTA.fasta.rbwt differ
diff --git a/public/testdata/exampleFASTA.fasta.rpac b/public/testdata/exampleFASTA.fasta.rpac
new file mode 100644
index 0000000000..b88ff49eb9
Binary files /dev/null and b/public/testdata/exampleFASTA.fasta.rpac differ
diff --git a/public/testdata/exampleFASTA.fasta.rsa b/public/testdata/exampleFASTA.fasta.rsa
new file mode 100644
index 0000000000..6e7e213df6
Binary files /dev/null and b/public/testdata/exampleFASTA.fasta.rsa differ
diff --git a/public/testdata/exampleFASTA.fasta.sa b/public/testdata/exampleFASTA.fasta.sa
new file mode 100644
index 0000000000..d6db971b7d
Binary files /dev/null and b/public/testdata/exampleFASTA.fasta.sa differ
diff --git a/public/testdata/overlapTest.bed b/public/testdata/overlapTest.bed
new file mode 100644
index 0000000000..6859f1fdc9
--- /dev/null
+++ b/public/testdata/overlapTest.bed
@@ -0,0 +1,3 @@
+20 315000 315100 # should overlap 2 in withSymbolic.vcf at 315006 and 315072
+20 316955 316959 # should overlap only deletion variant at 316952
+20 317900 400000 # should overlap only the symbolic variant at 317173
diff --git a/public/testdata/withSymbolic.b37.vcf b/public/testdata/withSymbolic.b37.vcf
new file mode 100644
index 0000000000..4974f12296
--- /dev/null
+++ b/public/testdata/withSymbolic.b37.vcf
@@ -0,0 +1,96 @@
+##fileformat=VCFv4.1
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##ALT=
+##FORMAT=
+##FORMAT=
+##FORMAT=
+##FORMAT=
+#CHROM POS ID REF ALT QUAL FILTER INFO
+20 315006 . A G 100 PASS LCSNP;EXSNP;BAVGPOST=0.998;BRSQ=0.721;LDAF=0.0025;AVGPOST=0.9978;RSQ=0.6449;ERATE=0.0008;THETA=0.0006;AC=4;AN=2184
+20 315072 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.996;LDAF=0.0057;AVGPOST=0.9996;RSQ=0.9743;ERATE=0.0003;THETA=0.0016;AC=12;AN=2184
+20 315162 . T G 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.950;LDAF=0.0005;AVGPOST=0.9998;RSQ=0.8078;ERATE=0.0003;THETA=0.0004;AC=1;AN=2184
+20 315168 rs71327439 G GA 7142 PASS INDEL;BAVGPOST=0.999;BRSQ=0.990;LDAF=0.0575;AVGPOST=0.9985;RSQ=0.9891;ERATE=0.0003;THETA=0.0004;AC=125;AN=2184
+20 315201 . A G 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.965;LDAF=0.0005;AVGPOST=0.9999;RSQ=0.8599;ERATE=0.0003;THETA=0.0008;AC=1;AN=2184
+20 315214 . T C 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.974;LDAF=0.0009;AVGPOST=0.9990;RSQ=0.5860;ERATE=0.0003;THETA=0.0005;AC=1;AN=2184
+20 315270 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.999;LDAF=0.0557;AVGPOST=0.9992;RSQ=0.9950;ERATE=0.0003;THETA=0.0004;AC=121;AN=2184
+20 315279 . A G 100 PASS LCSNP;EXSNP;BAVGPOST=0.999;BRSQ=0.991;LDAF=0.0572;AVGPOST=0.9990;RSQ=0.9926;ERATE=0.0003;THETA=0.0013;AC=125;AN=2184
+20 315320 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.995;LDAF=0.0028;AVGPOST=0.9998;RSQ=0.9594;ERATE=0.0005;THETA=0.0003;AC=6;AN=2184
+20 315322 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=0.999;BRSQ=0.814;LDAF=0.0007;AVGPOST=0.9994;RSQ=0.6510;ERATE=0.0003;THETA=0.0004;AC=1;AN=2184
+20 315431 . A T 100 PASS LCSNP;EXSNP;BAVGPOST=0.987;BRSQ=0.864;LDAF=0.0431;AVGPOST=0.9873;RSQ=0.8675;ERATE=0.0006;THETA=0.0010;AC=86;AN=2184
+20 315481 . T A 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.998;LDAF=0.0619;AVGPOST=0.9993;RSQ=0.9948;ERATE=0.0003;THETA=0.0007;AC=135;AN=2184
+20 315490 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.984;LDAF=0.0138;AVGPOST=0.9971;RSQ=0.9260;ERATE=0.0003;THETA=0.0046;AC=31;AN=2184
+20 315523 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.822;LDAF=0.0015;AVGPOST=0.9988;RSQ=0.6777;ERATE=0.0005;THETA=0.0003;AC=2;AN=2184
+20 315547 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=0.999;BRSQ=0.893;LDAF=0.0026;AVGPOST=0.9995;RSQ=0.9024;ERATE=0.0003;THETA=0.0004;AC=5;AN=2184
+20 315549 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=0.999;BRSQ=0.888;LDAF=0.0029;AVGPOST=0.9988;RSQ=0.8565;ERATE=0.0003;THETA=0.0013;AC=5;AN=2184
+20 315551 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.829;LDAF=0.0008;AVGPOST=0.9992;RSQ=0.5723;ERATE=0.0003;THETA=0.0012;AC=1;AN=2184
+20 315704 . G C 100 PASS LCSNP;EXSNP;BAVGPOST=0.998;BRSQ=0.945;LDAF=0.0184;AVGPOST=0.9978;RSQ=0.9523;ERATE=0.0003;THETA=0.0017;AC=40;AN=2184
+20 315798 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.773;LDAF=0.0012;AVGPOST=0.9985;RSQ=0.4929;ERATE=0.0003;THETA=0.0005;AC=1;AN=2184
+20 315842 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.979;LDAF=0.0110;AVGPOST=0.9991;RSQ=0.9673;ERATE=0.0003;THETA=0.0005;AC=23;AN=2184
+20 315876 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.926;LDAF=0.0018;AVGPOST=0.9988;RSQ=0.7731;ERATE=0.0003;THETA=0.0007;AC=3;AN=2184
+20 316028 . G C 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.999;LDAF=0.0714;AVGPOST=0.9997;RSQ=0.9982;ERATE=0.0003;THETA=0.0003;AC=156;AN=2184
+20 316055 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=0.999;BRSQ=0.997;LDAF=0.1006;AVGPOST=0.9993;RSQ=0.9969;ERATE=0.0003;THETA=0.0007;AC=220;AN=2184
+20 316137 . G C 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.976;LDAF=0.0037;AVGPOST=0.9982;RSQ=0.7861;ERATE=0.0004;THETA=0.0009;AC=6;AN=2184
+20 316142 . C A 100 PASS LCSNP;EXSNP;BAVGPOST=0.998;BRSQ=0.793;LDAF=0.0033;AVGPOST=0.9980;RSQ=0.7527;ERATE=0.0003;THETA=0.0003;AC=6;AN=2184
+20 316143 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=0.999;BRSQ=0.839;LDAF=0.0034;AVGPOST=0.9984;RSQ=0.8054;ERATE=0.0003;THETA=0.0003;AC=6;AN=2184
+20 316211 . C G 100 PASS LCSNP;EXSNP;BAVGPOST=0.997;BRSQ=0.976;LDAF=0.0565;AVGPOST=0.9983;RSQ=0.9872;ERATE=0.0003;THETA=0.0010;AC=124;AN=2184
+20 316285 . A AT 5514 PASS INDEL;BAVGPOST=0.999;BRSQ=0.993;LDAF=0.0552;AVGPOST=0.9978;RSQ=0.9829;ERATE=0.0004;THETA=0.0005;AC=119;AN=2184
+20 316295 . A G 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.808;LDAF=0.0021;AVGPOST=0.9980;RSQ=0.6390;ERATE=0.0003;THETA=0.0008;AC=4;AN=2184
+20 316481 . G T 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=1.000;LDAF=0.0499;AVGPOST=0.9997;RSQ=0.9970;ERATE=0.0003;THETA=0.0007;AC=109;AN=2184
+20 316488 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.897;LDAF=0.0011;AVGPOST=0.9997;RSQ=0.8509;ERATE=0.0003;THETA=0.0006;AC=2;AN=2184
+20 316553 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.945;LDAF=0.0007;AVGPOST=0.9996;RSQ=0.7074;ERATE=0.0003;THETA=0.0004;AC=1;AN=2184
+20 316659 . T C 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.998;LDAF=0.0497;AVGPOST=0.9995;RSQ=0.9960;ERATE=0.0003;THETA=0.0007;AC=109;AN=2184
+20 316691 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.985;LDAF=0.0058;AVGPOST=0.9989;RSQ=0.9301;ERATE=0.0003;THETA=0.0003;AC=13;AN=2184
+20 316700 . A G 100 PASS LCSNP;EXSNP;BAVGPOST=0.999;BRSQ=0.737;LDAF=0.0030;AVGPOST=0.9971;RSQ=0.5700;ERATE=0.0009;THETA=0.0016;AC=4;AN=2184
+20 316725 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.853;LDAF=0.0011;AVGPOST=0.9995;RSQ=0.7944;ERATE=0.0003;THETA=0.0004;AC=2;AN=2184
+20 316770 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.966;LDAF=0.0017;AVGPOST=0.9991;RSQ=0.7543;ERATE=0.0005;THETA=0.0005;AC=3;AN=2184
+20 316813 . GTTC G 1701 PASS INDEL;BAVGPOST=0.995;BRSQ=0.850;LDAF=0.0156;AVGPOST=0.9987;RSQ=0.9611;ERATE=0.0003;THETA=0.0006;AC=34;AN=2184
+20 316824 rs11472305 T TTTC 5129 PASS INDEL;BAVGPOST=0.979;BRSQ=0.863;LDAF=0.0697;AVGPOST=0.9782;RSQ=0.8837;ERATE=0.0016;THETA=0.0006;AC=153;AN=2184
+20 316839 . CT C 1122 PASS INDEL;BAVGPOST=0.980;BRSQ=0.804;LDAF=0.0505;AVGPOST=0.9743;RSQ=0.8100;ERATE=0.0040;THETA=0.0009;AC=104;AN=2184
+20 316841 . TTC T 1096 PASS INDEL;BAVGPOST=0.981;BRSQ=0.824;LDAF=0.0491;AVGPOST=0.9783;RSQ=0.8367;ERATE=0.0029;THETA=0.0005;AC=105;AN=2184
+20 316842 . TCC T 110 PASS INDEL;BAVGPOST=0.983;BRSQ=0.592;LDAF=0.0203;AVGPOST=0.9772;RSQ=0.5576;ERATE=0.0022;THETA=0.0005;AC=30;AN=2184
+20 316845 . T C 100 PASS LCSNP;EXSNP;BAVGPOST=0.999;BRSQ=0.645;LDAF=0.0015;AVGPOST=0.9989;RSQ=0.6666;ERATE=0.0003;THETA=0.0010;AC=2;AN=2184
+20 316853 . T C 100 PASS LCSNP;EXSNP;BAVGPOST=0.990;BRSQ=0.922;LDAF=0.0688;AVGPOST=0.9742;RSQ=0.8456;ERATE=0.0049;THETA=0.0006;AC=133;AN=2184
+20 316882 rs11479165 CT C 145 PASS INDEL;BAVGPOST=0.991;BRSQ=0.484;LDAF=0.0074;AVGPOST=0.9920;RSQ=0.5655;ERATE=0.0005;THETA=0.0014;AC=9;AN=2184
+20 316889 . T TTC 108 PASS INDEL;BAVGPOST=0.978;BRSQ=0.235;LDAF=0.0521;AVGPOST=0.9304;RSQ=0.4479;ERATE=0.0035;THETA=0.0016;AC=63;AN=2184
+20 316901 . T TTC 272 PASS INDEL;BAVGPOST=0.979;BRSQ=0.510;LDAF=0.0363;AVGPOST=0.9527;RSQ=0.4348;ERATE=0.0071;THETA=0.0003;AC=34;AN=2184
+20 316917 . CT C 67 PASS INDEL;BAVGPOST=0.983;BRSQ=0.483;LDAF=0.0265;AVGPOST=0.9828;RSQ=0.7509;ERATE=0.0006;THETA=0.0011;AC=51;AN=2184
+20 316918 rs112071142 TTTC T 3049 PASS INDEL;BAVGPOST=0.980;BRSQ=0.801;LDAF=0.0588;AVGPOST=0.9761;RSQ=0.8444;ERATE=0.0026;THETA=0.0005;AC=120;AN=2184
+20 316924 . CT C 8 PASS INDEL;BAVGPOST=0.999;BRSQ=0.581;LDAF=0.0053;AVGPOST=0.9932;RSQ=0.5513;ERATE=0.0004;THETA=0.0011;AC=6;AN=2184
+20 316936 . T C 100 PASS LCSNP;EXSNP;BAVGPOST=0.994;BRSQ=0.651;LDAF=0.0098;AVGPOST=0.9884;RSQ=0.5505;ERATE=0.0018;THETA=0.0005;AC=13;AN=2184
+20 316939 . T C 100 PASS LCSNP;EXSNP;BAVGPOST=0.997;BRSQ=0.720;LDAF=0.0071;AVGPOST=0.9899;RSQ=0.4845;ERATE=0.0010;THETA=0.0033;AC=8;AN=2184
+20 316952 . CTCTTCCTCTTCT C 15835 PASS INDEL;BAVGPOST=0.891;BRSQ=0.667;LDAF=0.1650;AVGPOST=0.9042;RSQ=0.7274;ERATE=0.0037;THETA=0.0008;AC=294;AN=2184
+20 317003 . TCC T 333 PASS INDEL;BAVGPOST=0.890;BRSQ=0.114;LDAF=0.3450;AVGPOST=0.5526;RSQ=0.1674;ERATE=0.0300;THETA=0.0011;AC=350;AN=2184
+20 317022 rs111424933 TTTC T 4776 PASS INDEL;BAVGPOST=0.982;BRSQ=0.865;LDAF=0.0561;AVGPOST=0.9837;RSQ=0.8889;ERATE=0.0017;THETA=0.0004;AC=110;AN=2184
+20 317057 . C A 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.775;LDAF=0.0014;AVGPOST=0.9985;RSQ=0.6049;ERATE=0.0003;THETA=0.0002;AC=3;AN=2184
+20 317135 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=0.995;BRSQ=0.992;LDAF=0.0489;AVGPOST=0.9964;RSQ=0.9749;ERATE=0.0004;THETA=0.0025;AC=106;AN=2184
+20 317173 MERGED_DEL_2_99440 A . . SV;BAVGPOST=0.998;BRSQ=0.577;LDAF=0.0018;AVGPOST=0.9990;RSQ=0.7465;ERATE=0.0003;THETA=0.0007;CIEND=-61,92;CIPOS=-84,73;END=319201;SOURCE=BreakDancer_317182_319211,GenomeStrip_317164_319190;SVTYPE=DEL;AC=2;AN=2184
+20 317174 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=0.995;BRSQ=0.991;LDAF=0.0502;AVGPOST=0.9962;RSQ=0.9727;ERATE=0.0005;THETA=0.0007;AC=107;AN=2184
+20 317266 . T C 100 PASS LCSNP;EXSNP;BAVGPOST=0.999;BRSQ=0.999;LDAF=0.1748;AVGPOST=0.9988;RSQ=0.9973;ERATE=0.0003;THETA=0.0005;AC=383;AN=2184
+20 317448 . A G 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.999;LDAF=0.0068;AVGPOST=0.9988;RSQ=0.9405;ERATE=0.0003;THETA=0.0002;AC=16;AN=2184
+20 317491 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=0.999;BRSQ=0.991;LDAF=0.0609;AVGPOST=0.9993;RSQ=0.9939;ERATE=0.0003;THETA=0.0004;AC=133;AN=2184
+20 317546 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.837;LDAF=0.0007;AVGPOST=0.9994;RSQ=0.6154;ERATE=0.0003;THETA=0.0012;AC=1;AN=2184
+20 317578 . C T 100 PASS LCSNP;EXSNP;BAVGPOST=0.996;BRSQ=0.882;LDAF=0.0180;AVGPOST=0.9981;RSQ=0.9606;ERATE=0.0003;THETA=0.0004;AC=40;AN=2184
+20 317658 . T C 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=1.000;LDAF=0.0609;AVGPOST=0.9998;RSQ=0.9980;ERATE=0.0003;THETA=0.0005;AC=133;AN=2184
+20 317676 . T C 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=1.000;LDAF=0.0608;AVGPOST=0.9999;RSQ=0.9992;ERATE=0.0003;THETA=0.0004;AC=133;AN=2184
+20 317710 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.952;LDAF=0.0014;AVGPOST=0.9997;RSQ=0.8880;ERATE=0.0004;THETA=0.0008;AC=3;AN=2184
+20 317824 . G A 100 PASS LCSNP;EXSNP;BAVGPOST=1.000;BRSQ=0.999;LDAF=0.0555;AVGPOST=0.9993;RSQ=0.9947;ERATE=0.0003;THETA=0.0009;AC=121;AN=2184
diff --git a/settings/repository/edu.mit.broad/picard-private-parts-2068.jar b/settings/repository/edu.mit.broad/picard-private-parts-2164.jar
similarity index 73%
rename from settings/repository/edu.mit.broad/picard-private-parts-2068.jar
rename to settings/repository/edu.mit.broad/picard-private-parts-2164.jar
index bb6805d8de..4465f91f57 100644
Binary files a/settings/repository/edu.mit.broad/picard-private-parts-2068.jar and b/settings/repository/edu.mit.broad/picard-private-parts-2164.jar differ
diff --git a/settings/repository/edu.mit.broad/picard-private-parts-2068.xml b/settings/repository/edu.mit.broad/picard-private-parts-2164.xml
similarity index 58%
rename from settings/repository/edu.mit.broad/picard-private-parts-2068.xml
rename to settings/repository/edu.mit.broad/picard-private-parts-2164.xml
index 1ce10c1939..6a22ea2c36 100644
--- a/settings/repository/edu.mit.broad/picard-private-parts-2068.xml
+++ b/settings/repository/edu.mit.broad/picard-private-parts-2164.xml
@@ -1,3 +1,3 @@
-
+
diff --git a/settings/repository/net.sf.snpeff/snpeff-2.0.2.jar b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.jar
old mode 100755
new mode 100644
similarity index 88%
rename from settings/repository/net.sf.snpeff/snpeff-2.0.2.jar
rename to settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.jar
index bfd06f97f8..ee5d023672
Binary files a/settings/repository/net.sf.snpeff/snpeff-2.0.2.jar and b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.jar differ
diff --git a/settings/repository/net.sf.snpeff/snpeff-2.0.2.xml b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.xml
similarity index 77%
rename from settings/repository/net.sf.snpeff/snpeff-2.0.2.xml
rename to settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.xml
index f0568def4e..5417641d3d 100644
--- a/settings/repository/net.sf.snpeff/snpeff-2.0.2.xml
+++ b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.xml
@@ -1,3 +1,3 @@
-
+
diff --git a/settings/repository/net.sf/picard-1.55.985.xml b/settings/repository/net.sf/picard-1.55.985.xml
deleted file mode 100644
index e74b126047..0000000000
--- a/settings/repository/net.sf/picard-1.55.985.xml
+++ /dev/null
@@ -1,3 +0,0 @@
-
-
-
diff --git a/settings/repository/net.sf/picard-1.55.985.jar b/settings/repository/net.sf/picard-1.58.1057.jar
similarity index 71%
rename from settings/repository/net.sf/picard-1.55.985.jar
rename to settings/repository/net.sf/picard-1.58.1057.jar
index a8ceaa8782..4a82a3058e 100644
Binary files a/settings/repository/net.sf/picard-1.55.985.jar and b/settings/repository/net.sf/picard-1.58.1057.jar differ
diff --git a/settings/repository/net.sf/picard-1.58.1057.xml b/settings/repository/net.sf/picard-1.58.1057.xml
new file mode 100644
index 0000000000..15c5b5620c
--- /dev/null
+++ b/settings/repository/net.sf/picard-1.58.1057.xml
@@ -0,0 +1,3 @@
+
+
+
diff --git a/settings/repository/net.sf/sam-1.55.985.xml b/settings/repository/net.sf/sam-1.55.985.xml
deleted file mode 100644
index 415063a629..0000000000
--- a/settings/repository/net.sf/sam-1.55.985.xml
+++ /dev/null
@@ -1,3 +0,0 @@
-
-
-
diff --git a/settings/repository/net.sf/sam-1.55.985.jar b/settings/repository/net.sf/sam-1.58.1057.jar
similarity index 82%
rename from settings/repository/net.sf/sam-1.55.985.jar
rename to settings/repository/net.sf/sam-1.58.1057.jar
index ff3d9a9451..804e21b616 100644
Binary files a/settings/repository/net.sf/sam-1.55.985.jar and b/settings/repository/net.sf/sam-1.58.1057.jar differ
diff --git a/settings/repository/net.sf/sam-1.58.1057.xml b/settings/repository/net.sf/sam-1.58.1057.xml
new file mode 100644
index 0000000000..4f0dfe44e3
--- /dev/null
+++ b/settings/repository/net.sf/sam-1.58.1057.xml
@@ -0,0 +1,3 @@
+
+
+
diff --git a/settings/repository/org.broad/tribble-40.jar b/settings/repository/org.broad/tribble-46.jar
similarity index 90%
rename from settings/repository/org.broad/tribble-40.jar
rename to settings/repository/org.broad/tribble-46.jar
index 7f68b4b365..401fcfc3a9 100644
Binary files a/settings/repository/org.broad/tribble-40.jar and b/settings/repository/org.broad/tribble-46.jar differ
diff --git a/settings/repository/org.broad/tribble-40.xml b/settings/repository/org.broad/tribble-46.xml
similarity index 51%
rename from settings/repository/org.broad/tribble-40.xml
rename to settings/repository/org.broad/tribble-46.xml
index 6a01b37901..bb8df5c876 100644
--- a/settings/repository/org.broad/tribble-40.xml
+++ b/settings/repository/org.broad/tribble-46.xml
@@ -1,3 +1,3 @@
-
+