diff --git a/build.xml b/build.xml
index ce9ab0c4d5..2086d0c9ae 100644
--- a/build.xml
+++ b/build.xml
@@ -847,7 +847,6 @@
-
@@ -858,6 +857,7 @@
+
@@ -1118,6 +1118,11 @@
+
+
+
+
+
diff --git a/ivy.xml b/ivy.xml
index 96c1de844d..ee24bc3672 100644
--- a/ivy.xml
+++ b/ivy.xml
@@ -76,7 +76,7 @@
-
+
diff --git a/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R b/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R
index 866766c2c0..d5ee3626f4 100644
--- a/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R
+++ b/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R
@@ -12,7 +12,7 @@ if ( onCMDLine ) {
inputFileName = args[1]
outputPDF = args[2]
} else {
- inputFileName = "~/Desktop/broadLocal/GATK/unstable/wgs.jobreport.txt"
+ inputFileName = "Q-26618@gsa4.jobreport.txt"
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt"
outputPDF = NA
@@ -129,9 +129,11 @@ plotGroup <- function(groupTable) {
# as above, but averaging over all iterations
groupAnnotationsNoIteration = setdiff(groupAnnotations, "iteration")
if ( dim(sub)[1] > 1 ) {
- sum = cast(melt(sub, id.vars=groupAnnotationsNoIteration, measure.vars=c("runtime")), ... ~ ., fun.aggregate=c(mean, sd))
- textplot(as.data.frame(sum), show.rownames=F)
- title(paste("Job summary for", name, "averaging over all iterations"), cex=3)
+ try({ # need a try here because we will fail to reduce when there's just a single iteration
+ sum = cast(melt(sub, id.vars=groupAnnotationsNoIteration, measure.vars=c("runtime")), ... ~ ., fun.aggregate=c(mean, sd))
+ textplot(as.data.frame(sum), show.rownames=F)
+ title(paste("Job summary for", name, "averaging over all iterations"), cex=3)
+ }, silent=T)
}
}
@@ -149,6 +151,35 @@ convertUnits <- function(gatkReportData) {
lapply(gatkReportData, convertGroup)
}
+#
+# Plots runtimes by analysis name and exechosts
+#
+# Useful to understand the performance of analysis jobs by hosts,
+# and to debug problematic nodes
+#
+plotTimeByHost <- function(gatkReportData) {
+ fields = c("analysisName", "exechosts", "runtime")
+
+ runtimes = data.frame()
+ for ( report in gatkReportData ) {
+ runtimes = rbind(runtimes, report[,fields])
+ }
+
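+    # plotMe draws one page of the report; vis is a ggplot geom constructor, e.g. geom_boxplot or geom_jitter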
+ plotMe <- function(name, vis) {
+ p = ggplot(data=runtimes, aes(x=exechosts, y=runtime, group=exechosts, color=exechosts))
+ p = p + facet_grid(analysisName ~ ., scale="free")
+ p = p + vis()
+ p = p + xlab("Job execution host")
+ p = p + opts(title = paste(name, "of job runtimes by analysis name and execution host"))
+ p = p + ylab(paste("Distribution of runtimes", RUNTIME_UNITS))
+ p = p + opts(axis.text.x=theme_text(angle=45, hjust=1, vjust=1))
+ print(p)
+ }
+
+ plotMe("Boxplot", geom_boxplot)
+ plotMe("Jittered points", geom_jitter)
+}
+
# read the table
gatkReportData <- gsa.read.gatkreport(inputFileName)
@@ -162,7 +193,9 @@ if ( ! is.na(outputPDF) ) {
plotJobsGantt(gatkReportData, T, F)
plotJobsGantt(gatkReportData, F, F)
plotProgressByTime(gatkReportData)
+plotTimeByHost(gatkReportData)
for ( group in gatkReportData ) {
+ print(group)
plotGroup(group)
}
diff --git a/public/java/src/net/sf/picard/sam/MergingSamRecordIterator.java b/public/java/src/net/sf/picard/sam/MergingSamRecordIterator.java
new file mode 100644
index 0000000000..4b1c7a9994
--- /dev/null
+++ b/public/java/src/net/sf/picard/sam/MergingSamRecordIterator.java
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+package net.sf.picard.sam;
+
+import net.sf.picard.PicardException;
+
+import java.util.*;
+import java.lang.reflect.Constructor;
+
+import net.sf.samtools.*;
+import net.sf.samtools.util.CloseableIterator;
+
+/**
+ * Provides an iterator interface for merging multiple underlying iterators into a single
+ * iterable stream. The underlying iterators/files must all have the same sort order unless
+ * the requested output format is unsorted, in which case any combination is valid.
+ */
+public class MergingSamRecordIterator implements CloseableIterator<SAMRecord> {
+    private final PriorityQueue<ComparableSamRecordIterator> pq;
+    private final SamFileHeaderMerger samHeaderMerger;
+    private final Collection<SAMFileReader> readers;
+    private final SAMFileHeader.SortOrder sortOrder;
+    private final SAMRecordComparator comparator;
+
+ private boolean initialized = false;
+ private boolean iterationStarted = false;
+
+ /**
+ * Constructs a new merging iterator with the same set of readers and sort order as
+ * provided by the header merger parameter.
+ * @param headerMerger The merged header and contents of readers.
+ * @param forcePresorted True to ensure that the iterator checks the headers of the readers for appropriate sort order.
+     * @deprecated replaced by MergingSamRecordIterator(SamFileHeaderMerger, Collection<SAMFileReader>, boolean)
+ */
+ public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, final boolean forcePresorted) {
+ this(headerMerger, headerMerger.getReaders(), forcePresorted);
+ }
+
+ /**
+ * Constructs a new merging iterator with the same set of readers and sort order as
+ * provided by the header merger parameter.
+ * @param headerMerger The merged header and contents of readers.
+     * @param assumeSorted If false, the iterator checks the headers of the readers for the appropriate sort order; if true, the check is skipped.
+ */
+    public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, Collection<SAMFileReader> readers, final boolean assumeSorted) {
+ this.samHeaderMerger = headerMerger;
+ this.sortOrder = headerMerger.getMergedHeader().getSortOrder();
+ this.comparator = getComparator();
+ this.readers = readers;
+
+        this.pq = new PriorityQueue<ComparableSamRecordIterator>(readers.size());
+
+ for (final SAMFileReader reader : readers) {
+ if (!assumeSorted && this.sortOrder != SAMFileHeader.SortOrder.unsorted &&
+ reader.getFileHeader().getSortOrder() != this.sortOrder){
+ throw new PicardException("Files are not compatible with sort order");
+ }
+ }
+ }
+
+ /**
+ * Add a given SAM file iterator to the merging iterator. Use this to restrict the merged iteration to a given genomic interval,
+ * rather than iterating over every read in the backing file or stream.
+ * @param reader Reader to add to the merging iterator.
+ * @param iterator Iterator traversing over reader contents.
+ */
+    public void addIterator(final SAMFileReader reader, final CloseableIterator<SAMRecord> iterator) {
+ if(iterationStarted)
+ throw new PicardException("Cannot add another iterator; iteration has already begun");
+ if(!samHeaderMerger.containsHeader(reader.getFileHeader()))
+ throw new PicardException("All iterators to be merged must be accounted for in the SAM header merger");
+ final ComparableSamRecordIterator comparableIterator = new ComparableSamRecordIterator(reader,iterator,comparator);
+ addIfNotEmpty(comparableIterator);
+ initialized = true;
+ }
+
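+    /**
+     * Lazily opens an iterator over every reader when iteration begins, unless
+     * iterators were already added explicitly via addIterator().
+     */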
+ private void startIterationIfRequired() {
+ if(initialized)
+ return;
+ for(SAMFileReader reader: readers)
+ addIterator(reader,reader.iterator());
+ iterationStarted = true;
+ }
+
+ /**
+ * Close down all open iterators.
+ */
+ public void close() {
+ // Iterators not in the priority queue have already been closed; only close down the iterators that are still in the priority queue.
+        for(CloseableIterator<SAMRecord> iterator: pq)
+ iterator.close();
+ }
+
+ /** Returns true if any of the underlying iterators has more records, otherwise false. */
+ public boolean hasNext() {
+ startIterationIfRequired();
+ return !this.pq.isEmpty();
+ }
+
+    /** Returns the next record from the topmost iterator during merging. */
+ public SAMRecord next() {
+ startIterationIfRequired();
+
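+        // Poll the iterator whose current record sorts first, take one record from it, then re-queue the iterator.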
+ final ComparableSamRecordIterator iterator = this.pq.poll();
+ final SAMRecord record = iterator.next();
+ addIfNotEmpty(iterator);
+ record.setHeader(this.samHeaderMerger.getMergedHeader());
+
+ // Fix the read group if needs be
+ if (this.samHeaderMerger.hasReadGroupCollisions()) {
+ final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.READ_GROUP_ID);
+ if (oldGroupId != null ) {
+ final String newGroupId = this.samHeaderMerger.getReadGroupId(iterator.getReader().getFileHeader(),oldGroupId);
+ record.setAttribute(ReservedTagConstants.READ_GROUP_ID, newGroupId);
+ }
+ }
+
+ // Fix the program group if needs be
+ if (this.samHeaderMerger.hasProgramGroupCollisions()) {
+ final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.PROGRAM_GROUP_ID);
+ if (oldGroupId != null ) {
+ final String newGroupId = this.samHeaderMerger.getProgramGroupId(iterator.getReader().getFileHeader(),oldGroupId);
+ record.setAttribute(ReservedTagConstants.PROGRAM_GROUP_ID, newGroupId);
+ }
+ }
+
+ // Fix up the sequence indexes if needs be
+ if (this.samHeaderMerger.hasMergedSequenceDictionary()) {
+ if (record.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
+ record.setReferenceIndex(this.samHeaderMerger.getMergedSequenceIndex(iterator.getReader().getFileHeader(),record.getReferenceIndex()));
+ }
+
+ if (record.getReadPairedFlag() && record.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
+ record.setMateReferenceIndex(this.samHeaderMerger.getMergedSequenceIndex(iterator.getReader().getFileHeader(),record.getMateReferenceIndex()));
+ }
+ }
+
+ return record;
+ }
+
+ /**
+ * Adds iterator to priority queue. If the iterator has more records it is added
+ * otherwise it is closed and not added.
+ */
+ private void addIfNotEmpty(final ComparableSamRecordIterator iterator) {
+ if (iterator.hasNext()) {
+ pq.offer(iterator);
+ }
+ else {
+ iterator.close();
+ }
+ }
+
+ /** Unsupported operation. */
+ public void remove() {
+ throw new UnsupportedOperationException("MergingSAMRecorderIterator.remove()");
+ }
+
+ /**
+ * Get the right comparator for a given sort order (coordinate, alphabetic). In the
+ * case of "unsorted" it will return a comparator that gives an arbitrary but reflexive
+ * ordering.
+ */
+ private SAMRecordComparator getComparator() {
+ // For unsorted build a fake comparator that compares based on object ID
+ if (this.sortOrder == SAMFileHeader.SortOrder.unsorted) {
+ return new SAMRecordComparator() {
+ public int fileOrderCompare(final SAMRecord lhs, final SAMRecord rhs) {
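+                    // identity hash codes give an arbitrary but stable ordering for any given pair of records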
+ return System.identityHashCode(lhs) - System.identityHashCode(rhs);
+ }
+
+ public int compare(final SAMRecord lhs, final SAMRecord rhs) {
+ return fileOrderCompare(lhs, rhs);
+ }
+ };
+ }
+ if (samHeaderMerger.hasMergedSequenceDictionary() && sortOrder.equals(SAMFileHeader.SortOrder.coordinate)) {
+ return new MergedSequenceDictionaryCoordinateOrderComparator();
+ }
+
+ // Otherwise try and figure out what kind of comparator to return and build it
+ return this.sortOrder.getComparatorInstance();
+ }
+
+ /** Returns the merged header that the merging iterator is working from. */
+ public SAMFileHeader getMergedHeader() {
+ return this.samHeaderMerger.getMergedHeader();
+ }
+
+ /**
+ * Ugh. Basically does a regular coordinate compare, but looks up the sequence indices in the merged
+ * sequence dictionary. I hate the fact that this extends SAMRecordCoordinateComparator, but it avoids
+ * more copy & paste.
+ */
+ private class MergedSequenceDictionaryCoordinateOrderComparator extends SAMRecordCoordinateComparator {
+
+ public int fileOrderCompare(final SAMRecord samRecord1, final SAMRecord samRecord2) {
+ final int referenceIndex1 = getReferenceIndex(samRecord1);
+ final int referenceIndex2 = getReferenceIndex(samRecord2);
+ if (referenceIndex1 != referenceIndex2) {
+ if (referenceIndex1 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
+ return 1;
+ } else if (referenceIndex2 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
+ return -1;
+ } else {
+ return referenceIndex1 - referenceIndex2;
+ }
+ }
+ if (referenceIndex1 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
+ // Both are unmapped.
+ return 0;
+ }
+ return samRecord1.getAlignmentStart() - samRecord2.getAlignmentStart();
+ }
+
+ private int getReferenceIndex(final SAMRecord samRecord) {
+ if (samRecord.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
+ return samHeaderMerger.getMergedSequenceIndex(samRecord.getHeader(), samRecord.getReferenceIndex());
+ }
+ if (samRecord.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
+ return samHeaderMerger.getMergedSequenceIndex(samRecord.getHeader(), samRecord.getMateReferenceIndex());
+ }
+ return SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
+ }
+ }
+}
diff --git a/public/java/src/net/sf/picard/sam/SamFileHeaderMerger.java b/public/java/src/net/sf/picard/sam/SamFileHeaderMerger.java
new file mode 100644
index 0000000000..f78cd81dac
--- /dev/null
+++ b/public/java/src/net/sf/picard/sam/SamFileHeaderMerger.java
@@ -0,0 +1,744 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2009 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+package net.sf.picard.sam;
+
+import java.util.*;
+
+import net.sf.picard.PicardException;
+import net.sf.samtools.AbstractSAMHeaderRecord;
+import net.sf.samtools.SAMFileHeader;
+import net.sf.samtools.SAMFileReader;
+import net.sf.samtools.SAMProgramRecord;
+import net.sf.samtools.SAMReadGroupRecord;
+import net.sf.samtools.SAMSequenceDictionary;
+import net.sf.samtools.SAMSequenceRecord;
+import net.sf.samtools.util.SequenceUtil;
+
+/**
+ * Merges SAMFileHeaders that have the same sequences into a single merged header
+ * object while providing read group translation for cases where read groups
+ * clash across input headers.
+ */
+public class SamFileHeaderMerger {
+ //Super Header to construct
+ private final SAMFileHeader mergedHeader;
+    private Collection<SAMFileReader> readers;
+    private final Collection<SAMFileHeader> headers;
+
+ //Translation of old group ids to new group ids
+    private final Map<SAMFileHeader, Map<String, String>> samReadGroupIdTranslation =
+            new IdentityHashMap<SAMFileHeader, Map<String, String>>();
+
+ //the read groups from different files use the same group ids
+ private boolean hasReadGroupCollisions = false;
+
+ //the program records from different files use the same program record ids
+ private boolean hasProgramGroupCollisions = false;
+
+ //Translation of old program group ids to new program group ids
+    private Map<SAMFileHeader, Map<String, String>> samProgramGroupIdTranslation =
+            new IdentityHashMap<SAMFileHeader, Map<String, String>>();
+
+ private boolean hasMergedSequenceDictionary = false;
+
+ // Translation of old sequence dictionary ids to new dictionary ids
+ // This is an IdentityHashMap because it can be quite expensive to compute the hashCode for
+ // large SAMFileHeaders. It is possible that two input files will have identical headers so that
+ // the regular HashMap would fold them together, but the value stored in each of the two
+ // Map entries will be the same, so it should not hurt anything.
+    private final Map<SAMFileHeader, Map<Integer, Integer>> samSeqDictionaryIdTranslationViaHeader =
+            new IdentityHashMap<SAMFileHeader, Map<Integer, Integer>>();
+
+ //HeaderRecordFactory that creates SAMReadGroupRecord instances.
+    private static final HeaderRecordFactory<SAMReadGroupRecord> READ_GROUP_RECORD_FACTORY = new HeaderRecordFactory<SAMReadGroupRecord>() {
+ public SAMReadGroupRecord createRecord(String id, SAMReadGroupRecord srcReadGroupRecord) {
+ return new SAMReadGroupRecord(id, srcReadGroupRecord);
+ }
+ };
+
+ //HeaderRecordFactory that creates SAMProgramRecord instances.
+    private static final HeaderRecordFactory<SAMProgramRecord> PROGRAM_RECORD_FACTORY = new HeaderRecordFactory<SAMProgramRecord>() {
+ public SAMProgramRecord createRecord(String id, SAMProgramRecord srcProgramRecord) {
+ return new SAMProgramRecord(id, srcProgramRecord);
+ }
+ };
+
+ //comparator used to sort lists of program group and read group records
+    private static final Comparator<AbstractSAMHeaderRecord> RECORD_ID_COMPARATOR = new Comparator<AbstractSAMHeaderRecord>() {
+ public int compare(AbstractSAMHeaderRecord o1, AbstractSAMHeaderRecord o2) {
+ return o1.getId().compareTo(o2.getId());
+ }
+ };
+
+ /**
+ * Create SAMFileHeader with additional information. Required that sequence dictionaries agree.
+ *
+ * @param readers sam file readers to combine
+ * @param sortOrder sort order new header should have
+     * @deprecated replaced by SamFileHeaderMerger(Collection<SAMFileReader>, SAMFileHeader.SortOrder, boolean)
+ */
+    public SamFileHeaderMerger(final Collection<SAMFileReader> readers, final SAMFileHeader.SortOrder sortOrder) {
+ this(readers, sortOrder, false);
+ }
+
+ /**
+ * Create SAMFileHeader with additional information.
+ *
+ * @param readers sam file readers to combine
+ * @param sortOrder sort order new header should have
+ * @param mergeDictionaries If true, merge sequence dictionaries in new header. If false, require that
+ * all input sequence dictionaries be identical.
+     * @deprecated replaced by SamFileHeaderMerger(SAMFileHeader.SortOrder, Collection<SAMFileHeader>, boolean)
+ */
+    public SamFileHeaderMerger(final Collection<SAMFileReader> readers, final SAMFileHeader.SortOrder sortOrder, final boolean mergeDictionaries) {
+ this(sortOrder, getHeadersFromReaders(readers), mergeDictionaries);
+ this.readers = readers;
+ }
+
+ /**
+     * Create SAMFileHeader with additional information. This is the preferred constructor.
+ *
+ * @param sortOrder sort order new header should have
+ * @param headers sam file headers to combine
+ * @param mergeDictionaries If true, merge sequence dictionaries in new header. If false, require that
+ * all input sequence dictionaries be identical.
+ */
+    public SamFileHeaderMerger(final SAMFileHeader.SortOrder sortOrder, final Collection<SAMFileHeader> headers, final boolean mergeDictionaries) {
+ this.headers = headers;
+ this.mergedHeader = new SAMFileHeader();
+
+ SAMSequenceDictionary sequenceDictionary;
+ try {
+ sequenceDictionary = getSequenceDictionary(headers);
+ this.hasMergedSequenceDictionary = false;
+ }
+ catch (SequenceUtil.SequenceListsDifferException pe) {
+ if (mergeDictionaries) {
+ sequenceDictionary = mergeSequenceDictionaries(headers);
+ this.hasMergedSequenceDictionary = true;
+ }
+ else {
+ throw pe;
+ }
+ }
+
+ this.mergedHeader.setSequenceDictionary(sequenceDictionary);
+
+ // Set program that creates input alignments
+ for (final SAMProgramRecord program : mergeProgramGroups(headers)) {
+ this.mergedHeader.addProgramRecord(program);
+ }
+
+ // Set read groups for merged header
+ final List readGroups = mergeReadGroups(headers);
+ this.mergedHeader.setReadGroups(readGroups);
+ this.mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.none);
+
+ this.mergedHeader.setSortOrder(sortOrder);
+
+ for (final SAMFileHeader header : headers) {
+ for (final String comment : header.getComments()) {
+ this.mergedHeader.addComment(comment);
+ }
+ }
+ }
+
+    // Utility method to support the old reader-based constructors
+    private static List<SAMFileHeader> getHeadersFromReaders(Collection<SAMFileReader> readers) {
+        List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>(readers.size());
+ for (SAMFileReader reader : readers) {
+ headers.add(reader.getFileHeader());
+ }
+ return headers;
+ }
+
+
+ /**
+ * Checks to see if there are clashes where different readers are using the same read
+ * group IDs. If yes, then those IDs that collided are remapped.
+ *
+ * @param headers headers to combine
+ * @return new list of read groups constructed from all the readers
+ */
+    private List<SAMReadGroupRecord> mergeReadGroups(final Collection<SAMFileHeader> headers) {
+        //prepare args for mergeHeaderRecords(..) call
+        final HashSet<String> idsThatAreAlreadyTaken = new HashSet<String>();
+
+        final List<HeaderRecordAndFileHeader<SAMReadGroupRecord>> readGroupsToProcess = new LinkedList<HeaderRecordAndFileHeader<SAMReadGroupRecord>>();
+ for (final SAMFileHeader header : headers) {
+ for (final SAMReadGroupRecord readGroup : header.getReadGroups()) {
+ //verify that there are no existing id collisions in this input file
+ if(!idsThatAreAlreadyTaken.add(readGroup.getId()))
+ throw new PicardException("Input file: " + header + " contains more than one RG with the same id (" + readGroup.getId() + ")");
+
+                readGroupsToProcess.add(new HeaderRecordAndFileHeader<SAMReadGroupRecord>(readGroup, header));
+ }
+ idsThatAreAlreadyTaken.clear();
+ }
+
+        final List<SAMReadGroupRecord> result = new LinkedList<SAMReadGroupRecord>();
+
+ hasReadGroupCollisions = mergeHeaderRecords(readGroupsToProcess, READ_GROUP_RECORD_FACTORY, idsThatAreAlreadyTaken, samReadGroupIdTranslation, result);
+
+ //sort the result list by record id
+ Collections.sort(result, RECORD_ID_COMPARATOR);
+
+ return result;
+ }
+
+
+ /**
+ * Checks to see if there are clashes where different readers are using the same program
+ * group IDs. If yes, then those IDs that collided are remapped.
+ *
+ * @param headers headers to combine
+ * @return new list of program groups constructed from all the readers
+ */
+    private List<SAMProgramRecord> mergeProgramGroups(final Collection<SAMFileHeader> headers) {
+
+        final List<SAMProgramRecord> overallResult = new LinkedList<SAMProgramRecord>();
+
+        //this Set will accumulate all SAMProgramRecord ids that have been encountered so far.
+        final HashSet<String> idsThatAreAlreadyTaken = new HashSet<String>();
+
+        //need to process all program groups
+        List<HeaderRecordAndFileHeader<SAMProgramRecord>> programGroupsLeftToProcess = new LinkedList<HeaderRecordAndFileHeader<SAMProgramRecord>>();
+ for (final SAMFileHeader header : headers) {
+ for (final SAMProgramRecord programGroup : header.getProgramRecords()) {
+ //verify that there are no existing id collisions in this input file
+ if(!idsThatAreAlreadyTaken.add(programGroup.getId()))
+ throw new PicardException("Input file: " + header + " contains more than one PG with the same id (" + programGroup.getId() + ")");
+
+                programGroupsLeftToProcess.add(new HeaderRecordAndFileHeader<SAMProgramRecord>(programGroup, header));
+ }
+ idsThatAreAlreadyTaken.clear();
+ }
+
+        //A program group header (let's say ID=2 PN=B PP=1) may have a PP (previous program) attribute which chains it to
+        //another program group header (let's say ID=1 PN=A) to indicate that the given file was
+        //processed by program A followed by program B. These PP attributes potentially
+        //connect headers into one or more tree structures. Merging is done by
+        //first merging all headers that don't have PP attributes (e.g. the tree roots),
+        //then updating and merging all headers whose PPs point to the tree-root headers,
+        //and so on until all program group headers are processed.
+
+        //currentProgramGroups is the list of records to merge next. Start by merging the programGroups that don't have a PP attribute (e.g. the tree roots).
+        List<HeaderRecordAndFileHeader<SAMProgramRecord>> currentProgramGroups = new LinkedList<HeaderRecordAndFileHeader<SAMProgramRecord>>();
+        for(final Iterator<HeaderRecordAndFileHeader<SAMProgramRecord>> programGroupsLeftToProcessIterator = programGroupsLeftToProcess.iterator(); programGroupsLeftToProcessIterator.hasNext(); ) {
+            final HeaderRecordAndFileHeader<SAMProgramRecord> pair = programGroupsLeftToProcessIterator.next();
+ if(pair.getHeaderRecord().getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG) == null) {
+ programGroupsLeftToProcessIterator.remove();
+ currentProgramGroups.add(pair);
+ }
+ }
+
+ //merge currentProgramGroups
+ while(!currentProgramGroups.isEmpty())
+ {
+            final List<SAMProgramRecord> currentResult = new LinkedList<SAMProgramRecord>();
+
+ hasProgramGroupCollisions |= mergeHeaderRecords(currentProgramGroups, PROGRAM_RECORD_FACTORY, idsThatAreAlreadyTaken, samProgramGroupIdTranslation, currentResult);
+
+ //add currentResults to overallResults
+ overallResult.addAll(currentResult);
+
+ //apply the newly-computed id translations to currentProgramGroups and programGroupsLeftToProcess
+ currentProgramGroups = translateIds(currentProgramGroups, samProgramGroupIdTranslation, false);
+ programGroupsLeftToProcess = translateIds(programGroupsLeftToProcess, samProgramGroupIdTranslation, true);
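+            //(PP ids only need rewriting in the not-yet-processed groups; the just-merged groups had their PPs translated in an earlier pass)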
+
+ //find all records in programGroupsLeftToProcess whose ppId points to a record that was just processed (eg. a record that's in currentProgramGroups),
+ //and move them to the list of programGroupsToProcessNext.
+            LinkedList<HeaderRecordAndFileHeader<SAMProgramRecord>> programGroupsToProcessNext = new LinkedList<HeaderRecordAndFileHeader<SAMProgramRecord>>();
+            for(final Iterator<HeaderRecordAndFileHeader<SAMProgramRecord>> programGroupsLeftToProcessIterator = programGroupsLeftToProcess.iterator(); programGroupsLeftToProcessIterator.hasNext(); ) {
+                final HeaderRecordAndFileHeader<SAMProgramRecord> pairLeftToProcess = programGroupsLeftToProcessIterator.next();
+                final Object ppIdOfRecordLeftToProcess = pairLeftToProcess.getHeaderRecord().getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG);
+                //find which of the currentProgramGroups this ppId points to (NOTE: they have to come from the same file)
+                for(final HeaderRecordAndFileHeader<SAMProgramRecord> justProcessedPair : currentProgramGroups) {
+ String idJustProcessed = justProcessedPair.getHeaderRecord().getId();
+ if(pairLeftToProcess.getFileHeader() == justProcessedPair.getFileHeader() && ppIdOfRecordLeftToProcess.equals(idJustProcessed)) {
+ programGroupsLeftToProcessIterator.remove();
+ programGroupsToProcessNext.add(pairLeftToProcess);
+ break;
+ }
+ }
+ }
+
+ currentProgramGroups = programGroupsToProcessNext;
+ }
+
+ //verify that all records were processed
+ if(!programGroupsLeftToProcess.isEmpty()) {
+ StringBuffer errorMsg = new StringBuffer(programGroupsLeftToProcess.size() + " program groups weren't processed. Do their PP ids point to existing PGs? \n");
+            for( final HeaderRecordAndFileHeader<SAMProgramRecord> pair : programGroupsLeftToProcess ) {
+ SAMProgramRecord record = pair.getHeaderRecord();
+ errorMsg.append("@PG ID:"+record.getProgramGroupId()+" PN:"+record.getProgramName()+" PP:"+record.getPreviousProgramGroupId() +"\n");
+ }
+ throw new PicardException(errorMsg.toString());
+ }
+
+ //sort the result list by record id
+ Collections.sort(overallResult, RECORD_ID_COMPARATOR);
+
+ return overallResult;
+ }
+
+
+ /**
+ * Utility method that takes a list of program groups and remaps all their
+ * ids (including ppIds if requested) using the given idTranslationTable.
+ *
+ * NOTE: when remapping, this method creates new SAMProgramRecords and
+ * doesn't mutate any records in the programGroups list.
+ *
+ * @param programGroups The program groups to translate.
+ * @param idTranslationTable The translation table.
+ * @param translatePpIds Whether ppIds should be translated as well.
+ *
+ * @return The list of translated records.
+ */
+    private List<HeaderRecordAndFileHeader<SAMProgramRecord>> translateIds(
+            List<HeaderRecordAndFileHeader<SAMProgramRecord>> programGroups,
+            Map<SAMFileHeader, Map<String, String>> idTranslationTable,
+            boolean translatePpIds) {
+
+        //go through programGroups and translate any IDs and PPs based on the idTranslationTable.
+        List<HeaderRecordAndFileHeader<SAMProgramRecord>> result = new LinkedList<HeaderRecordAndFileHeader<SAMProgramRecord>>();
+        for(final HeaderRecordAndFileHeader<SAMProgramRecord> pair : programGroups ) {
+ final SAMProgramRecord record = pair.getHeaderRecord();
+ final String id = record.getProgramGroupId();
+ final String ppId = (String) record.getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG);
+
+ final SAMFileHeader header = pair.getFileHeader();
+            final Map<String, String> translations = idTranslationTable.get(header);
+
+ //see if one or both ids need to be translated
+ SAMProgramRecord translatedRecord = null;
+ if(translations != null)
+ {
+ String translatedId = translations.get( id );
+ String translatedPpId = translatePpIds ? translations.get( ppId ) : null;
+
+ boolean needToTranslateId = translatedId != null && !translatedId.equals(id);
+ boolean needToTranslatePpId = translatedPpId != null && !translatedPpId.equals(ppId);
+
+ if(needToTranslateId && needToTranslatePpId) {
+ translatedRecord = new SAMProgramRecord(translatedId, record);
+ translatedRecord.setAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG, translatedPpId);
+ } else if(needToTranslateId) {
+ translatedRecord = new SAMProgramRecord(translatedId, record);
+ } else if(needToTranslatePpId) {
+ translatedRecord = new SAMProgramRecord(id, record);
+ translatedRecord.setAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG, translatedPpId);
+ }
+ }
+
+ if(translatedRecord != null) {
+                result.add(new HeaderRecordAndFileHeader<SAMProgramRecord>(translatedRecord, header));
+ } else {
+ result.add(pair); //keep the original record
+ }
+ }
+
+ return result;
+ }
+
+
+ /**
+ * Utility method for merging a List of AbstractSAMHeaderRecords. If it finds
+ * records that have identical ids and attributes, it will collapse them
+ * into one record. If it finds records that have identical ids but
+ * non-identical attributes, this is treated as a collision. When collision happens,
+ * the records' ids are remapped, and an old-id to new-id mapping is added to the idTranslationTable.
+ *
+ * NOTE: Non-collided records also get recorded in the idTranslationTable as
+ * old-id to old-id. This way, an idTranslationTable lookup should never return null.
+ *
+ * @param headerRecords The header records to merge.
+ * @param headerRecordFactory Constructs a specific subclass of AbstractSAMHeaderRecord.
+ * @param idsThatAreAlreadyTaken If the id of a headerRecord matches an id in this set, it will be treated as a collision, and the headRecord's id will be remapped.
+ * @param idTranslationTable When records collide, their ids are remapped, and an old-id to new-id
+ * mapping is added to the idTranslationTable. Non-collided records also get recorded in the idTranslationTable as
+ * old-id to old-id. This way, an idTranslationTable lookup should never return null.
+ *
+ * @param result The list of merged header records.
+ *
+ * @return True if there were collisions.
+ */
+    private <RecordType extends AbstractSAMHeaderRecord> boolean mergeHeaderRecords(final List<HeaderRecordAndFileHeader<RecordType>> headerRecords, HeaderRecordFactory<RecordType> headerRecordFactory,
+            final HashSet<String> idsThatAreAlreadyTaken, Map<SAMFileHeader, Map<String, String>> idTranslationTable, List<RecordType> result) {
+
+ //The outer Map bins the header records by their ids. The nested Map further collapses
+ //header records which, in addition to having the same id, also have identical attributes.
+ //In other words, each key in the nested map represents one or more
+ //header records which have both identical ids and identical attributes. The List of
+ //SAMFileHeaders keeps track of which readers these header record(s) came from.
+        final Map<String, Map<RecordType, List<SAMFileHeader>>> idToRecord =
+                new HashMap<String, Map<RecordType, List<SAMFileHeader>>>();
+
+ //Populate the idToRecord and seenIds data structures
+        for (final HeaderRecordAndFileHeader<RecordType> pair : headerRecords) {
+ final RecordType record = pair.getHeaderRecord();
+ final SAMFileHeader header = pair.getFileHeader();
+ final String recordId = record.getId();
+            Map<RecordType, List<SAMFileHeader>> recordsWithSameId = idToRecord.get(recordId);
+            if(recordsWithSameId == null) {
+                recordsWithSameId = new LinkedHashMap<RecordType, List<SAMFileHeader>>();
+ idToRecord.put(recordId, recordsWithSameId);
+ }
+
+            List<SAMFileHeader> fileHeaders = recordsWithSameId.get(record);
+            if(fileHeaders == null) {
+                fileHeaders = new LinkedList<SAMFileHeader>();
+ recordsWithSameId.put(record, fileHeaders);
+ }
+
+ fileHeaders.add(header);
+ }
+
+ //Resolve any collisions between header records by remapping their ids.
+ boolean hasCollisions = false;
+        for (final Map.Entry<String, Map<RecordType, List<SAMFileHeader>>> entry : idToRecord.entrySet() )
+ {
+ final String recordId = entry.getKey();
+            final Map<RecordType, List<SAMFileHeader>> recordsWithSameId = entry.getValue();
+
+
+            for( Map.Entry<RecordType, List<SAMFileHeader>> recordWithUniqueAttr : recordsWithSameId.entrySet()) {
+                final RecordType record = recordWithUniqueAttr.getKey();
+                final List<SAMFileHeader> fileHeaders = recordWithUniqueAttr.getValue();
+
+ String newId;
+ if(!idsThatAreAlreadyTaken.contains(recordId)) {
+ //don't remap 1st record. If there are more records
+ //with this id, they will be remapped in the 'else'.
+ newId = recordId;
+ idsThatAreAlreadyTaken.add(recordId);
+ } else {
+ //there is more than one record with this id.
+ hasCollisions = true;
+
+ //find a unique newId for this record
+ int idx=1;
+ while(idsThatAreAlreadyTaken.contains(newId = recordId + "." + Integer.toString(idx++)))
+ ;
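+                    // the empty-bodied while both generates and tests candidate ids "<id>.1", "<id>.2", ... until a free one is found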
+
+ idsThatAreAlreadyTaken.add( newId );
+ }
+
+ for(SAMFileHeader fileHeader : fileHeaders) {
+                    Map<String, String> readerTranslationTable = idTranslationTable.get(fileHeader);
+                    if(readerTranslationTable == null) {
+                        readerTranslationTable = new HashMap<String, String>();
+ idTranslationTable.put(fileHeader, readerTranslationTable);
+ }
+ readerTranslationTable.put(recordId, newId);
+ }
+
+ result.add( headerRecordFactory.createRecord(newId, record) );
+ }
+ }
+
+ return hasCollisions;
+ }
+
+
+ /**
+     * Get the sequences from the SAMFileHeaders. Throws a runtime exception if the
+     * sequence dictionaries differ from one another.
+ *
+ * @param headers headers to pull sequences from
+ * @return sequences from files. Each file should have the same sequence
+ */
+    private SAMSequenceDictionary getSequenceDictionary(final Collection<SAMFileHeader> headers) {
+ SAMSequenceDictionary sequences = null;
+ for (final SAMFileHeader header : headers) {
+
+ if (sequences == null) {
+ sequences = header.getSequenceDictionary();
+ }
+ else {
+ final SAMSequenceDictionary currentSequences = header.getSequenceDictionary();
+ SequenceUtil.assertSequenceDictionariesEqual(sequences, currentSequences);
+ }
+ }
+
+ return sequences;
+ }
+
+ /**
+ * Get the sequences from the SAMFileHeader, and merge the resulting sequence dictionaries.
+ *
+ * @param headers headers to pull sequences from
+ * @return sequences from files. Each file should have the same sequence
+ */
+    private SAMSequenceDictionary mergeSequenceDictionaries(final Collection<SAMFileHeader> headers) {
+ SAMSequenceDictionary sequences = new SAMSequenceDictionary();
+ for (final SAMFileHeader header : headers) {
+ final SAMSequenceDictionary currentSequences = header.getSequenceDictionary();
+ sequences = mergeSequences(sequences, currentSequences);
+ }
+        // second pass, make a map of the original sequence id -> new sequence id
+ createSequenceMapping(headers, sequences);
+ return sequences;
+ }
+
+ /**
+ * They've asked to merge the sequence headers. What we support right now is finding the sequence name superset.
+ *
+ * @param mergeIntoDict the result of merging so far. All SAMSequenceRecords in here have been cloned from the originals.
+ * @param mergeFromDict A new sequence dictionary to merge into mergeIntoDict.
+     * @return A new sequence dictionary resulting from merging the two inputs.
+ */
+ private SAMSequenceDictionary mergeSequences(SAMSequenceDictionary mergeIntoDict, SAMSequenceDictionary mergeFromDict) {
+
+        // a place to hold the sequences that we haven't found a home for, in the order they appear in mergeFromDict.
+        LinkedList<SAMSequenceRecord> holder = new LinkedList<SAMSequenceRecord>();
+
+ // Return value will be created from this.
+        LinkedList<SAMSequenceRecord> resultingDict = new LinkedList<SAMSequenceRecord>();
+ for (final SAMSequenceRecord sequenceRecord : mergeIntoDict.getSequences()) {
+ resultingDict.add(sequenceRecord);
+ }
+
+ // Index into resultingDict of previous SAMSequenceRecord from mergeFromDict that already existed in mergeIntoDict.
+ int prevloc = -1;
+ // Previous SAMSequenceRecord from mergeFromDict that already existed in mergeIntoDict.
+ SAMSequenceRecord previouslyMerged = null;
+
+ for (SAMSequenceRecord sequenceRecord : mergeFromDict.getSequences()) {
+ // Does it already exist in resultingDict?
+ int loc = getIndexOfSequenceName(resultingDict, sequenceRecord.getSequenceName());
+ if (loc == -1) {
+                // If it doesn't already exist in resultingDict, save it and decide where to insert it later.
+ holder.add(sequenceRecord.clone());
+ } else if (prevloc > loc) {
+ // If sequenceRecord already exists in resultingDict, but prior to the previous one
+ // from mergeIntoDict that already existed, cannot merge.
+ throw new PicardException("Cannot merge sequence dictionaries because sequence " +
+ sequenceRecord.getSequenceName() + " and " + previouslyMerged.getSequenceName() +
+ " are in different orders in two input sequence dictionaries.");
+ } else {
+ // Since sequenceRecord already exists in resultingDict, don't need to add it.
+ // Add in all the sequences prior to it that have been held in holder.
+ resultingDict.addAll(loc, holder);
+                // Remember the index of sequenceRecord so we can check for merge incompatibility.
+ prevloc = loc + holder.size();
+ previouslyMerged = sequenceRecord;
+ holder.clear();
+ }
+ }
+ // Append anything left in holder.
+ if (holder.size() != 0) {
+ resultingDict.addAll(holder);
+ }
+ return new SAMSequenceDictionary(resultingDict);
+ }
+
+ /**
+ * Find sequence in list.
+ * @param list List to search for the sequence name.
+ * @param sequenceName Name to search for.
+ * @return Index of SAMSequenceRecord with the given name in list, or -1 if not found.
+ */
+    private static int getIndexOfSequenceName(final List<SAMSequenceRecord> list, final String sequenceName) {
+ for (int i = 0; i < list.size(); ++i) {
+ if (list.get(i).getSequenceName().equals(sequenceName)) {
+ return i;
+ }
+ }
+ return -1;
+ }
+
+ /**
+     * Create the sequence mapping. This map is used to convert the unmerged header sequence IDs to the merged
+     * list of sequence IDs.
+ * @param headers the collections of headers.
+ * @param masterDictionary the superset dictionary we've created.
+ */
+    private void createSequenceMapping(final Collection<SAMFileHeader> headers, SAMSequenceDictionary masterDictionary) {
+        LinkedList<String> resultingDictStr = new LinkedList<String>();
+ for (SAMSequenceRecord r : masterDictionary.getSequences()) {
+ resultingDictStr.add(r.getSequenceName());
+ }
+ for (final SAMFileHeader header : headers) {
+            Map<Integer, Integer> seqMap = new HashMap<Integer, Integer>();
+ SAMSequenceDictionary dict = header.getSequenceDictionary();
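+            // map each original sequence index to the position of the same-named sequence in the merged dictionary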
+ for (SAMSequenceRecord rec : dict.getSequences()) {
+ seqMap.put(rec.getSequenceIndex(), resultingDictStr.indexOf(rec.getSequenceName()));
+ }
+ this.samSeqDictionaryIdTranslationViaHeader.put(header, seqMap);
+ }
+ }
+
+
+
+ /**
+ * Returns the read group id that should be used for the input read and RG id.
+ *
+ * @deprecated replaced by getReadGroupId(SAMFileHeader, String)
+ * */
+ public String getReadGroupId(final SAMFileReader reader, final String originalReadGroupId) {
+ return getReadGroupId(reader.getFileHeader(), originalReadGroupId);
+ }
+
+ /** Returns the read group id that should be used for the input read and RG id. */
+ public String getReadGroupId(final SAMFileHeader header, final String originalReadGroupId) {
+ return this.samReadGroupIdTranslation.get(header).get(originalReadGroupId);
+ }
+
+ /**
+ * @param reader one of the input files
+ * @param originalProgramGroupId a program group ID from the above input file
+ * @return new ID from the merged list of program groups in the output file
+ * @deprecated replaced by getProgramGroupId(SAMFileHeader, String)
+ */
+ public String getProgramGroupId(final SAMFileReader reader, final String originalProgramGroupId) {
+ return getProgramGroupId(reader.getFileHeader(), originalProgramGroupId);
+ }
+
+ /**
+ * @param header one of the input headers
+ * @param originalProgramGroupId a program group ID from the above input file
+ * @return new ID from the merged list of program groups in the output file
+ */
+ public String getProgramGroupId(final SAMFileHeader header, final String originalProgramGroupId) {
+ return this.samProgramGroupIdTranslation.get(header).get(originalProgramGroupId);
+ }
+
+ /** Returns true if there are read group duplicates within the merged headers. */
+ public boolean hasReadGroupCollisions() {
+ return this.hasReadGroupCollisions;
+ }
+
+ /** Returns true if there are program group duplicates within the merged headers. */
+ public boolean hasProgramGroupCollisions() {
+ return hasProgramGroupCollisions;
+ }
+
+ /** @return if we've merged the sequence dictionaries, return true */
+ public boolean hasMergedSequenceDictionary() {
+ return hasMergedSequenceDictionary;
+ }
+
+ /** Returns the merged header that should be written to any output merged file. */
+ public SAMFileHeader getMergedHeader() {
+ return this.mergedHeader;
+ }
+
+ /** Returns the collection of readers that this header merger is working with. May return null.
+ * @deprecated replaced by getHeaders()
+ */
+    public Collection<SAMFileReader> getReaders() {
+ return this.readers;
+ }
+
+    /** Returns the collection of headers that this header merger is working with.
+     */
+    public Collection<SAMFileHeader> getHeaders() {
+ return this.headers;
+ }
+
+ /**
+     * Tells whether this header merger contains a given SAM file header. Note that header presence
+     * is tested by reference (==) equality, rather than by SAMFileHeader.equals(), for
+     * reasons of performance.
+ * @param header header to check for.
+ * @return True if the header exists in this HeaderMerger. False otherwise.
+ */
+ boolean containsHeader(SAMFileHeader header) {
+ for(SAMFileHeader headerMergerHeader: headers) {
+ if(headerMergerHeader == header)
+ return true;
+ }
+ return false;
+ }
+
+ /**
+     * Returns the new mapping for a specified reader, given its old sequence index.
+ * @param reader the reader
+ * @param oldReferenceSequenceIndex the old sequence (also called reference) index
+ * @return the new index value
+ * @deprecated replaced by getMergedSequenceIndex(SAMFileHeader, Integer)
+ */
+ public Integer getMergedSequenceIndex(SAMFileReader reader, Integer oldReferenceSequenceIndex) {
+ return this.getMergedSequenceIndex(reader.getFileHeader(), oldReferenceSequenceIndex);
+ }
+
+ /**
+ * Another mechanism for getting the new sequence index, for situations in which the reader is not available.
+ * Note that if the SAMRecord has already had its header replaced with the merged header, this won't work.
+ * @param header The original header for the input record in question.
+ * @param oldReferenceSequenceIndex The original sequence index.
+ * @return the new index value that is compatible with the merged sequence index.
+ */
+ public Integer getMergedSequenceIndex(final SAMFileHeader header, Integer oldReferenceSequenceIndex) {
+        final Map<Integer, Integer> mapping = this.samSeqDictionaryIdTranslationViaHeader.get(header);
+ if (mapping == null) {
+ throw new PicardException("No sequence dictionary mapping available for header: " + header);
+ }
+
+ final Integer newIndex = mapping.get(oldReferenceSequenceIndex);
+ if (newIndex == null) {
+ throw new PicardException("No mapping for reference index " + oldReferenceSequenceIndex + " from header: " + header);
+ }
+
+ return newIndex;
+ }
+
+
+ /**
+ * Implementations of this interface are used by mergeHeaderRecords(..) to instantiate
+ * specific subclasses of AbstractSAMHeaderRecord.
+ */
+    private static interface HeaderRecordFactory<RecordType extends AbstractSAMHeaderRecord> {
+
+ /**
+ * Constructs a new instance of RecordType.
+ * @param id The id of the new record.
+ * @param srcRecord Except for the id, the new record will be a copy of this source record.
+ */
+ public RecordType createRecord(final String id, RecordType srcRecord);
+ }
+
+ /**
+ * Struct that groups together a subclass of AbstractSAMHeaderRecord with the
+ * SAMFileHeader that it came from.
+ */
+    private static class HeaderRecordAndFileHeader<RecordType extends AbstractSAMHeaderRecord> {
+ private RecordType headerRecord;
+ private SAMFileHeader samFileHeader;
+
+ public HeaderRecordAndFileHeader(RecordType headerRecord, SAMFileHeader samFileHeader) {
+ this.headerRecord = headerRecord;
+ this.samFileHeader = samFileHeader;
+ }
+
+ public RecordType getHeaderRecord() {
+ return headerRecord;
+ }
+ public SAMFileHeader getFileHeader() {
+ return samFileHeader;
+ }
+ }
+}
diff --git a/public/java/src/net/sf/samtools/BAMFileReader.java b/public/java/src/net/sf/samtools/BAMFileReader.java
new file mode 100644
index 0000000000..5005b6265f
--- /dev/null
+++ b/public/java/src/net/sf/samtools/BAMFileReader.java
@@ -0,0 +1,762 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+package net.sf.samtools;
+
+
+import net.sf.samtools.util.*;
+import net.sf.samtools.SAMFileReader.ValidationStringency;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.NoSuchElementException;
+
+/**
+ * Internal class for reading and querying BAM files.
+ */
+class BAMFileReader extends SAMFileReader.ReaderImplementation {
+ // True if reading from a File rather than an InputStream
+ private boolean mIsSeekable = false;
+
+ // For converting bytes into other primitive types
+ private BinaryCodec mStream = null;
+
+ // Underlying compressed data stream.
+ private final BAMInputStream mInputStream;
+ private SAMFileHeader mFileHeader = null;
+
+ // Populated if the file is seekable and an index exists
+ private File mIndexFile;
+ private BAMIndex mIndex = null;
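+    // Virtual file pointer of the first record; getIterator() seeks back here on seekable inputs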
+ private long mFirstRecordPointer = 0;
+    private CloseableIterator<SAMRecord> mCurrentIterator = null;
+
+ // If true, all SAMRecords are fully decoded as they are read.
+ private final boolean eagerDecode;
+
+ // For error-checking.
+ private ValidationStringency mValidationStringency;
+
+ // For creating BAMRecords
+ private SAMRecordFactory samRecordFactory;
+
+ /**
+ * Use the caching index reader implementation rather than the disk-hit-per-file model.
+ */
+ private boolean mEnableIndexCaching = false;
+
+ /**
+ * Use the traditional memory-mapped implementation for BAM file indexes rather than regular I/O.
+ */
+ private boolean mEnableIndexMemoryMapping = true;
+
+ /**
+ * Add information about the origin (reader and position) to SAM records.
+ */
+ private SAMFileReader mFileReader = null;
+
+ /**
+ * Prepare to read BAM from a stream (not seekable)
+ * @param stream source of bytes.
+ * @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
+     * @param validationStringency Controls how to handle invalid reads or header lines.
+ */
+ BAMFileReader(final InputStream stream,
+ final File indexFile,
+ final boolean eagerDecode,
+ final ValidationStringency validationStringency,
+ final SAMRecordFactory factory)
+ throws IOException {
+ mIndexFile = indexFile;
+ mIsSeekable = false;
+ mInputStream = stream instanceof BAMInputStream ? (BAMInputStream)stream : new BlockCompressedInputStream(stream);
+ mStream = new BinaryCodec(new DataInputStream((InputStream)mInputStream));
+ this.eagerDecode = eagerDecode;
+ this.mValidationStringency = validationStringency;
+ this.samRecordFactory = factory;
+ readHeader(null);
+ }
+
+ /**
+ * Prepare to read BAM from a file (seekable)
+ * @param file source of bytes.
+ * @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
+     * @param validationStringency Controls how to handle invalid reads or header lines.
+ */
+ BAMFileReader(final File file,
+ final File indexFile,
+ final boolean eagerDecode,
+ final ValidationStringency validationStringency,
+ final SAMRecordFactory factory)
+ throws IOException {
+ this(new BlockCompressedInputStream(file), indexFile!=null ? indexFile : findIndexFile(file), eagerDecode, file.getAbsolutePath(), validationStringency, factory);
+ if (mIndexFile != null && mIndexFile.lastModified() < file.lastModified()) {
+ System.err.println("WARNING: BAM index file " + mIndexFile.getAbsolutePath() +
+ " is older than BAM " + file.getAbsolutePath());
+ }
+ }
+
+ BAMFileReader(final SeekableStream strm,
+ final File indexFile,
+ final boolean eagerDecode,
+ final ValidationStringency validationStringency,
+ final SAMRecordFactory factory)
+ throws IOException {
+ this(strm instanceof BAMInputStream ? (BAMInputStream)strm : new BlockCompressedInputStream(strm),
+ indexFile,
+ eagerDecode,
+ strm.getSource(),
+ validationStringency,
+ factory);
+ }
+
+ private BAMFileReader(final BAMInputStream inputStream,
+ final File indexFile,
+ final boolean eagerDecode,
+ final String source,
+ final ValidationStringency validationStringency,
+ final SAMRecordFactory factory)
+ throws IOException {
+ mIndexFile = indexFile;
+ mIsSeekable = true;
+ mInputStream = inputStream;
+ mStream = new BinaryCodec(new DataInputStream((InputStream)inputStream));
+ this.eagerDecode = eagerDecode;
+ this.mValidationStringency = validationStringency;
+ this.samRecordFactory = factory;
+ readHeader(source);
+ mFirstRecordPointer = inputStream.getFilePointer();
+ }
+
+ /**
+ * If true, writes the source of every read into the source SAMRecords.
+ * @param enabled true to write source information into each SAMRecord.
+ */
+ void enableFileSource(final SAMFileReader reader, final boolean enabled) {
+ this.mFileReader = enabled ? reader : null;
+ }
+
+ /**
+ * If true, uses the caching version of the index reader.
+     * @param enabled true to use the caching index reader implementation.
+ */
+ public void enableIndexCaching(final boolean enabled) {
+ if(mIndex != null)
+ throw new SAMException("Unable to turn on index caching; index file has already been loaded.");
+ this.mEnableIndexCaching = enabled;
+ }
+
+ /**
+ * If false, disable the use of memory mapping for accessing index files (default behavior is to use memory mapping).
+ * This is slower but more scalable when accessing large numbers of BAM files sequentially.
+ * @param enabled True to use memory mapping, false to use regular I/O.
+ */
+ public void enableIndexMemoryMapping(final boolean enabled) {
+ if (mIndex != null) {
+ throw new SAMException("Unable to change index memory mapping; index file has already been loaded.");
+ }
+ this.mEnableIndexMemoryMapping = enabled;
+ }
+
+ @Override void enableCrcChecking(final boolean enabled) {
+ this.mInputStream.setCheckCrcs(enabled);
+ }
+
+ @Override void setSAMRecordFactory(final SAMRecordFactory factory) { this.samRecordFactory = factory; }
+
+ /**
+     * @return true if this is a BAM file, and has an index
+ */
+ public boolean hasIndex() {
+ return (mIndexFile != null);
+ }
+
+ /**
+ * Retrieves the index for the given file type. Ensure that the index is of the specified type.
+ * @return An index of the given type.
+ */
+ public BAMIndex getIndex() {
+ if(mIndexFile == null)
+ throw new SAMException("No index is available for this BAM file.");
+ if(mIndex == null)
+ mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping)
+ : new DiskBasedBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping);
+ return mIndex;
+ }
+
+ void close() {
+ if (mStream != null) {
+ mStream.close();
+ }
+ if (mIndex != null) {
+ mIndex.close();
+ }
+ mStream = null;
+ mFileHeader = null;
+ mIndex = null;
+ }
+
+ SAMFileHeader getFileHeader() {
+ return mFileHeader;
+ }
+
+ /**
+ * Set error-checking level for subsequent SAMRecord reads.
+ */
+ void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) {
+ this.mValidationStringency = validationStringency;
+ }
+
+ SAMFileReader.ValidationStringency getValidationStringency() {
+ return this.mValidationStringency;
+ }
+
+ /**
+ * Prepare to iterate through the SAMRecords in file order.
+ * Only a single iterator on a BAM file can be extant at a time. If getIterator() or a query method has been called once,
+ * that iterator must be closed before getIterator() can be called again.
+ * A somewhat peculiar aspect of this method is that if the file is not seekable, a second call to
+ * getIterator() begins its iteration where the last one left off. That is the best that can be
+ * done in that situation.
+ */
+    CloseableIterator<SAMRecord> getIterator() {
+ if (mStream == null) {
+ throw new IllegalStateException("File reader is closed");
+ }
+ if (mCurrentIterator != null) {
+ throw new IllegalStateException("Iteration in progress");
+ }
+ if (mIsSeekable) {
+ try {
+ mInputStream.seek(mFirstRecordPointer);
+ } catch (IOException exc) {
+ throw new RuntimeException(exc.getMessage(), exc);
+ }
+ }
+ mCurrentIterator = new BAMFileIterator();
+ return mCurrentIterator;
+ }
+
+ @Override
+    CloseableIterator<SAMRecord> getIterator(final SAMFileSpan chunks) {
+ if (mStream == null) {
+ throw new IllegalStateException("File reader is closed");
+ }
+ if (mCurrentIterator != null) {
+ throw new IllegalStateException("Iteration in progress");
+ }
+ if (!(chunks instanceof BAMFileSpan)) {
+ throw new IllegalStateException("BAMFileReader cannot handle this type of file span.");
+ }
+
+ // Create an iterator over the given chunk boundaries.
+ mCurrentIterator = new BAMFileIndexIterator(((BAMFileSpan)chunks).toCoordinateArray());
+ return mCurrentIterator;
+ }
+
+ /**
+ * Gets an unbounded pointer to the first record in the BAM file. Because the reader doesn't necessarily know
+ * when the file ends, the rightmost bound of the file pointer will not end exactly where the file ends. However,
+ * the rightmost bound is guaranteed to be after the last read in the file.
+ * @return An unbounded pointer to the first record in the BAM file.
+ */
+ @Override
+ SAMFileSpan getFilePointerSpanningReads() {
+ return new BAMFileSpan(new Chunk(mFirstRecordPointer,Long.MAX_VALUE));
+ }
+
+ /**
+ * Prepare to iterate through the SAMRecords that match the given interval.
+ * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
+ * before calling any of the methods that return an iterator.
+ *
+ * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
+ * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
+ * matches the specified interval.
+ *
+ * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
+ * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
+ *
+ * @param sequence Reference sequence sought.
+ * @param start Desired SAMRecords must overlap or be contained in the interval specified by start and end.
+ * A value of zero implies the start of the reference sequence.
+ * @param end A value of zero implies the end of the reference sequence.
+ * @param contained If true, the alignments for the SAMRecords must be completely contained in the interval
+ * specified by start and end. If false, the SAMRecords need only overlap the interval.
+ * @return Iterator for the matching SAMRecords
+ */
+    CloseableIterator<SAMRecord> query(final String sequence, final int start, final int end, final boolean contained) {
+ if (mStream == null) {
+ throw new IllegalStateException("File reader is closed");
+ }
+ if (mCurrentIterator != null) {
+ throw new IllegalStateException("Iteration in progress");
+ }
+ if (!mIsSeekable) {
+ throw new UnsupportedOperationException("Cannot query stream-based BAM file");
+ }
+ mCurrentIterator = createIndexIterator(sequence, start, end, contained? QueryType.CONTAINED: QueryType.OVERLAPPING);
+ return mCurrentIterator;
+ }
+
+ /**
+ * Prepare to iterate through the SAMRecords with the given alignment start.
+ * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
+ * before calling any of the methods that return an iterator.
+ *
+ * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
+ * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
+ * matches the specified interval.
+ *
+ * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
+ * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
+ *
+ * @param sequence Reference sequence sought.
+ * @param start Alignment start sought.
+ * @return Iterator for the matching SAMRecords.
+ */
+ CloseableIterator<SAMRecord> queryAlignmentStart(final String sequence, final int start) {
+ if (mStream == null) {
+ throw new IllegalStateException("File reader is closed");
+ }
+ if (mCurrentIterator != null) {
+ throw new IllegalStateException("Iteration in progress");
+ }
+ if (!mIsSeekable) {
+ throw new UnsupportedOperationException("Cannot query stream-based BAM file");
+ }
+ mCurrentIterator = createIndexIterator(sequence, start, -1, QueryType.STARTING_AT);
+ return mCurrentIterator;
+ }
+
+ public CloseableIterator<SAMRecord> queryUnmapped() {
+ if (mStream == null) {
+ throw new IllegalStateException("File reader is closed");
+ }
+ if (mCurrentIterator != null) {
+ throw new IllegalStateException("Iteration in progress");
+ }
+ if (!mIsSeekable) {
+ throw new UnsupportedOperationException("Cannot query stream-based BAM file");
+ }
+ try {
+ final long startOfLastLinearBin = getIndex().getStartOfLastLinearBin();
+ if (startOfLastLinearBin != -1) {
+ mInputStream.seek(startOfLastLinearBin);
+ } else {
+ // No mapped reads in file, just start at the first read in file.
+ mInputStream.seek(mFirstRecordPointer);
+ }
+ mCurrentIterator = new BAMFileIndexUnmappedIterator();
+ return mCurrentIterator;
+ } catch (IOException e) {
+ throw new RuntimeException("IOException seeking to unmapped reads", e);
+ }
+ }
+
+ /**
+ * Reads the header from the file or stream
+ * @param source Note that this is used only for reporting errors.
+ */
+ private void readHeader(final String source)
+ throws IOException {
+
+ final byte[] buffer = new byte[4];
+ mStream.readBytes(buffer);
+ if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) {
+ throw new IOException("Invalid BAM file header");
+ }
+
+ final int headerTextLength = mStream.readInt();
+ final String textHeader = mStream.readString(headerTextLength);
+ final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec();
+ headerCodec.setValidationStringency(mValidationStringency);
+ mFileHeader = headerCodec.decode(new StringLineReader(textHeader),
+ source);
+
+ final int sequenceCount = mStream.readInt();
+ if (mFileHeader.getSequenceDictionary().size() > 0) {
+ // It is allowed to have binary sequences but no text sequences, so only validate if both are present
+ if (sequenceCount != mFileHeader.getSequenceDictionary().size()) {
+ throw new SAMFormatException("Number of sequences in text header (" +
+ mFileHeader.getSequenceDictionary().size() +
+ ") != number of sequences in binary header (" + sequenceCount + ") for file " + source);
+ }
+ for (int i = 0; i < sequenceCount; i++) {
+ final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(source);
+ final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i);
+ if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) {
+ throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " +
+ source);
+ }
+ if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) {
+ throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " +
+ source);
+ }
+ }
+ } else {
+ // If only binary sequences are present, copy them into mFileHeader
+ final List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>(sequenceCount);
+ for (int i = 0; i < sequenceCount; i++) {
+ sequences.add(readSequenceRecord(source));
+ }
+ mFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences));
+ }
+ }
+
+ /**
+ * Reads a single binary sequence record from the file or stream
+ * @param source Note that this is used only for reporting errors.
+ */
+ private SAMSequenceRecord readSequenceRecord(final String source) {
+ final int nameLength = mStream.readInt();
+ if (nameLength <= 1) {
+ throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + source);
+ }
+ final String sequenceName = mStream.readString(nameLength - 1);
+ // Skip the null terminator
+ mStream.readByte();
+ final int sequenceLength = mStream.readInt();
+ return new SAMSequenceRecord(SAMSequenceRecord.truncateSequenceName(sequenceName), sequenceLength);
+ }
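+
+ // Illustrative note: each binary reference record consumed by
+ // readSequenceRecord() follows the BAM spec layout:
+ //   int32 l_name;        // name length, including the NUL terminator
+ //   char  name[l_name];  // NUL-terminated reference name
+ //   int32 l_ref;         // reference sequence length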
+
+ /**
+ * Iterator for non-indexed sequential iteration through all SAMRecords in file.
+ * Starting point of iteration is wherever current file position is when the iterator is constructed.
+ */
+ private class BAMFileIterator implements CloseableIterator<SAMRecord> {
+ private SAMRecord mNextRecord = null;
+ private final BAMRecordCodec bamRecordCodec;
+ private long samRecordIndex = 0; // Current position in the file, counted in records
+
+ BAMFileIterator() {
+ this(true);
+ }
+
+ /**
+ * @param advance if false, the initial advance() is deferred so a subclass can finish its setup first
+ */
+ BAMFileIterator(final boolean advance) {
+ this.bamRecordCodec = new BAMRecordCodec(getFileHeader(), samRecordFactory);
+ this.bamRecordCodec.setInputStream(BAMFileReader.this.mStream.getInputStream());
+
+ if (advance) {
+ advance();
+ }
+ }
+
+ public void close() {
+ if (mCurrentIterator != null && this != mCurrentIterator) {
+ throw new IllegalStateException("Attempt to close non-current iterator");
+ }
+ mCurrentIterator = null;
+ }
+
+ public boolean hasNext() {
+ return (mNextRecord != null);
+ }
+
+ public SAMRecord next() {
+ final SAMRecord result = mNextRecord;
+ advance();
+ return result;
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException("Not supported: remove");
+ }
+
+ void advance() {
+ try {
+ mNextRecord = getNextRecord();
+
+ if (mNextRecord != null) {
+ ++this.samRecordIndex;
+ // Because some decoding is done lazily, the record needs to remember the validation stringency.
+ mNextRecord.setValidationStringency(mValidationStringency);
+
+ if (mValidationStringency != ValidationStringency.SILENT) {
+ final List<SAMValidationError> validationErrors = mNextRecord.isValid();
+ SAMUtils.processValidationErrors(validationErrors,
+ this.samRecordIndex, BAMFileReader.this.getValidationStringency());
+ }
+ }
+ if (eagerDecode && mNextRecord != null) {
+ mNextRecord.eagerDecode();
+ }
+ } catch (IOException exc) {
+ throw new RuntimeException(exc.getMessage(), exc);
+ }
+ }
+
+ /**
+ * Read the next record from the input stream.
+ */
+ SAMRecord getNextRecord() throws IOException {
+ final long startCoordinate = mInputStream.getFilePointer();
+ final SAMRecord next = bamRecordCodec.decode();
+ final long stopCoordinate = mInputStream.getFilePointer();
+
+ if(mFileReader != null && next != null)
+ next.setFileSource(new SAMFileSource(mFileReader,new BAMFileSpan(new Chunk(startCoordinate,stopCoordinate))));
+
+ return next;
+ }
+
+ /**
+ * @return The record that will be returned by the next call to next()
+ */
+ protected SAMRecord peek() {
+ return mNextRecord;
+ }
+ }
+
+ /**
+ * Prepare to iterate through SAMRecords matching the target interval.
+ * @param sequence Desired reference sequence.
+ * @param start 1-based start of target interval, inclusive.
+ * @param end 1-based end of target interval, inclusive.
+ * @param queryType contained, overlapping, or starting-at query.
+ */
+ private CloseableIterator<SAMRecord> createIndexIterator(final String sequence,
+ final int start,
+ final int end,
+ final QueryType queryType) {
+ long[] filePointers = null;
+
+ // Hit the index to determine the chunk boundaries for the required data.
+ final SAMFileHeader fileHeader = getFileHeader();
+ final int referenceIndex = fileHeader.getSequenceIndex(sequence);
+ if (referenceIndex != -1) {
+ final BAMIndex fileIndex = getIndex();
+ final BAMFileSpan fileSpan = fileIndex.getSpanOverlapping(referenceIndex, start, end);
+ filePointers = fileSpan != null ? fileSpan.toCoordinateArray() : null;
+ }
+
+ // Create an iterator over the above chunk boundaries.
+ final BAMFileIndexIterator iterator = new BAMFileIndexIterator(filePointers);
+
+ // Add some preprocessing filters for edge-case reads that don't fit into this
+ // query type.
+ return new BAMQueryFilteringIterator(iterator,sequence,start,end,queryType);
+ }
+
+ enum QueryType {CONTAINED, OVERLAPPING, STARTING_AT}
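+
+ // Illustrative example of the three query semantics: for a query over
+ // positions 1000-2000, CONTAINED keeps only reads with alignmentStart >= 1000
+ // and alignmentEnd <= 2000, OVERLAPPING keeps any read intersecting
+ // [1000, 2000], and STARTING_AT keeps reads whose alignmentStart is
+ // exactly 1000.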
+
+ /**
+ * Look for BAM index file according to standard naming convention.
+ *
+ * @param dataFile BAM file name.
+ * @return Index file name, or null if not found.
+ */
+ private static File findIndexFile(final File dataFile) {
+ // If input is foo.bam, look for foo.bai
+ final String bamExtension = ".bam";
+ File indexFile;
+ final String fileName = dataFile.getName();
+ if (fileName.endsWith(bamExtension)) {
+ final String bai = fileName.substring(0, fileName.length() - bamExtension.length()) + BAMIndex.BAMIndexSuffix;
+ indexFile = new File(dataFile.getParent(), bai);
+ if (indexFile.exists()) {
+ return indexFile;
+ }
+ }
+
+ // If foo.bai doesn't exist look for foo.bam.bai
+ indexFile = new File(dataFile.getParent(), dataFile.getName() + ".bai");
+ if (indexFile.exists()) {
+ return indexFile;
+ } else {
+ return null;
+ }
+ }
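+
+ // For example (illustrative): given /data/sample.bam, this returns
+ // /data/sample.bai if it exists, then falls back to /data/sample.bam.bai,
+ // and returns null if neither is present.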
+
+ private class BAMFileIndexIterator extends BAMFileIterator {
+
+ private long[] mFilePointers = null;
+ private int mFilePointerIndex = 0;
+ private long mFilePointerLimit = -1;
+
+ /**
+ * Prepare to iterate through SAMRecords stored in the specified compressed blocks at the given offset.
+ * @param filePointers the block / offset combination, stored in chunk format.
+ */
+ BAMFileIndexIterator(final long[] filePointers) {
+ super(false); // delay advance() until after construction
+ mFilePointers = filePointers;
+ advance();
+ }
+
+ SAMRecord getNextRecord()
+ throws IOException {
+ // Advance to next file block if necessary
+ while (mInputStream.getFilePointer() >= mFilePointerLimit) {
+ if (mFilePointers == null ||
+ mFilePointerIndex >= mFilePointers.length) {
+ return null;
+ }
+ final long startOffset = mFilePointers[mFilePointerIndex++];
+ final long endOffset = mFilePointers[mFilePointerIndex++];
+ mInputStream.seek(startOffset);
+ mFilePointerLimit = endOffset;
+ }
+ // Pull next record from stream
+ return super.getNextRecord();
+ }
+ }
+
+ /**
+ * A decorating iterator that filters out records that are outside the bounds of the
+ * given query parameters.
+ */
+ private class BAMQueryFilteringIterator implements CloseableIterator<SAMRecord> {
+ /**
+ * The wrapped iterator.
+ */
+ private final CloseableIterator<SAMRecord> wrappedIterator;
+
+ /**
+ * The next record to be returned. Will be null if no such record exists.
+ */
+ private SAMRecord mNextRecord;
+
+ private final int mReferenceIndex;
+ private final int mRegionStart;
+ private final int mRegionEnd;
+ private final QueryType mQueryType;
+
+ public BAMQueryFilteringIterator(final CloseableIterator<SAMRecord> iterator, final String sequence, final int start, final int end, final QueryType queryType) {
+ this.wrappedIterator = iterator;
+ final SAMFileHeader fileHeader = getFileHeader();
+ mReferenceIndex = fileHeader.getSequenceIndex(sequence);
+ mRegionStart = start;
+ if (queryType == QueryType.STARTING_AT) {
+ mRegionEnd = mRegionStart;
+ } else {
+ mRegionEnd = (end <= 0) ? Integer.MAX_VALUE : end;
+ }
+ mQueryType = queryType;
+ mNextRecord = advance();
+ }
+
+ /**
+ * Returns true if a next element exists; false otherwise.
+ */
+ public boolean hasNext() {
+ return mNextRecord != null;
+ }
+
+ /**
+ * Gets the next record from the given iterator.
+ * @return The next SAM record in the iterator.
+ */
+ public SAMRecord next() {
+ if(!hasNext())
+ throw new NoSuchElementException("BAMQueryFilteringIterator: no next element available");
+ final SAMRecord currentRead = mNextRecord;
+ mNextRecord = advance();
+ return currentRead;
+ }
+
+ /**
+ * Closes down the existing iterator.
+ */
+ public void close() {
+ if (this != mCurrentIterator) {
+ throw new IllegalStateException("Attempt to close non-current iterator");
+ }
+ mCurrentIterator = null;
+ }
+
+ /**
+ * @throws UnsupportedOperationException always.
+ */
+ public void remove() {
+ throw new UnsupportedOperationException("Not supported: remove");
+ }
+
+ SAMRecord advance() {
+ while (true) {
+ // Pull next record from stream
+ if(!wrappedIterator.hasNext())
+ return null;
+
+ final SAMRecord record = wrappedIterator.next();
+ // If beyond the end of this reference sequence, end iteration
+ final int referenceIndex = record.getReferenceIndex();
+ if (referenceIndex != mReferenceIndex) {
+ if (referenceIndex < 0 ||
+ referenceIndex > mReferenceIndex) {
+ return null;
+ }
+ // If before this reference sequence, continue
+ continue;
+ }
+ if (mRegionStart == 0 && mRegionEnd == Integer.MAX_VALUE) {
+ // Quick exit to avoid expensive alignment end calculation
+ return record;
+ }
+ final int alignmentStart = record.getAlignmentStart();
+ // If read is unmapped but has a coordinate, return it if the coordinate is within
+ // the query region, regardless of whether the mapped mate will be returned.
+ final int alignmentEnd;
+ if (mQueryType == QueryType.STARTING_AT) {
+ alignmentEnd = -1;
+ } else {
+ alignmentEnd = (record.getAlignmentEnd() != SAMRecord.NO_ALIGNMENT_START?
+ record.getAlignmentEnd(): alignmentStart);
+ }
+
+ if (alignmentStart > mRegionEnd) {
+ // If scanned beyond target region, end iteration
+ return null;
+ }
+ // Filter for overlap with region
+ if (mQueryType == QueryType.CONTAINED) {
+ if (alignmentStart >= mRegionStart && alignmentEnd <= mRegionEnd) {
+ return record;
+ }
+ } else if (mQueryType == QueryType.OVERLAPPING) {
+ if (alignmentEnd >= mRegionStart && alignmentStart <= mRegionEnd) {
+ return record;
+ }
+ } else {
+ if (alignmentStart == mRegionStart) {
+ return record;
+ }
+ }
+ }
+ }
+ }
+
+ private class BAMFileIndexUnmappedIterator extends BAMFileIterator {
+ private BAMFileIndexUnmappedIterator() {
+ while (this.hasNext() && peek().getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
+ advance();
+ }
+ }
+ }
+
+}
diff --git a/public/java/src/net/sf/samtools/GATKBAMFileSpan.java b/public/java/src/net/sf/samtools/GATKBAMFileSpan.java
index 623f46291e..4692c66711 100644
--- a/public/java/src/net/sf/samtools/GATKBAMFileSpan.java
+++ b/public/java/src/net/sf/samtools/GATKBAMFileSpan.java
@@ -25,6 +25,7 @@
package net.sf.samtools;
import net.sf.picard.util.PeekableIterator;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.Arrays;
@@ -47,6 +48,18 @@ public GATKBAMFileSpan() {
super();
}
+ /**
+ * Create a new GATKBAMFileSpan from an existing BAMFileSpan.
+ * @param sourceFileSpan the BAMFileSpan whose chunks are copied into this GATKBAMFileSpan.
+ */
+ public GATKBAMFileSpan(SAMFileSpan sourceFileSpan) {
+ if(!(sourceFileSpan instanceof BAMFileSpan))
+ throw new SAMException("Unable to create GATKBAMFileSpan from a SAMFileSpan. Please submit a BAMFileSpan instead");
+ BAMFileSpan sourceBAMFileSpan = (BAMFileSpan)sourceFileSpan;
+ for(Chunk chunk: sourceBAMFileSpan.getChunks())
+ add(chunk instanceof GATKChunk ? chunk : new GATKChunk(chunk));
+ }
+
/**
* Convenience constructor to construct a BAM file span from
* a single chunk.
diff --git a/public/java/src/net/sf/samtools/GATKChunk.java b/public/java/src/net/sf/samtools/GATKChunk.java
index f590809e20..5d349e72e6 100644
--- a/public/java/src/net/sf/samtools/GATKChunk.java
+++ b/public/java/src/net/sf/samtools/GATKChunk.java
@@ -69,6 +69,22 @@ public void setChunkEnd(final long value) {
super.setChunkEnd(value);
}
+ public long getBlockStart() {
+ return getChunkStart() >>> 16;
+ }
+
+ public int getBlockOffsetStart() {
+ return (int)(getChunkStart() & 0xFFFF);
+ }
+
+ public long getBlockEnd() {
+ return getChunkEnd() >>> 16;
+ }
+
+ public int getBlockOffsetEnd() {
+ return (int)(getChunkEnd() & 0xFFFF);
+ }
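+
+ // Worked example (illustrative): a chunk start of 0x0000000001234567L
+ // yields getBlockStart() == 0x123 (byte offset of the BGZF block within
+ // the compressed file) and getBlockOffsetStart() == 0x4567 (offset into
+ // that block's uncompressed data).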
+
/**
* Computes an approximation of the uncompressed size of the
* chunk, in bytes. Can be used to determine relative weights
diff --git a/public/java/src/net/sf/samtools/util/BAMInputStream.java b/public/java/src/net/sf/samtools/util/BAMInputStream.java
new file mode 100644
index 0000000000..d825c23d51
--- /dev/null
+++ b/public/java/src/net/sf/samtools/util/BAMInputStream.java
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package net.sf.samtools.util;
+
+import java.io.IOException;
+
+/**
+ * An input stream formulated for use reading BAM files. Supports seeking to, and reporting of, virtual file pointers.
+ */
+public interface BAMInputStream {
+ /**
+ * Seek to the given position in the file. Note that pos is a special virtual file pointer,
+ * not an actual byte offset.
+ *
+ * @param pos virtual file pointer
+ */
+ public void seek(final long pos) throws IOException;
+
+ /**
+ * @return virtual file pointer that can be passed to seek() to return to the current position. This is
+ * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
+ * the two.
+ */
+ public long getFilePointer();
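+
+ // Sketch of the virtual file pointer encoding assumed by seek() and
+ // getFilePointer() (illustrative, mirroring BlockCompressedFilePointerUtil):
+ //   long pointer      = (blockAddress << 16) | uncompressedOffset;
+ //   long blockAddress = pointer >>> 16;          // compressed byte offset of the block
+ //   int  blockOffset  = (int)(pointer & 0xFFFF); // offset within the uncompressed block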
+
+ /**
+ * Determines whether or not the inflater will re-calculate the CRC on the decompressed data
+ * and check it against the value stored in the GZIP header. CRC checking is an expensive
+ * operation and should be used accordingly.
+ */
+ public void setCheckCrcs(final boolean check);
+
+ public int read() throws java.io.IOException;
+
+ public int read(byte[] bytes) throws java.io.IOException;
+
+ public int read(byte[] bytes, int i, int i1) throws java.io.IOException;
+
+ public long skip(long l) throws java.io.IOException;
+
+ public int available() throws java.io.IOException;
+
+ public void close() throws java.io.IOException;
+
+ public void mark(int i);
+
+ public void reset() throws java.io.IOException;
+
+ public boolean markSupported();
+}
diff --git a/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java b/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java
new file mode 100755
index 0000000000..fae2fc89b4
--- /dev/null
+++ b/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java
@@ -0,0 +1,483 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2009 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+package net.sf.samtools.util;
+
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.RandomAccessFile;
+import java.net.URL;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.Arrays;
+
+import net.sf.samtools.FileTruncatedException;
+
+/*
+ * Utility class for reading BGZF block compressed files. The caller can treat this file like any other InputStream.
+ * It probably is not necessary to wrap this stream in a buffering stream, because there is internal buffering.
+ * The advantage of BGZF over conventional GZip format is that BGZF allows for seeking without having to read the
+ * entire file up to the location being sought. Note that seeking is only possible if the ctor(File) is used.
+ *
+ * c.f. http://samtools.sourceforge.net/SAM1.pdf for details of BGZF format
+ */
+public class BlockCompressedInputStream extends InputStream implements BAMInputStream {
+ private InputStream mStream = null;
+ private SeekableStream mFile = null;
+ private byte[] mFileBuffer = null;
+ private byte[] mCurrentBlock = null;
+ private int mCurrentOffset = 0;
+ private long mBlockAddress = 0;
+ private int mLastBlockLength = 0;
+ private final BlockGunzipper blockGunzipper = new BlockGunzipper();
+
+
+ /**
+ * Note that seek() is not supported if this ctor is used.
+ */
+ public BlockCompressedInputStream(final InputStream stream) {
+ mStream = IOUtil.toBufferedStream(stream);
+ mFile = null;
+ }
+
+ /**
+ * Use this ctor if you wish to call seek()
+ */
+ public BlockCompressedInputStream(final File file)
+ throws IOException {
+ mFile = new SeekableFileStream(file);
+ mStream = null;
+
+ }
+
+ public BlockCompressedInputStream(final URL url) {
+ mFile = new SeekableBufferedStream(new SeekableHTTPStream(url));
+ mStream = null;
+ }
+
+ /**
+ * For providing some arbitrary data source. No additional buffering is
+ * provided, so if the underlying source is not buffered, wrap it in a
+ * SeekableBufferedStream before passing to this ctor.
+ */
+ public BlockCompressedInputStream(final SeekableStream strm) {
+ mFile = strm;
+ mStream = null;
+ }
+
+ /**
+ * Determines whether or not the inflater will re-calculate the CRC on the decompressed data
+ * and check it against the value stored in the GZIP header. CRC checking is an expensive
+ * operation and should be used accordingly.
+ */
+ public void setCheckCrcs(final boolean check) {
+ this.blockGunzipper.setCheckCrcs(check);
+ }
+
+ /**
+ * @return the number of bytes that can be read (or skipped over) from this input stream without blocking by the
+ * next caller of a method for this input stream. The next caller might be the same thread or another thread.
+ * Note that although the next caller can read this many bytes without blocking, the available() method call itself
+ * may block in order to fill an internal buffer if it has been exhausted.
+ */
+ public int available()
+ throws IOException {
+ if (mCurrentBlock == null || mCurrentOffset == mCurrentBlock.length) {
+ readBlock();
+ }
+ if (mCurrentBlock == null) {
+ return 0;
+ }
+ return mCurrentBlock.length - mCurrentOffset;
+ }
+
+ /**
+ * Closes the underlying InputStream or RandomAccessFile
+ */
+ public void close()
+ throws IOException {
+ if (mFile != null) {
+ mFile.close();
+ mFile = null;
+ } else if (mStream != null) {
+ mStream.close();
+ mStream = null;
+ }
+ // Encourage garbage collection
+ mFileBuffer = null;
+ mCurrentBlock = null;
+ }
+
+ /**
+ * Reads the next byte of data from the input stream. The value byte is returned as an int in the range 0 to 255.
+ * If no byte is available because the end of the stream has been reached, the value -1 is returned.
+ * This method blocks until input data is available, the end of the stream is detected, or an exception is thrown.
+ *
+ * @return the next byte of data, or -1 if the end of the stream is reached.
+ */
+ public int read()
+ throws IOException {
+ return (available() > 0) ? (mCurrentBlock[mCurrentOffset++] & 0xFF) : -1;
+ }
+
+ /**
+ * Reads some number of bytes from the input stream and stores them into the buffer array b. The number of bytes
+ * actually read is returned as an integer. This method blocks until input data is available, end of file is detected,
+ * or an exception is thrown.
+ *
+ * read(buf) has the same effect as read(buf, 0, buf.length).
+ *
+ * @param buffer the buffer into which the data is read.
+ * @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of
+ * the stream has been reached.
+ */
+ public int read(final byte[] buffer)
+ throws IOException {
+ return read(buffer, 0, buffer.length);
+ }
+
+ private volatile ByteArrayOutputStream buf = null;
+ private static final byte eol = '\n';
+ private static final byte eolCr = '\r';
+
+ /**
+ * Reads a whole line. A line is considered to be terminated by either a line feed ('\n'),
+ * carriage return ('\r') or carriage return followed by a line feed ("\r\n").
+ *
+ * @return A String containing the contents of the line, excluding the line terminating
+ * character, or null if the end of the stream has been reached
+ *
+ * @exception IOException If an I/O error occurs
+ */
+ public String readLine() throws IOException {
+ int available = available();
+ if (available == 0) {
+ return null;
+ }
+ if(null == buf){ // lazy initialisation
+ buf = new ByteArrayOutputStream(8192);
+ }
+ buf.reset();
+ boolean done = false;
+ boolean foundCr = false; // \r found flag
+ while (!done) {
+ int linetmpPos = mCurrentOffset;
+ int bCnt = 0;
+ while((available-- > 0)){
+ final byte c = mCurrentBlock[linetmpPos++];
+ if(c == eol){ // found \n
+ done = true;
+ break;
+ } else if(foundCr){ // previous char was \r
+ --linetmpPos; // current char is not \n so put it back
+ done = true;
+ break;
+ } else if(c == eolCr){ // found \r
+ foundCr = true;
+ continue; // no ++bCnt
+ }
+ ++bCnt;
+ }
+ if(mCurrentOffset < linetmpPos){
+ buf.write(mCurrentBlock, mCurrentOffset, bCnt);
+ mCurrentOffset = linetmpPos;
+ }
+ available = available();
+ if(available == 0){
+ // EOF
+ done = true;
+ }
+ }
+ return buf.toString();
+ }
+
+ /**
+ * Reads up to len bytes of data from the input stream into an array of bytes. An attempt is made to read
+ * as many as len bytes, but a smaller number may be read. The number of bytes actually read is returned as an integer.
+ *
+ * This method blocks until input data is available, end of file is detected, or an exception is thrown.
+ *
+ * @param buffer buffer into which data is read.
+ * @param offset the start offset in array b at which the data is written.
+ * @param length the maximum number of bytes to read.
+ * @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of
+ * the stream has been reached.
+ */
+ public int read(final byte[] buffer, int offset, int length)
+ throws IOException {
+ final int originalLength = length;
+ while (length > 0) {
+ final int available = available();
+ if (available == 0) {
+ // Signal EOF to caller
+ if (originalLength == length) {
+ return -1;
+ }
+ break;
+ }
+ final int copyLength = Math.min(length, available);
+ System.arraycopy(mCurrentBlock, mCurrentOffset, buffer, offset, copyLength);
+ mCurrentOffset += copyLength;
+ offset += copyLength;
+ length -= copyLength;
+ }
+ return originalLength - length;
+ }
+
+ /**
+ * Seek to the given position in the file. Note that pos is a special virtual file pointer,
+ * not an actual byte offset.
+ *
+ * @param pos virtual file pointer
+ */
+ public void seek(final long pos)
+ throws IOException {
+ if (mFile == null) {
+ throw new IOException("Cannot seek on stream based file");
+ }
+ // Decode virtual file pointer
+ // Upper 48 bits is the byte offset into the compressed stream of a block.
+ // Lower 16 bits is the byte offset into the uncompressed stream inside the block.
+ final long compressedOffset = BlockCompressedFilePointerUtil.getBlockAddress(pos);
+ final int uncompressedOffset = BlockCompressedFilePointerUtil.getBlockOffset(pos);
+ final int available;
+ if (mBlockAddress == compressedOffset && mCurrentBlock != null) {
+ available = mCurrentBlock.length;
+ } else {
+ mFile.seek(compressedOffset);
+ mBlockAddress = compressedOffset;
+ mLastBlockLength = 0;
+ readBlock();
+ available = available();
+ }
+ if (uncompressedOffset > available ||
+ (uncompressedOffset == available && !eof())) {
+ throw new IOException("Invalid file pointer: " + pos);
+ }
+ mCurrentOffset = uncompressedOffset;
+ }
+
+ private boolean eof() throws IOException {
+ if (mFile.eof()) {
+ return true;
+ }
+ // If the last remaining block is the size of the EMPTY_GZIP_BLOCK, this is the same as being at EOF.
+ return (mFile.length() - (mBlockAddress + mLastBlockLength) == BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
+ }
+
+ /**
+ * @return virtual file pointer that can be passed to seek() to return to the current position. This is
+ * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
+ * the two.
+ */
+ public long getFilePointer() {
+ if (mCurrentOffset == mCurrentBlock.length) {
+ // If current offset is at the end of the current block, file pointer should point
+ // to the beginning of the next block.
+ return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress + mLastBlockLength, 0);
+ }
+ return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress, mCurrentOffset);
+ }
+
+ public static long getFileBlock(final long bgzfOffset) {
+ return BlockCompressedFilePointerUtil.getBlockAddress(bgzfOffset);
+ }
+
+ /**
+ * @param stream Must be at start of file. Throws RuntimeException if !stream.markSupported().
+ * @return true if the given file looks like a valid BGZF file.
+ */
+ public static boolean isValidFile(final InputStream stream)
+ throws IOException {
+ if (!stream.markSupported()) {
+ throw new RuntimeException("Cannot test non-buffered stream");
+ }
+ stream.mark(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
+ final byte[] buffer = new byte[BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH];
+ final int count = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
+ stream.reset();
+ return count == BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH && isValidBlockHeader(buffer);
+ }
+
+ private static boolean isValidBlockHeader(final byte[] buffer) {
+ return (buffer[0] == BlockCompressedStreamConstants.GZIP_ID1 &&
+ (buffer[1] & 0xFF) == BlockCompressedStreamConstants.GZIP_ID2 &&
+ (buffer[3] & BlockCompressedStreamConstants.GZIP_FLG) != 0 &&
+ buffer[10] == BlockCompressedStreamConstants.GZIP_XLEN &&
+ buffer[12] == BlockCompressedStreamConstants.BGZF_ID1 &&
+ buffer[13] == BlockCompressedStreamConstants.BGZF_ID2);
+ }
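+
+ // The checks above follow the BGZF header layout from the SAM spec:
+ // bytes 0-1 hold the gzip magic (0x1f, 0x8b), byte 3 must have the FEXTRA
+ // flag set, byte 10 holds XLEN, and bytes 12-13 hold the BGZF extra-field
+ // identifier "BC".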
+
+ private void readBlock()
+ throws IOException {
+
+ if (mFileBuffer == null) {
+ mFileBuffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE];
+ }
+ int count = readBytes(mFileBuffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
+ if (count == 0) {
+ // Handle case where there is no empty gzip block at end.
+ mCurrentOffset = 0;
+ mBlockAddress += mLastBlockLength;
+ mCurrentBlock = new byte[0];
+ return;
+ }
+ if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) {
+ throw new IOException("Premature end of file");
+ }
+ final int blockLength = unpackInt16(mFileBuffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1;
+ if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > mFileBuffer.length) {
+ throw new IOException("Unexpected compressed block length: " + blockLength);
+ }
+ final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH;
+ count = readBytes(mFileBuffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, remaining);
+ if (count != remaining) {
+ throw new FileTruncatedException("Premature end of file");
+ }
+ inflateBlock(mFileBuffer, blockLength);
+ mCurrentOffset = 0;
+ mBlockAddress += mLastBlockLength;
+ mLastBlockLength = blockLength;
+ }
+
+ private void inflateBlock(final byte[] compressedBlock, final int compressedLength)
+ throws IOException {
+ final int uncompressedLength = unpackInt32(compressedBlock, compressedLength-4);
+ byte[] buffer = mCurrentBlock;
+ mCurrentBlock = null;
+ if (buffer == null || buffer.length != uncompressedLength) {
+ try {
+ buffer = new byte[uncompressedLength];
+ } catch (NegativeArraySizeException e) {
+ throw new RuntimeException("BGZF file has invalid uncompressedLength: " + uncompressedLength, e);
+ }
+ }
+ blockGunzipper.unzipBlock(buffer, compressedBlock, compressedLength);
+ mCurrentBlock = buffer;
+ }
+
+ private int readBytes(final byte[] buffer, final int offset, final int length)
+ throws IOException {
+ if (mFile != null) {
+ return readBytes(mFile, buffer, offset, length);
+ } else if (mStream != null) {
+ return readBytes(mStream, buffer, offset, length);
+ } else {
+ return 0;
+ }
+ }
+
+ private static int readBytes(final SeekableStream file, final byte[] buffer, final int offset, final int length)
+ throws IOException {
+ int bytesRead = 0;
+ while (bytesRead < length) {
+ final int count = file.read(buffer, offset + bytesRead, length - bytesRead);
+ if (count <= 0) {
+ break;
+ }
+ bytesRead += count;
+ }
+ return bytesRead;
+ }
+
+ private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length)
+ throws IOException {
+ int bytesRead = 0;
+ while (bytesRead < length) {
+ final int count = stream.read(buffer, offset + bytesRead, length - bytesRead);
+ if (count <= 0) {
+ break;
+ }
+ bytesRead += count;
+ }
+ return bytesRead;
+ }
+
+ private int unpackInt16(final byte[] buffer, final int offset) {
+ return ((buffer[offset] & 0xFF) |
+ ((buffer[offset+1] & 0xFF) << 8));
+ }
+
+ private int unpackInt32(final byte[] buffer, final int offset) {
+ return ((buffer[offset] & 0xFF) |
+ ((buffer[offset+1] & 0xFF) << 8) |
+ ((buffer[offset+2] & 0xFF) << 16) |
+ ((buffer[offset+3] & 0xFF) << 24));
+ }
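+
+ // Worked example (illustrative): for buffer = {0x67, 0x45, 0x23, 0x01},
+ // unpackInt32(buffer, 0) assembles the little-endian value 0x01234567;
+ // unpackInt16(buffer, 0) likewise yields 0x4567.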
+
+ public enum FileTermination {HAS_TERMINATOR_BLOCK, HAS_HEALTHY_LAST_BLOCK, DEFECTIVE}
+
+ public static FileTermination checkTermination(final File file)
+ throws IOException {
+ final long fileSize = file.length();
+ if (fileSize < BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length) {
+ return FileTermination.DEFECTIVE;
+ }
+ final RandomAccessFile raFile = new RandomAccessFile(file, "r");
+ try {
+ raFile.seek(fileSize - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
+ byte[] buf = new byte[BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length];
+ raFile.readFully(buf);
+ if (Arrays.equals(buf, BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK)) {
+ return FileTermination.HAS_TERMINATOR_BLOCK;
+ }
+ final int bufsize = (int)Math.min(fileSize, BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE);
+ buf = new byte[bufsize];
+ raFile.seek(fileSize - bufsize);
+ raFile.read(buf);
+ for (int i = buf.length - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length;
+ i >= 0; --i) {
+ if (!preambleEqual(BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE,
+ buf, i, BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length)) {
+ continue;
+ }
+ final ByteBuffer byteBuffer = ByteBuffer.wrap(buf, i + BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length, 4);
+ byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
+ final int totalBlockSizeMinusOne = byteBuffer.getShort() & 0xFFFF;
+ if (buf.length - i == totalBlockSizeMinusOne + 1) {
+ return FileTermination.HAS_HEALTHY_LAST_BLOCK;
+ } else {
+ return FileTermination.DEFECTIVE;
+ }
+ }
+ return FileTermination.DEFECTIVE;
+ } finally {
+ raFile.close();
+ }
+ }
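+
+ // Example use (illustrative; the file name is hypothetical):
+ //   if (checkTermination(new File("reads.bam")) == FileTermination.DEFECTIVE)
+ //       throw new IOException("Truncated or corrupt BGZF file: reads.bam");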
+
+ private static boolean preambleEqual(final byte[] preamble, final byte[] buf, final int startOffset, final int length) {
+ for (int i = 0; i < length; ++i) {
+ if (preamble[i] != buf[i + startOffset]) {
+ return false;
+ }
+ }
+ return true;
+ }
+}
+
+
diff --git a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java
index bed1e710e6..9e1be5bca1 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java
@@ -331,12 +331,12 @@ private static void errorPrintf(String format, Object... s) {
* used to indicate an error occurred
*
* @param msg the message
- * @param e the error
+ * @param t the error
*/
- public static void exitSystemWithError(String msg, final Exception e) {
+ public static void exitSystemWithError(String msg, final Throwable t) {
errorPrintf("------------------------------------------------------------------------------------------%n");
errorPrintf("stack trace %n");
- e.printStackTrace();
+ t.printStackTrace();
errorPrintf("------------------------------------------------------------------------------------------%n");
errorPrintf("A GATK RUNTIME ERROR has occurred (version %s):%n", CommandLineGATK.getVersionNumber());
@@ -392,10 +392,10 @@ public static void exitSystemWithSamError(final Exception e) {
/**
* used to indicate an error occurred
*
- * @param e the exception occured
+ * @param t the exception that occurred
*/
- public static void exitSystemWithError(Exception e) {
- exitSystemWithError(e.getMessage(), e);
+ public static void exitSystemWithError(Throwable t) {
+ exitSystemWithError(t.getMessage(), t);
}
/**
diff --git a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java
index 86ca6c2dfb..9e2c9a8183 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java
@@ -45,7 +45,7 @@
*
* The IntervalBinding is a formal GATK argument that bridges between a walker and
* the engine to construct intervals for traversal at runtime. The IntervalBinding can
- * either be a RodBinding, a string of one or more intervals, or a file with interval strings.
+ * either be a RodBinding, a string of one interval, or a file with interval strings.
* The GATK Engine takes care of initializing the binding when appropriate and determining intervals from it.
*
* Note that this class is immutable.
@@ -92,7 +92,10 @@ public List<GenomeLoc> getIntervals(GenomeAnalysisEngine toolkit) {
codec.readHeader(lineReader);
String line = lineReader.readLine();
while ( line != null ) {
- intervals.add(toolkit.getGenomeLocParser().createGenomeLoc(codec.decodeLoc(line)));
+ final Feature feature = codec.decodeLoc(line);
+ if ( feature == null )
+ throw new UserException.MalformedFile(featureIntervals.getSource(), "Couldn't parse line '" + line + "'");
+ intervals.add(toolkit.getGenomeLocParser().createGenomeLoc(feature));
line = lineReader.readLine();
}
} catch (IOException e) {
@@ -105,4 +108,8 @@ public List<GenomeLoc> getIntervals(GenomeAnalysisEngine toolkit) {
return intervals;
}
+
+ public String toString() {
+ return getSource();
+ }
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
index b8488dc9a1..b4d337d8df 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
@@ -30,7 +30,6 @@
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.commandline.CommandLineProgram;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
-import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager;
import org.broadinstitute.sting.gatk.walkers.Attribution;
import org.broadinstitute.sting.gatk.walkers.Walker;
@@ -97,13 +96,20 @@ public static void main(String[] argv) {
// lazy loaded, so they aren't caught elsewhere and made into User Exceptions
exitSystemWithUserError(e);
} catch (net.sf.samtools.SAMException e) {
- // Let's try this out and see how it is received by our users
+ checkForTooManyOpenFilesProblem(e.getMessage());
exitSystemWithSamError(e);
- } catch (Exception e) {
- exitSystemWithError(e);
+ } catch (Throwable t) {
+ checkForTooManyOpenFilesProblem(t.getMessage());
+ exitSystemWithError(t);
}
}
+ private static void checkForTooManyOpenFilesProblem(String message) {
+ // Special case the "Too many open files" error because it's a common User Error for which we know what to do
+ if ( message != null && message.indexOf("Too many open files") != -1 )
+ exitSystemWithUserError(new UserException.TooManyOpenFiles());
+ }
+
/**
* Creates the a short blurb about the GATK, copyright info, and where to get documentation.
*
diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
index f8e87aa586..f954d76501 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
@@ -35,6 +35,7 @@
import org.broadinstitute.sting.gatk.datasources.reads.*;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
+import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
import org.broadinstitute.sting.gatk.filters.FilterManager;
@@ -126,6 +127,11 @@ public void setIntervals( GenomeLocSortedSet intervals ) {
*/
private Collection<ReadFilter> filters;
+ /**
+ * Controls the allocation of threads between CPU vs IO.
+ */
+ private ThreadAllocation threadAllocation;
+
/**
* A currently hacky unique name for this GATK instance
*/
@@ -199,6 +205,9 @@ public Object execute() {
if (this.getArguments().nonDeterministicRandomSeed)
resetRandomGenerator(System.currentTimeMillis());
+ // Determine how the threads should be divided between CPU vs. IO.
+ determineThreadAllocation();
+
// Prepare the data for traversal.
initializeDataSources();
@@ -218,7 +227,7 @@ public Object execute() {
// create the output streams
initializeOutputStreams(microScheduler.getOutputTracker());
- ShardStrategy shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals);
+ Iterable<Shard> shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals);
// execute the microscheduler, storing the results
return microScheduler.execute(this.walker, shardStrategy);
@@ -266,6 +275,32 @@ public Collection<ReadFilter> createFilters() {
return Collections.unmodifiableList(filters);
}
+ /**
+ * Parse out the thread allocation from the given command-line argument.
+ */
+ private void determineThreadAllocation() {
+ Tags tags = parsingEngine.getTags(argCollection.numberOfThreads);
+
+ // TODO: Kill this complicated logic once Queue supports arbitrary tagged parameters.
+ Integer numCPUThreads = null;
+ if(tags.containsKey("cpu") && argCollection.numberOfCPUThreads != null)
+ throw new UserException("Number of CPU threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other.");
+ else if(tags.containsKey("cpu"))
+ numCPUThreads = Integer.parseInt(tags.getValue("cpu"));
+ else if(argCollection.numberOfCPUThreads != null)
+ numCPUThreads = argCollection.numberOfCPUThreads;
+
+ Integer numIOThreads = null;
+ if(tags.containsKey("io") && argCollection.numberOfIOThreads != null)
+ throw new UserException("Number of IO threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other.");
+ else if(tags.containsKey("io"))
+ numIOThreads = Integer.parseInt(tags.getValue("io"));
+ else if(argCollection.numberOfIOThreads != null)
+ numIOThreads = argCollection.numberOfIOThreads;
+
+ this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads,numCPUThreads,numIOThreads);
+ }
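+
+ // Example (illustrative, using the argument names declared in
+ // GATKArgumentCollection below): "-nt 8 -nct 6 -nit 2" requests 8 threads
+ // in total, with 6 allocated to CPU work and 2 to IO.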
+
/**
* Allow subclasses and others within this package direct access to the walker manager.
* @return The walker manager used by this package.
@@ -286,7 +321,7 @@ private MicroScheduler createMicroscheduler() {
throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given");
}
- return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),this.getArguments().numberOfThreads);
+ return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),threadAllocation);
}
protected DownsamplingMethod getDownsamplingMethod() {
@@ -397,103 +432,52 @@ protected void validateSuppliedIntervals() {
* @param intervals intervals
* @return the sharding strategy
*/
- protected ShardStrategy getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
+ protected Iterable<Shard> getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
ReferenceDataSource referenceDataSource = this.getReferenceDataSource();
- // Use monolithic sharding if no index is present. Monolithic sharding is always required for the original
- // sharding system; it's required with the new sharding system only for locus walkers.
- if(readsDataSource != null && !readsDataSource.hasIndex() ) {
- if(!exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM))
+
+ // If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition.
+ if(!readsDataSource.isEmpty()) {
+ if(!readsDataSource.hasIndex() && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM))
throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported.");
- if(intervals != null && !argCollection.allowIntervalsWithUnindexedBAM)
+ if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM)
throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available.");
- Shard.ShardType shardType;
if(walker instanceof LocusWalker) {
if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
- shardType = Shard.ShardType.LOCUS;
+ if(intervals == null)
+ return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer());
+ else
+ return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer());
}
- else if(walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker)
- shardType = Shard.ShardType.READ;
- else
- throw new UserException.CommandLineException("The GATK cannot currently process unindexed BAM files");
-
- List<GenomeLoc> region;
- if(intervals != null)
- region = intervals.toList();
- else {
- region = new ArrayList<GenomeLoc>();
- for(SAMSequenceRecord sequenceRecord: drivingDataSource.getSequenceDictionary().getSequences())
- region.add(getGenomeLocParser().createGenomeLoc(sequenceRecord.getSequenceName(),1,sequenceRecord.getSequenceLength()));
+ else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) {
+ // Apply special validation to read pair walkers.
+ if(walker instanceof ReadPairWalker) {
+ if(readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname)
+ throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker.");
+ if(intervals != null && !intervals.isEmpty())
+ throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
+ }
+
+ if(intervals == null)
+ return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer());
+ else
+ return readsDataSource.createShardIteratorOverIntervals(intervals,new ReadShardBalancer());
}
-
- return new MonolithicShardStrategy(getGenomeLocParser(), readsDataSource,shardType,region);
+ else
+ throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName());
+ }
+ else {
+ // TODO -- Determine what the ideal shard size should be here. Matt suggested that a multiple of 16K might work well
+ // TODO -- (because of how VCF indexes work), but my empirical experience has been simply that the larger the shard
+ // TODO -- size the more efficient the traversal (at least for RODWalkers). Keeping the previous values for now. [EB]
+ final int SHARD_SIZE = walker instanceof RodWalker ? 1000000 : 100000;
+ if(intervals == null)
+ return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE);
+ else
+ return referenceDataSource.createShardsOverIntervals(readsDataSource,intervals,SHARD_SIZE);
}
-
- ShardStrategy shardStrategy;
- ShardStrategyFactory.SHATTER_STRATEGY shardType;
-
- long SHARD_SIZE = 100000L;
-
- if (walker instanceof LocusWalker) {
- if (walker instanceof RodWalker) SHARD_SIZE *= 1000;
-
- if (intervals != null && !intervals.isEmpty()) {
- if (readsDataSource == null)
- throw new IllegalArgumentException("readsDataSource is null");
- if(!readsDataSource.isEmpty() && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
- throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
-
- shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
- referenceDataSource.getReference(),
- ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
- drivingDataSource.getSequenceDictionary(),
- SHARD_SIZE,
- getGenomeLocParser(),
- intervals);
- } else
- shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
- referenceDataSource.getReference(),
- ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
- drivingDataSource.getSequenceDictionary(),
- SHARD_SIZE,getGenomeLocParser());
- } else if (walker instanceof ReadWalker ||
- walker instanceof DuplicateWalker) {
- shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL;
-
- if (intervals != null && !intervals.isEmpty()) {
- shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
- referenceDataSource.getReference(),
- shardType,
- drivingDataSource.getSequenceDictionary(),
- SHARD_SIZE,
- getGenomeLocParser(),
- intervals);
- } else {
- shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
- referenceDataSource.getReference(),
- shardType,
- drivingDataSource.getSequenceDictionary(),
- SHARD_SIZE,
- getGenomeLocParser());
- }
- } else if (walker instanceof ReadPairWalker) {
- if(readsDataSource != null && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname)
- throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers can only walk over query name-sorted data. Please resort your input BAM file.");
- if(intervals != null && !intervals.isEmpty())
- throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
-
- shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
- referenceDataSource.getReference(),
- ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL,
- drivingDataSource.getSequenceDictionary(),
- SHARD_SIZE,
- getGenomeLocParser());
- } else
- throw new ReviewedStingException("Unable to support walker of type" + walker.getClass().getName());
-
- return shardStrategy;
}
protected boolean flashbackData() {
@@ -751,6 +735,8 @@ private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection
return new SAMDataSource(
samReaderIDs,
+ threadAllocation,
+ argCollection.numberOfBAMFileHandles,
genomeLocParser,
argCollection.useOriginalBaseQualities,
argCollection.strictnessLevel,
@@ -763,8 +749,7 @@ private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection
getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF,
getWalkerBAQQualityMode(),
refReader,
- argCollection.defaultBaseQualities,
- !argCollection.disableLowMemorySharding);
+ argCollection.defaultBaseQualities);
}
/**
diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java
index 93fa2d146c..daa8ff60db 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java
@@ -30,7 +30,6 @@ public class ReadProperties {
private Collection<SAMReaderID> readers = null;
private SAMFileHeader header = null;
private SAMFileReader.ValidationStringency validationStringency = SAMFileReader.ValidationStringency.STRICT;
- private Integer readBufferSize = null;
private DownsamplingMethod downsamplingMethod = null;
private ValidationExclusion exclusionList = null;
private Collection<SamRecordFilter> supplementalFilters = null;
@@ -91,14 +90,6 @@ public SAMFileReader.ValidationStringency getValidationStringency() {
return validationStringency;
}
- /**
- * Gets a list of the total number of reads that the sharding system should buffer per BAM file.
- * @return
- */
- public Integer getReadBufferSize() {
- return readBufferSize;
- }
-
/**
* Gets the method and parameters used when downsampling reads.
* @return Downsample fraction.
@@ -150,7 +141,6 @@ public byte defaultBaseQualities() {
* @param header sam file header.
* @param useOriginalBaseQualities True if original base qualities should be used.
* @param strictness Stringency of reads file parsing.
- * @param readBufferSize Number of reads to hold in memory per BAM.
* @param downsamplingMethod Method for downsampling reads at a given locus.
* @param exclusionList what safety checks we're willing to let slide
* @param supplementalFilters additional filters to dynamically apply.
public ReadProperties( Collection<SAMReaderID> samFiles,
SAMFileHeader header,
boolean useOriginalBaseQualities,
SAMFileReader.ValidationStringency strictness,
- Integer readBufferSize,
DownsamplingMethod downsamplingMethod,
ValidationExclusion exclusionList,
Collection<SamRecordFilter> supplementalFilters,
@@ -181,7 +170,6 @@ public ReadProperties( Collection samFiles,
byte defaultBaseQualities) {
this.readers = samFiles;
this.header = header;
- this.readBufferSize = readBufferSize;
this.validationStringency = strictness;
this.downsamplingMethod = downsamplingMethod == null ? DownsamplingMethod.NONE : downsamplingMethod;
this.exclusionList = exclusionList == null ? new ValidationExclusion() : exclusionList;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
index 8078a1ea42..08d2c1ad15 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
@@ -194,10 +194,25 @@ public static DownsamplingMethod getDefaultDownsamplingMethod() {
@Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false)
public ValidationExclusion.TYPE unsafe;
- @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis", required = false)
- public int numberOfThreads = 1;
+ /** How many threads should be allocated to this analysis. */
+ @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false)
+ public Integer numberOfThreads = 1;
- @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line", required = false)
+ /**
+ * The following two arguments (num_cpu_threads, num_io_threads) are TEMPORARY since Queue cannot currently support arbitrary tagged data types.
+ * TODO: Kill this when I can do a tagged integer in Queue.
+ */
+ @Argument(fullName="num_cpu_threads", shortName = "nct", doc="How many of the given threads should be allocated to the CPU", required = false)
+ @Hidden
+ public Integer numberOfCPUThreads = null;
+ @Argument(fullName="num_io_threads", shortName = "nit", doc="How many of the given threads should be allocated to IO", required = false)
+ @Hidden
+ public Integer numberOfIOThreads = null;
+
+ @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false)
+ public Integer numberOfBAMFileHandles = null;
+
+ @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line.", required = false)
public List readGroupBlackList = null;
// --------------------------------------------------------------------------------------------------------------
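Taken together, -nt sets the overall thread budget, -nct and -nit carve that budget between CPU-bound and IO-bound work, and -bfh caps the number of BAM file handles held open at once. An illustrative invocation (walker and file names hypothetical, not from this patch):

    java -jar GenomeAnalysisTK.jar -T PrintReads -R ref.fasta -I input.bam \
        -nt 4 -nct 3 -nit 1 -bfh 128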
@@ -292,9 +307,6 @@ public static DownsamplingMethod getDefaultDownsamplingMethod() {
@Hidden
public boolean allowIntervalsWithUnindexedBAM = false;
- @Argument(fullName="disable_experimental_low_memory_sharding",doc="Disable experimental low-memory sharding functionality",required=false)
- public boolean disableLowMemorySharding = false;
-
// --------------------------------------------------------------------------------------------------------------
//
// methods
@@ -365,7 +377,19 @@ public boolean equals(GATKArgumentCollection other) {
(other.downsampleCoverage != null && !other.downsampleCoverage.equals(this.downsampleCoverage))) {
return false;
}
- if (other.numberOfThreads != this.numberOfThreads) {
+ if (!other.numberOfThreads.equals(this.numberOfThreads)) {
+ return false;
+ }
+ if ((this.numberOfCPUThreads == null && other.numberOfCPUThreads != null) ||
+ (this.numberOfCPUThreads != null && !this.numberOfCPUThreads.equals(other.numberOfCPUThreads)) ) {
+ return false;
+ }
+ if ((this.numberOfIOThreads == null && other.numberOfIOThreads != null) ||
+ (this.numberOfIOThreads != null && !this.numberOfIOThreads.equals(other.numberOfIOThreads)) ) {
+ return false;
+ }
+ if ((other.numberOfBAMFileHandles == null && this.numberOfBAMFileHandles != null) ||
+ (other.numberOfBAMFileHandles != null && !other.numberOfBAMFileHandles.equals(this.numberOfBAMFileHandles))) {
return false;
}
if (other.intervalMerging != this.intervalMerging) {
@@ -389,9 +413,6 @@ public boolean equals(GATKArgumentCollection other) {
if (allowIntervalsWithUnindexedBAM != other.allowIntervalsWithUnindexedBAM)
return false;
- if (disableLowMemorySharding != other.disableLowMemorySharding)
- return false;
-
return true;
}
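The three pairwise null checks above follow one pattern; a small helper (hypothetical, shown only to make the intended semantics explicit) captures it:

    // Null-safe equality: true when both are null, or both non-null and equal.
    private static <T> boolean safeEquals(final T a, final T b) {
        return a == null ? b == null : a.equals(b);
    }
    // e.g.: if (!safeEquals(this.numberOfCPUThreads, other.numberOfCPUThreads)) return false;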
diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java
index 4e75f3ddbd..d589f90292 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java
@@ -131,7 +131,7 @@ public static Map splitContextByReadGroup(
}
}
- public static Map splitContextBySampleName(ReadBackedPileup pileup, String assumedSingleSample) {
+ public static Map splitContextBySampleName(ReadBackedPileup pileup) {
return splitContextBySampleName(new AlignmentContext(pileup.getLocation(), pileup));
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java
index c38b093343..54f8b44edc 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java
@@ -80,7 +80,7 @@ public RodLocusView( LocusShardDataProvider provider ) {
// grab the ROD iterator from the data source, and compute the first location in this shard, forwarding
// the iterator to immediately before it, so that it can be added to the merging iterator primed for
// next() to return the first real ROD in this shard
- LocationAwareSeekableRODIterator it = dataSource.seek(provider.getShard());
+ LocationAwareSeekableRODIterator it = dataSource.seek(provider.getLocus());
it.seekForward(genomeLocParser.createGenomeLoc(loc.getContig(), loc.getStart()-1));
states.add(new ReferenceOrderedDataState(dataSource,it));
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java
deleted file mode 100644
index de938e8453..0000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (c) 2011, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.datasources.reads;
-
-import org.broadinstitute.sting.utils.exceptions.StingException;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-import java.nio.channels.FileChannel;
-import java.util.Iterator;
-
-/**
- * Created by IntelliJ IDEA.
- * User: mhanna
- * Date: Feb 7, 2011
- * Time: 2:46:34 PM
- * To change this template use File | Settings | File Templates.
- */
-public class BAMBlockStartIterator implements Iterator<Long> {
- /**
- * How large is a BGZF header?
- */
- private static int BGZF_HEADER_SIZE = 18;
-
- /**
- * Where within the header does the BLOCKSIZE actually live?
- */
- private static int BLOCK_SIZE_HEADER_POSITION = BGZF_HEADER_SIZE - 2;
-
- private FileChannel bamInputChannel;
- private ByteBuffer headerByteBuffer;
-
- private long nextLocation = 0;
-
- public BAMBlockStartIterator(File bamFile) {
- try {
- FileInputStream bamInputStream = new FileInputStream(bamFile);
- bamInputChannel = bamInputStream.getChannel();
-
- headerByteBuffer = ByteBuffer.allocate(BGZF_HEADER_SIZE);
- headerByteBuffer.order(ByteOrder.LITTLE_ENDIAN);
-
- }
- catch(IOException ex) {
- throw new StingException("Could not open file",ex);
- }
- }
-
- public boolean hasNext() {
- return nextLocation != -1;
- }
-
- public Long next() {
- long currentLocation = nextLocation;
- advance();
- return currentLocation;
- }
-
- public void remove() {
- throw new UnsupportedOperationException("Cannot remove from a BAMBlockStartIterator");
- }
-
- private void advance() {
- int readStatus;
-
- headerByteBuffer.clear();
- try {
- readStatus = bamInputChannel.read(headerByteBuffer);
- }
- catch(IOException ex) {
- throw new StingException("Could not read header data",ex);
- }
-
- if(readStatus == -1) {
- nextLocation = -1;
- try {
- bamInputChannel.close();
- }
- catch(IOException ex) {
- throw new StingException("Could not close input file",ex);
- }
- return;
- }
-
- headerByteBuffer.position(BLOCK_SIZE_HEADER_POSITION);
- int blockSize = headerByteBuffer.getShort();
-
- try {
- bamInputChannel.position(bamInputChannel.position()+blockSize-BGZF_HEADER_SIZE+1);
- nextLocation = bamInputChannel.position();
- }
- catch(IOException ex) {
- throw new StingException("Could not reposition input stream",ex);
- }
- }
-
- public static void main(String argv[]) throws IOException {
- BAMBlockStartIterator blockStartIterator = new BAMBlockStartIterator(new File("/Users/mhanna/testdata/reads/MV1994.bam"));
- int i = 0;
- while(blockStartIterator.hasNext())
- System.out.printf("%d -> %d%n",i++,blockStartIterator.next());
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java
deleted file mode 100644
index 4d91fb45f4..0000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Copyright (c) 2011, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.datasources.reads;
-
-import net.sf.samtools.GATKBin;
-import net.sf.samtools.GATKChunk;
-import net.sf.samtools.LinearIndex;
-
-import java.util.*;
-
-/**
- * Represents the contents of a bam index file for one reference.
- * A BAM index (.bai) file contains information for all references in the bam file.
- * This class describes the data present in the index file for one of these references;
- * including the bins, chunks, and linear index.
- */
-class BAMIndexContent {
- /**
- * The reference sequence for the data currently loaded.
- */
- private final int mReferenceSequence;
-
- /**
- * A list of all bins in the above reference sequence.
- */
- private final BinList mBinList;
-
- /**
- * The linear index for the reference sequence above.
- */
- private final LinearIndex mLinearIndex;
-
-
- /**
- * @param referenceSequence Content corresponds to this reference.
- * @param bins Array of bins represented by this content, possibly sparse
- * @param numberOfBins Number of non-null bins
- * @param linearIndex Additional index used to optimize queries
- */
- BAMIndexContent(final int referenceSequence, final GATKBin[] bins, final int numberOfBins, final LinearIndex linearIndex) {
- this.mReferenceSequence = referenceSequence;
- this.mBinList = new BinList(bins, numberOfBins);
- this.mLinearIndex = linearIndex;
- }
-
- /**
- * Reference for this Content
- */
- public int getReferenceSequence() {
- return mReferenceSequence;
- }
-
- /**
- * Does this content have anything in this bin?
- */
- public boolean containsBin(final GATKBin bin) {
- return mBinList.getBin(bin.getBinNumber()) != null;
- }
-
- /**
- * @return iterable list of bins represented by this content
- */
- public BinList getBins() {
- return mBinList;
- }
-
- /**
- * @return the number of non-null bins represented by this content
- */
- int getNumberOfNonNullBins() {
- return mBinList.getNumberOfNonNullBins();
- }
-
- /**
- * @return all chunks associated with all bins in this content
- */
- public List<GATKChunk> getAllChunks() {
- List<GATKChunk> allChunks = new ArrayList<GATKChunk>();
- for (GATKBin b : mBinList)
- if (b.getChunkList() != null) {
- allChunks.addAll(Arrays.asList(b.getChunkList()));
- }
- return Collections.unmodifiableList(allChunks);
- }
-
- /**
- * @return the linear index represented by this content
- */
- public LinearIndex getLinearIndex() {
- return mLinearIndex;
- }
-
- /**
- * This class is used to encapsulate the list of Bins store in the BAMIndexContent
- * While it is currently represented as an array, we may decide to change it to an ArrayList or other structure
- */
- class BinList implements Iterable<GATKBin> {
-
- private final GATKBin[] mBinArray;
- public final int numberOfNonNullBins;
- public final int maxBinNumber; // invariant: maxBinNumber = mBinArray.length -1 since array is 0 based
-
- /**
- * @param binArray a sparse array representation of the bins. The index into the array is the bin number.
- * @param numberOfNonNullBins
- */
- BinList(GATKBin[] binArray, int numberOfNonNullBins) {
- this.mBinArray = binArray;
- this.numberOfNonNullBins = numberOfNonNullBins;
- this.maxBinNumber = mBinArray.length - 1;
- }
-
- GATKBin getBin(int binNumber) {
- if (binNumber > maxBinNumber) return null;
- return mBinArray[binNumber];
- }
-
- int getNumberOfNonNullBins() {
- return numberOfNonNullBins;
- }
-
- /**
- * Gets an iterator over all non-null bins.
- *
- * @return An iterator over all bins.
- */
- public Iterator<GATKBin> iterator() {
- return new BinIterator();
- }
-
- private class BinIterator implements Iterator<GATKBin> {
- /**
- * Stores the bin # of the Bin currently in use.
- */
- private int nextBin;
-
- public BinIterator() {
- nextBin = 0;
- }
-
- /**
- * Are there more bins in this set, waiting to be returned?
- *
- * @return True if more bins are remaining.
- */
- public boolean hasNext() {
- while (nextBin <= maxBinNumber) {
- if (getBin(nextBin) != null) return true;
- nextBin++;
- }
- return false;
- }
-
- /**
- * Gets the next bin in the provided BinList.
- *
- * @return the next available bin in the BinList.
- */
- public GATKBin next() {
- if (!hasNext())
- throw new NoSuchElementException("This BinIterator is currently empty");
- GATKBin result = getBin(nextBin);
- nextBin++;
- return result;
- }
-
- public void remove() {
- throw new UnsupportedOperationException("Unable to remove from a bin iterator");
- }
- }
- }
-
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMOverlap.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMOverlap.java
deleted file mode 100644
index 15a372ca67..0000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMOverlap.java
+++ /dev/null
@@ -1,29 +0,0 @@
-package org.broadinstitute.sting.gatk.datasources.reads;
-
-import net.sf.samtools.Bin;
-
-import java.util.HashMap;
-import java.util.Map;
-
-/**
- * Models a bin at which all BAM files in the merged input stream overlap.
- */
-class BAMOverlap {
- public final int start;
- public final int stop;
-
- private final Map<SAMReaderID,Bin> bins = new HashMap<SAMReaderID,Bin>();
-
- public BAMOverlap(final int start, final int stop) {
- this.start = start;
- this.stop = stop;
- }
-
- public void addBin(final SAMReaderID id, final Bin bin) {
- bins.put(id,bin);
- }
-
- public Bin getBin(final SAMReaderID id) {
- return bins.get(id);
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java
index 521bcd5a3d..762722fcd0 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java
@@ -84,21 +84,21 @@ public class BAMSchedule implements CloseableIterator {
/**
* Create a new BAM schedule based on the given index.
- * @param indexFiles Index files.
+ * @param dataSource The SAM data source to use.
* @param intervals List of intervals to schedule over.
*/
- public BAMSchedule(final Map<SAMReaderID,GATKBAMIndex> indexFiles, final List<GenomeLoc> intervals) {
+ public BAMSchedule(final SAMDataSource dataSource, final List<GenomeLoc> intervals) {
if(intervals.isEmpty())
throw new ReviewedStingException("Tried to write schedule for empty interval list.");
- referenceSequence = intervals.get(0).getContigIndex();
+ referenceSequence = dataSource.getHeader().getSequence(intervals.get(0).getContig()).getSequenceIndex();
createScheduleFile();
- readerIDs.addAll(indexFiles.keySet());
+ readerIDs.addAll(dataSource.getReaderIDs());
for(final SAMReaderID reader: readerIDs) {
- final GATKBAMIndex index = indexFiles.get(reader);
+ final GATKBAMIndex index = dataSource.getIndex(reader);
final GATKBAMIndexData indexData = index.readReferenceSequence(referenceSequence);
int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1);
@@ -237,7 +237,10 @@ private void advance() {
if(selectedIterators.isEmpty())
return;
+ // Create the target schedule entry
BAMScheduleEntry mergedScheduleEntry = new BAMScheduleEntry(currentStart,currentStop);
+
+ // For each schedule entry with data, load the data into the merged schedule.
for (int reader = selectedIterators.nextSetBit(0); reader >= 0; reader = selectedIterators.nextSetBit(reader+1)) {
PeekableIterator<BAMScheduleEntry> scheduleIterator = scheduleIterators.get(reader);
BAMScheduleEntry individualScheduleEntry = scheduleIterator.peek();
@@ -248,6 +251,11 @@ private void advance() {
scheduleIterator.next();
}
+ // For each schedule entry without data, add a blank entry.
+ for (int reader = selectedIterators.nextClearBit(0); reader < readerIDs.size(); reader = selectedIterators.nextClearBit(reader+1)) {
+ mergedScheduleEntry.addFileSpan(readerIDs.get(reader),new GATKBAMFileSpan());
+ }
+
nextScheduleEntry = mergedScheduleEntry;
}
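The merge above walks a java.util.BitSet two ways: nextSetBit visits readers that contributed data, while nextClearBit (bounded by the reader count) visits those that did not. The idiom in isolation, with invented values:

    BitSet selected = new BitSet();
    selected.set(0);
    selected.set(2);
    for (int i = selected.nextSetBit(0); i >= 0; i = selected.nextSetBit(i + 1))
        System.out.println("merge spans from reader " + i);   // readers 0, 2
    for (int i = selected.nextClearBit(0); i < 4; i = selected.nextClearBit(i + 1))
        System.out.println("blank span for reader " + i);     // readers 1, 3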
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
index 47eb55b288..dca4cc7710 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
@@ -27,7 +27,12 @@
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.GATKChunk;
+import net.sf.samtools.SAMFileHeader;
+import net.sf.samtools.SAMFileSpan;
+import net.sf.samtools.SAMSequenceDictionary;
+import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import java.util.*;
@@ -42,21 +47,86 @@ public class BAMScheduler implements Iterator {
private FilePointer nextFilePointer = null;
- private final GenomeLocSortedSet loci;
+ private GenomeLocSortedSet loci;
+ private PeekableIterator<GenomeLoc> locusIterator;
+ private GenomeLoc currentLocus;
+
+ public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary referenceSequenceDictionary, final GenomeLocParser parser) {
+ BAMScheduler scheduler = new BAMScheduler(dataSource);
+ GenomeLocSortedSet intervals = new GenomeLocSortedSet(parser);
+ for(SAMSequenceRecord sequence: referenceSequenceDictionary.getSequences()) {
+ // Match only on sequence name; trust startup validation to make sure all the sequences match.
+ if(dataSource.getHeader().getSequenceDictionary().getSequence(sequence.getSequenceName()) != null)
+ intervals.add(parser.createOverEntireContig(sequence.getSequenceName()));
+ }
+ scheduler.populateFilteredIntervalList(intervals);
+ return scheduler;
+ }
- private final PeekableIterator<GenomeLoc> locusIterator;
+ public static BAMScheduler createOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) {
+ BAMScheduler scheduler = new BAMScheduler(dataSource);
+ scheduler.populateUnfilteredIntervalList(parser);
+ return scheduler;
+ }
+
+ public static BAMScheduler createOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
+ BAMScheduler scheduler = new BAMScheduler(dataSource);
+ scheduler.populateFilteredIntervalList(loci);
+ return scheduler;
+ }
- private GenomeLoc currentLocus;
- public BAMScheduler(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
+ private BAMScheduler(final SAMDataSource dataSource) {
this.dataSource = dataSource;
- for(SAMReaderID reader: dataSource.getReaderIDs())
- indexFiles.put(reader,(GATKBAMIndex)dataSource.getIndex(reader));
+ for(SAMReaderID reader: dataSource.getReaderIDs()) {
+ GATKBAMIndex index = dataSource.getIndex(reader);
+ if(index != null)
+ indexFiles.put(reader,dataSource.getIndex(reader));
+ }
+ }
+
+ /**
+ * The consumer has asked for a bounded set of locations. Prepare an iterator over those locations.
+ * @param loci The list of locations to search and iterate over.
+ */
+ private void populateFilteredIntervalList(final GenomeLocSortedSet loci) {
this.loci = loci;
- locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
- if(locusIterator.hasNext())
- currentLocus = locusIterator.next();
- advance();
+ if(!indexFiles.isEmpty()) {
+ // If index data is available, start up the iterator.
+ locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
+ if(locusIterator.hasNext())
+ currentLocus = locusIterator.next();
+ advance();
+ }
+ else {
+ // Otherwise, seed the iterator with a single file pointer over the entire region.
+ nextFilePointer = generatePointerOverEntireFileset();
+ for(GenomeLoc locus: loci)
+ nextFilePointer.addLocation(locus);
+ locusIterator = new PeekableIterator<GenomeLoc>(Collections.<GenomeLoc>emptyList().iterator());
+ }
+ }
+
+ /**
+ * The consumer has provided null, meaning to iterate over all available data. Create a file pointer stretching
+ * from just before the start of the region to the end of the region.
+ */
+ private void populateUnfilteredIntervalList(final GenomeLocParser parser) {
+ this.loci = new GenomeLocSortedSet(parser);
+ locusIterator = new PeekableIterator<GenomeLoc>(Collections.<GenomeLoc>emptyList().iterator());
+ nextFilePointer = generatePointerOverEntireFileset();
+ }
+
+ /**
+ * Generate a file pointer that runs from the end of the BAM header to the end of the file.
+ * @return A file pointer over that region.
+ */
+ private FilePointer generatePointerOverEntireFileset() {
+ FilePointer filePointer = new FilePointer();
+ Map<SAMReaderID,GATKBAMFileSpan> currentPosition = dataSource.getCurrentPosition();
+ for(SAMReaderID reader: dataSource.getReaderIDs())
+ filePointer.addFileSpans(reader,createSpanToEndOfFile(currentPosition.get(reader).getGATKChunks().get(0).getChunkStart()));
+ return filePointer;
}
public boolean hasNext() {
@@ -67,7 +137,9 @@ public FilePointer next() {
if(!hasNext())
throw new NoSuchElementException("No next element available in interval sharder");
FilePointer currentFilePointer = nextFilePointer;
+ nextFilePointer = null;
advance();
+
return currentFilePointer;
}
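All three factory methods feed the same iterator contract. A sketch of the consumer side, assuming an already-initialized SAMDataSource and GenomeLocSortedSet (setup omitted; createOverAllReads and createOverMappedReads are used the same way):

    BAMScheduler scheduler = BAMScheduler.createOverIntervals(dataSource, intervals);
    while (scheduler.hasNext()) {
        FilePointer shard = scheduler.next();  // genomic locations paired with per-reader file spans
        // hand the shard to the traversal engine
    }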
@@ -79,13 +151,12 @@ private void advance() {
if(loci.isEmpty())
return;
- nextFilePointer = null;
while(nextFilePointer == null && currentLocus != null) {
// special case handling of the unmapped shard.
if(currentLocus == GenomeLoc.UNMAPPED) {
nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED);
for(SAMReaderID id: dataSource.getReaderIDs())
- nextFilePointer.addFileSpans(id,new GATKBAMFileSpan(new GATKChunk(indexFiles.get(id).getStartOfLastLinearBin(),Long.MAX_VALUE)));
+ nextFilePointer.addFileSpans(id,createSpanToEndOfFile(indexFiles.get(id).getStartOfLastLinearBin()));
currentLocus = null;
continue;
}
@@ -96,7 +167,7 @@ private void advance() {
int coveredRegionStop = Integer.MAX_VALUE;
GenomeLoc coveredRegion = null;
- BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(indexFiles,currentLocus);
+ BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(currentLocus);
// No overlapping data at all.
if(scheduleEntry != null) {
@@ -108,7 +179,6 @@ private void advance() {
}
else {
// Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty.
- //System.out.printf("Shard: index file = %s; reference sequence = %d; ",index.getIndexFile(),currentLocus.getContigIndex());
for(SAMReaderID reader: indexFiles.keySet())
nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan());
}
@@ -116,21 +186,13 @@ private void advance() {
// Early exit if no bins were found.
if(coveredRegion == null) {
- // for debugging only: maximum split is 16384.
- if(currentLocus.size() > 16384) {
- GenomeLoc[] splitContigs = currentLocus.split(currentLocus.getStart()+16384);
- nextFilePointer.addLocation(splitContigs[0]);
- currentLocus = splitContigs[1];
- }
- else {
- nextFilePointer.addLocation(currentLocus);
- currentLocus = locusIterator.hasNext() ? locusIterator.next() : null;
- }
+ nextFilePointer.addLocation(currentLocus);
+ currentLocus = locusIterator.hasNext() ? locusIterator.next() : null;
continue;
}
// Early exit if only part of the first interval was found.
if(currentLocus.startsBefore(coveredRegion)) {
- // for debugging only: maximum split is 16384.
int splitPoint = Math.min(coveredRegion.getStart()-currentLocus.getStart(),16384)+currentLocus.getStart();
GenomeLoc[] splitContigs = currentLocus.split(splitPoint);
nextFilePointer.addLocation(splitContigs[0]);
@@ -175,25 +237,30 @@ else if(locusIterator.hasNext())
/**
* Get the next overlapping tree of bins associated with the given BAM file.
- * @param indices BAM indices.
* @param currentLocus The actual locus for which to check overlap.
* @return The next schedule entry overlapping with the given list of loci.
*/
- private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map indices, final GenomeLoc currentLocus) {
+ private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final GenomeLoc currentLocus) {
+ // Make sure that we consult the BAM header to ensure that we're using the correct contig index for this contig name.
+ // This will ensure that if the two sets of contigs don't quite match (b36 male vs female ref, hg19 Epstein-Barr), then
+ // we'll be using the correct contig index for the BAMs.
+ // TODO: Warning: assumes all BAMs use the same sequence dictionary! Get around this with contig aliasing.
+ final int currentContigIndex = dataSource.getHeader().getSequence(currentLocus.getContig()).getSequenceIndex();
+
// Stale reference sequence or first invocation. (Re)create the binTreeIterator.
- if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentLocus.getContigIndex()) {
+ if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentContigIndex) {
if(bamScheduleIterator != null)
bamScheduleIterator.close();
- lastReferenceSequenceLoaded = currentLocus.getContigIndex();
+ lastReferenceSequenceLoaded = currentContigIndex;
// Naive algorithm: find all elements in current contig for proper schedule creation.
List<GenomeLoc> lociInContig = new LinkedList<GenomeLoc>();
for(GenomeLoc locus: loci) {
- if(locus.getContigIndex() == lastReferenceSequenceLoaded)
+ if(dataSource.getHeader().getSequence(locus.getContig()).getSequenceIndex() == lastReferenceSequenceLoaded)
lociInContig.add(locus);
}
- bamScheduleIterator = new PeekableIterator<BAMScheduleEntry>(new BAMSchedule(indices,lociInContig));
+ bamScheduleIterator = new PeekableIterator<BAMScheduleEntry>(new BAMSchedule(dataSource,lociInContig));
}
if(!bamScheduleIterator.hasNext())
@@ -209,4 +276,13 @@ private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map<SAMReaderID,GATKBAMIndex> indices, final GenomeLoc currentLocus) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java
new file mode 100644
+ private final Queue<SAMReaderPosition> inputQueue;
+
+ public BGZFBlockLoadingDispatcher(final int numThreads, final int numFileHandles) {
+ threadPool = Executors.newFixedThreadPool(numThreads);
+ fileHandleCache = new FileHandleCache(numFileHandles);
+ inputQueue = new LinkedList<SAMReaderPosition>();
+
+ threadPool.execute(new BlockLoader(this,fileHandleCache,true));
+ }
+
+ /**
+ * Initiates a request for a new block load.
+ * @param readerPosition Position at which to load.
+ */
+ void queueBlockLoad(final SAMReaderPosition readerPosition) {
+ synchronized(inputQueue) {
+ inputQueue.add(readerPosition);
+ inputQueue.notify();
+ }
+ }
+
+ /**
+ * Claims the next work request from the queue.
+ * @return The next work request, or null if none is available.
+ */
+ SAMReaderPosition claimNextWorkRequest() {
+ synchronized(inputQueue) {
+ while(inputQueue.isEmpty()) {
+ try {
+ inputQueue.wait();
+ }
+ catch(InterruptedException ex) {
+ throw new ReviewedStingException("Interrupt occurred waiting for next block reader work item");
+ }
+ }
+ return inputQueue.poll();
+ }
+ }
+}
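The dispatcher's inputQueue plus wait/notify is a hand-rolled single-consumer blocking queue. The same handoff could be expressed with java.util.concurrent (a sketch only, ignoring the InterruptedException handling that claimNextWorkRequest performs above):

    BlockingQueue<SAMReaderPosition> queue = new LinkedBlockingQueue<SAMReaderPosition>();
    queue.put(readerPosition);              // producer side, as in queueBlockLoad
    SAMReaderPosition next = queue.take();  // consumer side; blocks when empty, like claimNextWorkRequest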
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java
new file mode 100644
index 0000000000..e377f865df
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java
@@ -0,0 +1,436 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.datasources.reads;
+
+import net.sf.samtools.GATKBAMFileSpan;
+import net.sf.samtools.GATKChunk;
+import net.sf.samtools.util.BAMInputStream;
+import net.sf.samtools.util.BlockCompressedFilePointerUtil;
+import net.sf.samtools.util.BlockCompressedInputStream;
+import net.sf.samtools.util.RuntimeEOFException;
+import net.sf.samtools.util.SeekableStream;
+import org.broad.tribble.util.BlockCompressedStreamConstants;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.Arrays;
+import java.util.LinkedList;
+
+/**
+ * Presents decompressed blocks to the SAMFileReader.
+ */
+public class BlockInputStream extends SeekableStream implements BAMInputStream {
+ /**
+ * Mechanism for triggering block loads.
+ */
+ private final BGZFBlockLoadingDispatcher dispatcher;
+
+ /**
+ * The reader whose data is supplied by this input stream.
+ */
+ private final SAMReaderID reader;
+
+ /**
+ * Length of the input stream.
+ */
+ private final long length;
+
+ /**
+ * The latest error reported by an asynchronous block load.
+ */
+ private Throwable error;
+
+ /**
+ * Current position.
+ */
+ private SAMReaderPosition position;
+
+ /**
+ * A stream of compressed data blocks.
+ */
+ private final ByteBuffer buffer;
+
+ /**
+ * Offsets of the given blocks in the buffer.
+ */
+ private LinkedList<Integer> blockOffsets = new LinkedList<Integer>();
+
+ /**
+ * Source positions of the given blocks in the buffer.
+ */
+ private LinkedList<Long> blockPositions = new LinkedList<Long>();
+
+ /**
+ * Provides a lock to wait for more data to arrive.
+ */
+ private final Object lock = new Object();
+
+ /**
+ * An input stream to use when comparing data back to what it should look like.
+ */
+ private final BlockCompressedInputStream validatingInputStream;
+
+ /**
+ * Has the buffer been filled since last request?
+ */
+ private boolean bufferFilled = false;
+
+ /**
+ * Create a new block presenting input stream with a dedicated buffer.
+ * @param dispatcher the block loading messenger.
+ * @param reader the reader for which to load data.
+ * @param validate validates the contents read into the buffer against the contents of a Picard BlockCompressedInputStream.
+ */
+ BlockInputStream(final BGZFBlockLoadingDispatcher dispatcher, final SAMReaderID reader, final boolean validate) {
+ this.reader = reader;
+ this.length = reader.samFile.length();
+
+ buffer = ByteBuffer.wrap(new byte[64*1024]);
+ buffer.order(ByteOrder.LITTLE_ENDIAN);
+
+ // The state of the buffer assumes that the range of data written into the buffer appears in the range
+ // [position,limit), while extra capacity exists in the range [limit,capacity)
+ buffer.limit(0);
+
+ this.dispatcher = dispatcher;
+ // TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream.
+ this.position = new SAMReaderPosition(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE)));
+
+ try {
+ if(validate) {
+ System.out.printf("BlockInputStream %s: BGZF block validation mode activated%n",this);
+ validatingInputStream = new BlockCompressedInputStream(reader.samFile);
+ // A bug in ValidatingInputStream means that calling getFilePointer() immediately after initialization will result in an NPE.
+ // Poke the stream to start reading data.
+ validatingInputStream.available();
+ }
+ else
+ validatingInputStream = null;
+ }
+ catch(IOException ex) {
+ throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
+ }
+ }
+
+ public long length() {
+ return length;
+ }
+
+ public long getFilePointer() {
+ long filePointer;
+ synchronized(lock) {
+ if(buffer.remaining() > 0) {
+ // If there's data in the buffer, figure out from whence it came.
+ final long blockAddress = blockPositions.size() > 0 ? blockPositions.get(0) : 0;
+ final int blockOffset = buffer.position();
+ filePointer = blockAddress << 16 | blockOffset;
+ }
+ else {
+ // Otherwise, find the next position to load.
+ filePointer = position.getBlockAddress() << 16;
+ }
+ }
+
+ if(validatingInputStream != null && filePointer != validatingInputStream.getFilePointer())
+ throw new ReviewedStingException(String.format("Position of input stream is invalid; expected (block address, block offset) = (%d,%d), got (%d,%d)",
+ BlockCompressedFilePointerUtil.getBlockAddress(filePointer),BlockCompressedFilePointerUtil.getBlockOffset(filePointer),
+ BlockCompressedFilePointerUtil.getBlockAddress(validatingInputStream.getFilePointer()),BlockCompressedFilePointerUtil.getBlockOffset(validatingInputStream.getFilePointer())));
+
+ return filePointer;
+ }
+
+ public void seek(long target) {
+ // TODO: Validate the seek point.
+ //System.out.printf("Thread %s, BlockInputStream %s: seeking to block %d, offset %d%n",Thread.currentThread().getId(),this,BlockCompressedFilePointerUtil.getBlockAddress(target),BlockCompressedFilePointerUtil.getBlockOffset(target));
+ synchronized(lock) {
+ clearBuffers();
+ position.advancePosition(BlockCompressedFilePointerUtil.getBlockAddress(target));
+ waitForBufferFill();
+ buffer.position(BlockCompressedFilePointerUtil.getBlockOffset(target));
+
+ if(validatingInputStream != null) {
+ try {
+ validatingInputStream.seek(target);
+ }
+ catch(IOException ex) {
+ throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
+ }
+ }
+ }
+ }
+
+ private void clearBuffers() {
+ this.position.reset();
+
+ // Buffer semantics say that outside of a lock, buffer should always be prepared for reading.
+ // Indicate no data to be read.
+ buffer.clear();
+ buffer.limit(0);
+
+ blockOffsets.clear();
+ blockPositions.clear();
+ }
+
+ public boolean eof() {
+ synchronized(lock) {
+ // TODO: Handle multiple empty BGZF blocks at end of the file.
+ return position != null && position.getBlockAddress() >= length;
+ }
+ }
+
+ public void setCheckCrcs(final boolean check) {
+ // TODO: Implement
+ }
+
+ /**
+ * Submits a new access plan for the given dataset.
+ * @param position The next seek point for BAM data in this reader.
+ */
+ public void submitAccessPlan(final SAMReaderPosition position) {
+ //System.out.printf("Thread %s: submitting access plan for block at position: %d%n",Thread.currentThread().getId(),position.getBlockAddress());
+ synchronized(lock) {
+ // Assume that the access plan is going to tell us to start where we are and move forward.
+ // If this isn't the case, we'll soon receive a seek request and the buffer will be forced to reset.
+ if(this.position != null && position.getBlockAddress() < this.position.getBlockAddress())
+ position.advancePosition(this.position.getBlockAddress());
+ }
+ this.position = position;
+ }
+
+ private void compactBuffer() {
+ // Compact buffer to maximize storage space.
+ int bytesToRemove = 0;
+
+ // Look ahead to see if we can compact away the first block in the series.
+ while(blockOffsets.size() > 1 && buffer.position() < blockOffsets.get(1)) {
+ bytesToRemove += blockOffsets.remove();
+ blockPositions.remove();
+ }
+
+ // If we end up with an empty block at the end of the series, compact this as well.
+ if(buffer.remaining() == 0 && !blockOffsets.isEmpty() && buffer.position() >= blockOffsets.peek()) {
+ bytesToRemove += buffer.position();
+ blockOffsets.remove();
+ blockPositions.remove();
+ }
+
+ int finalBufferStart = buffer.position() - bytesToRemove;
+ int finalBufferSize = buffer.remaining();
+
+ buffer.position(bytesToRemove);
+ buffer.compact();
+
+ buffer.position(finalBufferStart);
+ buffer.limit(finalBufferStart+finalBufferSize);
+ }
+
+ /**
+ * Push contents of incomingBuffer into the end of this buffer.
+ * MUST be called from a thread that is NOT the reader thread.
+ * @param incomingBuffer The data being pushed into this input stream.
+ * @param position target position for the data.
+ */
+ public void copyIntoBuffer(final ByteBuffer incomingBuffer, final SAMReaderPosition position, final long filePosition) {
+ synchronized(lock) {
+ try {
+ compactBuffer();
+ // Open up the buffer for more reading.
+ buffer.limit(buffer.capacity());
+
+ // Advance the position to take the most recent read into account.
+ long lastReadPosition = position.getBlockAddress();
+
+ byte[] validBytes = null;
+ if(validatingInputStream != null) {
+ validBytes = new byte[incomingBuffer.remaining()];
+
+ byte[] currentBytes = new byte[incomingBuffer.remaining()];
+ int pos = incomingBuffer.position();
+ int lim = incomingBuffer.limit();
+ incomingBuffer.get(currentBytes);
+
+ incomingBuffer.limit(lim);
+ incomingBuffer.position(pos);
+
+ long currentFilePointer = validatingInputStream.getFilePointer();
+ validatingInputStream.seek(lastReadPosition << 16);
+ validatingInputStream.read(validBytes);
+ validatingInputStream.seek(currentFilePointer);
+
+ if(!Arrays.equals(validBytes,currentBytes))
+ throw new ReviewedStingException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this));
+ }
+
+ this.position = position;
+ position.advancePosition(filePosition);
+
+ if(buffer.remaining() < incomingBuffer.remaining()) {
+ //System.out.printf("Thread %s: waiting for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n",Thread.currentThread().getId(),buffer.remaining(),incomingBuffer.remaining());
+ lock.wait();
+ //System.out.printf("Thread %s: waited for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n", Thread.currentThread().getId(), buffer.remaining(), incomingBuffer.remaining());
+ }
+
+ // Queue list of block offsets / block positions.
+ blockOffsets.add(buffer.position());
+ blockPositions.add(lastReadPosition);
+
+ buffer.put(incomingBuffer);
+
+ // Set up the buffer for reading.
+ buffer.flip();
+ bufferFilled = true;
+
+ lock.notify();
+ }
+ catch(Exception ex) {
+ reportException(ex);
+ lock.notify();
+ }
+ }
+ }
+
+ void reportException(Throwable t) {
+ synchronized(lock) {
+ this.error = t;
+ lock.notify();
+ }
+ }
+
+ private void checkForErrors() {
+ synchronized(lock) {
+ if(error != null) {
+ ReviewedStingException toThrow = new ReviewedStingException(String.format("Thread %s, BlockInputStream %s: Unable to retrieve BAM data from disk",Thread.currentThread().getId(),this),error);
+ toThrow.setStackTrace(error.getStackTrace());
+ throw toThrow;
+ }
+ }
+ }
+
+ /**
+ * Reads the next byte of data from the input stream.
+ * @return Next byte of data, from 0->255, as an int.
+ */
+ @Override
+ public int read() {
+ byte[] singleByte = new byte[1];
+ read(singleByte);
+ return singleByte[0] & 0xFF;
+ }
+
+ /**
+ * Fills the given byte array to the extent possible.
+ * @param bytes byte array to be filled.
+ * @return The number of bytes actually read.
+ */
+ @Override
+ public int read(byte[] bytes) {
+ return read(bytes,0,bytes.length);
+ }
+
+ @Override
+ public int read(byte[] bytes, final int offset, final int length) {
+ int remaining = length;
+ synchronized(lock) {
+ while(remaining > 0) {
+ // Check for error conditions during last read.
+ checkForErrors();
+
+ // If completely out of space, queue up another buffer fill.
+ waitForBufferFill();
+
+ // Couldn't manage to load any data at all; abort and return what's available.
+ if(buffer.remaining() == 0)
+ break;
+
+ int numBytesToCopy = Math.min(buffer.remaining(),remaining);
+ buffer.get(bytes,length-remaining+offset,numBytesToCopy);
+ remaining -= numBytesToCopy;
+
+ //if(remaining > 0)
+ // System.out.printf("Thread %s: read the first %d bytes of a %d byte request%n",Thread.currentThread().getId(),length-remaining,length);
+ // TODO: Assert that we don't copy across a block boundary
+ }
+
+ // Notify any waiting threads that some of the contents of the buffer were removed.
+ if(length-remaining > 0)
+ lock.notify();
+ }
+
+ if(validatingInputStream != null) {
+ byte[] validBytes = new byte[length];
+ try {
+ validatingInputStream.read(validBytes,offset,length);
+ for(int i = offset; i < offset+length; i++) {
+ if(bytes[i] != validBytes[i]) {
+ System.out.printf("Thread %s: preparing to throw an exception because contents don't match%n",Thread.currentThread().getId());
+ throw new ReviewedStingException(String.format("Thread %s: blockInputStream %s attempting to return wrong set of bytes; mismatch at offset %d",Thread.currentThread().getId(),this,i));
+ }
+ }
+ }
+ catch(IOException ex) {
+ throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
+ }
+ }
+
+ return length - remaining;
+ }
+
+ public void close() {
+ if(validatingInputStream != null) {
+ try {
+ validatingInputStream.close();
+ }
+ catch(IOException ex) {
+ throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
+ }
+ }
+ }
+
+ public String getSource() {
+ return reader.getSamFilePath();
+ }
+
+ private void waitForBufferFill() {
+ synchronized(lock) {
+ bufferFilled = false;
+ if(buffer.remaining() == 0 && !eof()) {
+ //System.out.printf("Thread %s is waiting for a buffer fill from position %d to buffer %s%n",Thread.currentThread().getId(),position.getBlockAddress(),this);
+ dispatcher.queueBlockLoad(position);
+ try {
+ lock.wait();
+ }
+ catch(InterruptedException ex) {
+ // TODO: handle me.
+ throw new ReviewedStingException("Interrupt occurred waiting for buffer to fill",ex);
+ }
+
+ if(bufferFilled && buffer.remaining() == 0)
+ throw new RuntimeEOFException("No more data left in InputStream");
+ }
+ }
+ }
+}
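Two notes on the mechanics above. First, getFilePointer and seek use the BGZF "virtual file pointer" convention: one long packs the compressed block address into the upper 48 bits and the uncompressed offset within the block into the lower 16. A worked example of the arithmetic (values invented):

    long blockAddress   = 65536L;                            // compressed offset of a block
    int  blockOffset    = 100;                               // uncompressed offset within it
    long virtualPointer = blockAddress << 16 | blockOffset;  // 4294967396
    long recoveredAddr  = virtualPointer >> 16;              // 65536, cf. BlockCompressedFilePointerUtil.getBlockAddress
    int  recoveredOff   = (int) (virtualPointer & 0xFFFF);   // 100,   cf. BlockCompressedFilePointerUtil.getBlockOffset

Second, the constructor's buffer invariant (readable bytes always in [position,limit) outside the lock) is the standard NIO fill/flip cycle, shown here in isolation:

    ByteBuffer b = ByteBuffer.allocate(16);
    b.limit(0);                 // read-ready but empty, as right after construction
    b.limit(b.capacity());      // producer: expose spare capacity...
    b.put((byte) 42);           // ...append incoming block data...
    b.flip();                   // ...and return to read-ready: [0,1) is now readable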
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java
new file mode 100644
index 0000000000..ab42998026
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.datasources.reads;
+
+import org.broad.tribble.util.BlockCompressedStreamConstants;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.channels.FileChannel;
+import java.util.zip.DataFormatException;
+import java.util.zip.Inflater;
+
+/**
+ * An engine for loading blocks.
+ */
+class BlockLoader implements Runnable {
+ /**
+ * Coordinates the input queue.
+ */
+ private BGZFBlockLoadingDispatcher dispatcher;
+
+ /**
+ * A cache from which to retrieve open file handles.
+ */
+ private final FileHandleCache fileHandleCache;
+
+ /**
+ * Whether asynchronous decompression should happen.
+ */
+ private final boolean decompress;
+
+ /**
+ * A direct input buffer for incoming data from disk.
+ */
+ private final ByteBuffer inputBuffer;
+
+ public BlockLoader(final BGZFBlockLoadingDispatcher dispatcher, final FileHandleCache fileHandleCache, final boolean decompress) {
+ this.dispatcher = dispatcher;
+ this.fileHandleCache = fileHandleCache;
+ this.decompress = decompress;
+
+ this.inputBuffer = ByteBuffer.allocateDirect(64*1024 + BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
+ inputBuffer.order(ByteOrder.LITTLE_ENDIAN);
+ }
+
+ public void run() {
+ for(;;) {
+ SAMReaderPosition readerPosition = null;
+ try {
+ readerPosition = dispatcher.claimNextWorkRequest();
+ FileInputStream inputStream = fileHandleCache.claimFileInputStream(readerPosition.getReader());
+
+ long blockAddress = readerPosition.getBlockAddress();
+ //System.out.printf("Thread %s: BlockLoader: copying bytes from %s at position %d into %s%n",Thread.currentThread().getId(),inputStream,blockAddress,readerPosition.getInputStream());
+
+ ByteBuffer compressedBlock = readBGZFBlock(inputStream,readerPosition.getBlockAddress());
+ long nextBlockAddress = position(inputStream);
+ fileHandleCache.releaseFileInputStream(readerPosition.getReader(),inputStream);
+
+ ByteBuffer block = decompress ? decompressBGZFBlock(compressedBlock) : compressedBlock;
+ int bytesCopied = block.remaining();
+
+ BlockInputStream bamInputStream = readerPosition.getInputStream();
+ bamInputStream.copyIntoBuffer(block,readerPosition,nextBlockAddress);
+
+ //System.out.printf("Thread %s: BlockLoader: copied %d bytes from %s at position %d into %s%n",Thread.currentThread().getId(),bytesCopied,inputStream,blockAddress,readerPosition.getInputStream());
+ }
+ catch(Throwable error) {
+ if(readerPosition != null && readerPosition.getInputStream() != null)
+ readerPosition.getInputStream().reportException(error);
+ }
+ }
+
+ }
+
+ private ByteBuffer readBGZFBlock(final FileInputStream inputStream, final long blockAddress) throws IOException {
+ FileChannel channel = inputStream.getChannel();
+
+ // Read the block header
+ channel.position(blockAddress);
+
+ int uncompressedDataSize = 0;
+ int bufferSize = 0;
+
+ do {
+ inputBuffer.clear();
+ inputBuffer.limit(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
+ channel.read(inputBuffer);
+
+ // Read out the size of the full BGZF block as a two-byte short, then mask it
+ // into an int to recover the unsigned 16-bit value.
+ inputBuffer.flip();
+ if(inputBuffer.remaining() != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH)
+ throw new ReviewedStingException("BUG: unable to read a the complete block header in one pass.");
+
+ // Verify that the file was read at a valid point.
+ if(unpackUByte8(inputBuffer,0) != BlockCompressedStreamConstants.GZIP_ID1 ||
+ unpackUByte8(inputBuffer,1) != BlockCompressedStreamConstants.GZIP_ID2 ||
+ unpackUByte8(inputBuffer,3) != BlockCompressedStreamConstants.GZIP_FLG ||
+ unpackUInt16(inputBuffer,10) != BlockCompressedStreamConstants.GZIP_XLEN ||
+ unpackUByte8(inputBuffer,12) != BlockCompressedStreamConstants.BGZF_ID1 ||
+ unpackUByte8(inputBuffer,13) != BlockCompressedStreamConstants.BGZF_ID2) {
+ throw new ReviewedStingException("BUG: Started reading compressed block at incorrect position");
+ }
+
+ inputBuffer.position(BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET);
+ bufferSize = unpackUInt16(inputBuffer,BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET)+1;
+
+ // Adjust buffer limits and finish reading the block. Also read the next header, just in case there's a 0-byte block.
+ inputBuffer.limit(bufferSize);
+ inputBuffer.position(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
+ channel.read(inputBuffer);
+
+ // Check the uncompressed length. If 0 and not at EOF, we'll want to check the next block.
+ uncompressedDataSize = inputBuffer.getInt(inputBuffer.limit()-4);
+ //System.out.printf("Uncompressed block size of the current block (at position %d) is %d%n",channel.position()-inputBuffer.limit(),uncompressedDataSize);
+ }
+ while(uncompressedDataSize == 0 && channel.position() < channel.size());
+
+ // Prepare the buffer for reading.
+ inputBuffer.flip();
+
+ return inputBuffer;
+ }
+
+ private ByteBuffer decompressBGZFBlock(final ByteBuffer bgzfBlock) throws DataFormatException {
+ final int compressedBufferSize = bgzfBlock.remaining();
+
+ // Determine the uncompressed buffer size (ISIZE, stored in the last four bytes of the block).
+ bgzfBlock.position(bgzfBlock.limit()-4);
+ int uncompressedBufferSize = bgzfBlock.getInt();
+ byte[] uncompressedContent = new byte[uncompressedBufferSize];
+
+ // Bound the CDATA section of the buffer.
+ bgzfBlock.limit(compressedBufferSize-BlockCompressedStreamConstants.BLOCK_FOOTER_LENGTH);
+ bgzfBlock.position(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
+ byte[] compressedContent = new byte[bgzfBlock.remaining()];
+ ByteBuffer.wrap(compressedContent).put(bgzfBlock);
+
+ // Decompress the buffer.
+ final Inflater inflater = new Inflater(true);
+ inflater.setInput(compressedContent);
+ int bytesUncompressed = inflater.inflate(uncompressedContent);
+ if(bytesUncompressed != uncompressedBufferSize)
+ throw new ReviewedStingException("Error decompressing block");
+
+ return ByteBuffer.wrap(uncompressedContent);
+ }
+
+ private long position(final FileInputStream inputStream) throws IOException {
+ return inputStream.getChannel().position();
+ }
+
+ private int unpackUByte8(final ByteBuffer buffer,final int position) {
+ return buffer.get(position) & 0xFF;
+ }
+
+ private int unpackUInt16(final ByteBuffer buffer,final int position) {
+ // Read the two-byte short at the given position, then mask it into an int to
+ // recover the unsigned 16-bit value.
+ return buffer.getShort(position) & 0xFFFF;
+ }
+}
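The header validation and size arithmetic above follow the BGZF layout: gzip magic at offsets 0-1, FLG at 3, XLEN at 10, the BGZF extra-field IDs at 12-13, and BSIZE, the total block length minus one, as an unsigned little-endian 16-bit value at offset 16. A worked example of the unsigned unpacking that unpackUInt16 performs:

    short raw       = (short) 0x1FFF;  // BSIZE as read by getShort(position)
    int bsize       = raw & 0xFFFF;    // 8191: mask to recover the unsigned value
    int blockLength = bsize + 1;       // 8192 bytes total, hence unpackUInt16(...) + 1 above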
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java
new file mode 100644
index 0000000000..29de6eb370
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FileHandleCache.java
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.datasources.reads;
+
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.StingException;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Queue;
+
+/**
+ * Caches frequently used file handles. Right now, caches only a single file handle.
+ * TODO: Generalize to support arbitrary file handle caches.
+ */
+public class FileHandleCache {
+ /**
+ * The underlying data structure storing file handles.
+ */
+ private final FileHandleStorage fileHandleStorage;
+
+ /**
+ * How many file handles should be kept open at once.
+ */
+ private final int cacheSize;
+
+ /**
+ * A uniquifier: assign a unique ID to every instance of a file handle.
+ */
+ private final Map<SAMReaderID,Integer> keyCounter = new HashMap<SAMReaderID,Integer>();
+
+ /**
+ * A shared lock, private so that outside users cannot notify it.
+ */
+ private final Object lock = new Object();
+
+ /**
+ * Indicates how many file handles are outstanding at this point.
+ */
+ private int numOutstandingFileHandles = 0;
+
+ /**
+ * Create a new file handle cache of the given cache size.
+ * @param cacheSize how many readers to hold open at once.
+ */
+ public FileHandleCache(final int cacheSize) {
+ this.cacheSize = cacheSize;
+ fileHandleStorage = new FileHandleStorage();
+ }
+
+ /**
+ * Retrieves or opens a file handle for the given reader ID.
+ * @param key The reader ID used as the cache key.
+ * @return A file input stream from the cache, if available, or otherwise newly opened.
+ */
+ public FileInputStream claimFileInputStream(final SAMReaderID key) {
+ synchronized(lock) {
+ FileInputStream inputStream = findExistingEntry(key);
+ if(inputStream == null) {
+ try {
+ // If the cache is maxed out, wait for another file handle to emerge.
+ if(numOutstandingFileHandles >= cacheSize)
+ lock.wait();
+ }
+ catch(InterruptedException ex) {
+ throw new ReviewedStingException("Interrupted while waiting for a file handle");
+ }
+ inputStream = openInputStream(key);
+ }
+ numOutstandingFileHandles++;
+
+ //System.out.printf("Handing input stream %s to thread %s%n",inputStream,Thread.currentThread().getId());
+ return inputStream;
+ }
+ }
+
+ /**
+ * Releases the current reader and returns it to the cache.
+ * @param key The reader.
+ * @param inputStream The stream being used.
+ */
+ public void releaseFileInputStream(final SAMReaderID key, final FileInputStream inputStream) {
+ synchronized(lock) {
+ numOutstandingFileHandles--;
+ UniqueKey newID = allocateKey(key);
+ fileHandleStorage.put(newID,inputStream);
+ // Let any listeners know that another file handle has become available.
+ lock.notify();
+ }
+ }
+
+ /**
+ * Finds an existing entry in the storage mechanism.
+ * @param key Reader.
+ * @return A cached stream, if available; otherwise, null.
+ */
+ private FileInputStream findExistingEntry(final SAMReaderID key) {
+ int existingHandles = getMostRecentUniquifier(key);
+
+ // See if any of the keys currently exist in the repository.
+ for(int i = 0; i <= existingHandles; i++) {
+ UniqueKey uniqueKey = new UniqueKey(key,i);
+ if(fileHandleStorage.containsKey(uniqueKey))
+ return fileHandleStorage.remove(uniqueKey);
+ }
+
+ return null;
+ }
+
+ /**
+ * Gets the most recent uniquifier used for the given reader.
+ * @param reader Reader for which to determine uniqueness.
+ * @return The most recent uniquifier assigned to the reader, or -1 if none exists.
+ */
+ private int getMostRecentUniquifier(final SAMReaderID reader) {
+ if(keyCounter.containsKey(reader))
+ return keyCounter.get(reader);
+ else return -1;
+ }
+
+ private UniqueKey allocateKey(final SAMReaderID reader) {
+ int uniquifier = getMostRecentUniquifier(reader)+1;
+ keyCounter.put(reader,uniquifier);
+ return new UniqueKey(reader,uniquifier);
+ }
+
+ private FileInputStream openInputStream(final SAMReaderID reader) {
+ try {
+ return new FileInputStream(reader.getSamFilePath());
+ }
+ catch(IOException ex) {
+ throw new StingException("Unable to open input file");
+ }
+ }
+
+ private void closeInputStream(final FileInputStream inputStream) {
+ try {
+ inputStream.close();
+ }
+ catch(IOException ex) {
+ throw new StingException("Unable to open input file");
+ }
+ }
+
+ /**
+ * Actually contains the file handles, purging them as they get too old.
+ */
+ private class FileHandleStorage extends LinkedHashMap<UniqueKey,FileInputStream> {
+ /**
+ * Decide whether to remove the eldest entry from the cache.
+ * @param entry Entry to consider removing.
+ * @return True if the cache size has been exceeded and the entry should be evicted. False otherwise.
+ */
+ @Override
+ protected boolean removeEldestEntry(Map.Entry<UniqueKey,FileInputStream> entry) {
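+ // LinkedHashMap invokes this hook after every put(); returning true evicts the eldest entry.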
+ synchronized (lock) {
+ if(size() > cacheSize) {
+ keyCounter.put(entry.getKey().key,keyCounter.get(entry.getKey().key)-1);
+ closeInputStream(entry.getValue());
+
+ return true;
+ }
+ }
+ return false;
+ }
+ }
+
+ /**
+ * Uniquifies a key by adding a numerical uniquifier.
+ */
+ private class UniqueKey {
+ /**
+ * The file handle's key.
+ */
+ private final SAMReaderID key;
+
+ /**
+ * A uniquifier, so that multiple of the same reader can exist in the cache.
+ */
+ private final int uniqueID;
+
+ public UniqueKey(final SAMReaderID reader, final int uniqueID) {
+ this.key = reader;
+ this.uniqueID = uniqueID;
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if(!(other instanceof UniqueKey))
+ return false;
+ UniqueKey otherUniqueKey = (UniqueKey)other;
+ return key.equals(otherUniqueKey.key) && this.uniqueID == otherUniqueKey.uniqueID;
+ }
+
+ @Override
+ public int hashCode() {
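+ // uniqueID is omitted here: equal keys must share a hash code, and same-reader
+ // entries merely collide into one bucket, which is harmless for correctness.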
+ return key.hashCode();
+ }
+ }
+
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java
index e4141f61c9..df7827250e 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java
@@ -29,6 +29,7 @@
import net.sf.samtools.SAMFileSpan;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
@@ -40,28 +41,25 @@
*/
public class FilePointer {
protected final SortedMap<SAMReaderID,SAMFileSpan> fileSpans = new TreeMap<SAMReaderID,SAMFileSpan>();
- protected final BAMOverlap overlap;
- protected final List<GenomeLoc> locations;
+ protected final List<GenomeLoc> locations = new ArrayList<GenomeLoc>();
/**
* Does this file pointer point into an unmapped region?
*/
protected final boolean isRegionUnmapped;
- public FilePointer() {
- this((BAMOverlap)null);
- }
-
- public FilePointer(final GenomeLoc location) {
- this.overlap = null;
- this.locations = Collections.singletonList(location);
- this.isRegionUnmapped = GenomeLoc.isUnmapped(location);
- }
-
- public FilePointer(final BAMOverlap overlap) {
- this.overlap = overlap;
- this.locations = new ArrayList<GenomeLoc>();
- this.isRegionUnmapped = false;
+ public FilePointer(final GenomeLoc... locations) {
+ this.locations.addAll(Arrays.asList(locations));
+ boolean foundMapped = false, foundUnmapped = false;
+ for(GenomeLoc location: locations) {
+ if(GenomeLoc.isUnmapped(location))
+ foundUnmapped = true;
+ else
+ foundMapped = true;
+ }
+ if(foundMapped && foundUnmapped)
+ throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped.");
+ this.isRegionUnmapped = foundUnmapped;
}
/**
@@ -217,4 +215,20 @@ private void mergeElementsInto(final FilePointer combined, Iterator<Map.Entry<SAMReaderID,SAMFileSpan>> sourceIterator) {
+
+ @Override
+ public String toString() {
+ StringBuilder builder = new StringBuilder();
+ for(Map.Entry<SAMReaderID,SAMFileSpan> entry: fileSpans.entrySet()) {
+ builder.append(entry.getKey());
+ builder.append("= {");
+ builder.append(entry.getValue());
+ builder.append("}");
+ }
+ return builder.toString();
+ }
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java
index 4ddf28dced..f78693c27f 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java
@@ -25,419 +25,58 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
-import net.sf.samtools.AbstractBAMFileIndex;
-import net.sf.samtools.Bin;
-import net.sf.samtools.BrowseableBAMIndex;
-import net.sf.samtools.SAMSequenceRecord;
-import org.apache.log4j.Logger;
-import org.broadinstitute.sting.utils.GenomeLoc;
+import net.sf.samtools.SAMSequenceDictionary;
+import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
-import org.broadinstitute.sting.utils.collections.Pair;
-import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
-import java.util.*;
+import java.util.Iterator;
/**
- * Shard intervals based on position within the BAM file.
- *
- * @author mhanna
- * @version 0.1
+ * Handles the process of aggregating BAM intervals into individual shards.
+ * TODO: The task performed by IntervalSharder is now better performed by LocusShardBalancer. Merge BAMScheduler and IntervalSharder.
*/
-public class IntervalSharder {
- private static Logger logger = Logger.getLogger(IntervalSharder.class);
-
- public static Iterator<FilePointer> shardIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
- return new IntervalSharder.FilePointerIterator(dataSource,loci);
- }
-
+public class IntervalSharder implements Iterator<FilePointer> {
/**
- * A lazy-loading iterator over file pointers.
+ * The iterator actually laying out the data for BAM scheduling.
*/
- private static class FilePointerIterator implements Iterator<FilePointer> {
- final SAMDataSource dataSource;
- final GenomeLocSortedSet loci;
- final PeekableIterator<GenomeLoc> locusIterator;
- final Queue<FilePointer> cachedFilePointers = new LinkedList<FilePointer>();
-
- public FilePointerIterator(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
- this.dataSource = dataSource;
- this.loci = loci;
- locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
- advance();
- }
-
- public boolean hasNext() {
- return !cachedFilePointers.isEmpty();
- }
-
- public FilePointer next() {
- if(!hasNext())
- throw new NoSuchElementException("FilePointerIterator iteration is complete");
- FilePointer filePointer = cachedFilePointers.remove();
- if(cachedFilePointers.isEmpty())
- advance();
- return filePointer;
- }
-
- public void remove() {
- throw new UnsupportedOperationException("Cannot remove from a FilePointerIterator");
- }
-
- private void advance() {
- GenomeLocSortedSet nextBatch = new GenomeLocSortedSet(loci.getGenomeLocParser());
- String contig = null;
-
- // If the next section of the BAM to be processed is unmapped, handle this region separately.
- while(locusIterator.hasNext() && nextBatch.isEmpty()) {
- contig = null;
- while(locusIterator.hasNext() && (contig == null || (!GenomeLoc.isUnmapped(locusIterator.peek()) && locusIterator.peek().getContig().equals(contig)))) {
- GenomeLoc nextLocus = locusIterator.next();
- contig = nextLocus.getContig();
- nextBatch.add(nextLocus);
- }
- }
-
- if(nextBatch.size() > 0) {
- cachedFilePointers.addAll(shardIntervalsOnContig(dataSource,contig,nextBatch));
- }
- }
- }
+ private final PeekableIterator<FilePointer> wrappedIterator;
/**
- * Merge / split intervals based on an awareness of the structure of the BAM file.
- * @param dataSource
- * @param contig Contig against which to align the intervals. If null, create a file pointer across unmapped reads.
- * @param loci
- * @return
+ * The parser, for interval manipulation.
*/
- private static List<FilePointer> shardIntervalsOnContig(final SAMDataSource dataSource, final String contig, final GenomeLocSortedSet loci) {
- // If the contig is null, eliminate the chopping process and build out a file pointer consisting of the unmapped region of all BAMs.
- if(contig == null) {
- FilePointer filePointer = new FilePointer(GenomeLoc.UNMAPPED);
- for(SAMReaderID id: dataSource.getReaderIDs())
- filePointer.addFileSpans(id,null);
- return Collections.singletonList(filePointer);
- }
-
- // Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin.
- List<FilePointer> filePointers = new ArrayList<FilePointer>();
- FilePointer lastFilePointer = null;
- BAMOverlap lastBAMOverlap = null;
-
- Map<SAMReaderID,BrowseableBAMIndex> readerToIndexMap = new HashMap<SAMReaderID,BrowseableBAMIndex>();
- IntervalSharder.BinMergingIterator binMerger = new IntervalSharder.BinMergingIterator();
- for(SAMReaderID id: dataSource.getReaderIDs()) {
- final SAMSequenceRecord referenceSequence = dataSource.getHeader(id).getSequence(contig);
- // If this contig can't be found in the reference, skip over it.
- if(referenceSequence == null && contig != null)
- continue;
- final BrowseableBAMIndex index = (BrowseableBAMIndex)dataSource.getIndex(id);
- binMerger.addReader(id,
- index,
- referenceSequence.getSequenceIndex(),
- index.getBinsOverlapping(referenceSequence.getSequenceIndex(),1,referenceSequence.getSequenceLength()).iterator());
- // Cache the reader for later data lookup.
- readerToIndexMap.put(id,index);
- }
-
- PeekableIterator<BAMOverlap> binIterator = new PeekableIterator<BAMOverlap>(binMerger);
-
- for(GenomeLoc location: loci) {
- if(!location.getContig().equals(contig))
- throw new ReviewedStingException("Location outside bounds of contig");
-
- if(!binIterator.hasNext())
- break;
-
- int locationStart = location.getStart();
- final int locationStop = location.getStop();
-
- // Advance to first bin.
- while(binIterator.peek().stop < locationStart)
- binIterator.next();
+ private final GenomeLocParser parser;
- // Add all relevant bins to a list. If the given bin extends beyond the end of the current interval, make
- // sure the extending bin is not pruned from the list.
- List<BAMOverlap> bamOverlaps = new ArrayList<BAMOverlap>();
- while(binIterator.hasNext() && binIterator.peek().stop <= locationStop)
- bamOverlaps.add(binIterator.next());
- if(binIterator.hasNext() && binIterator.peek().start <= locationStop)
- bamOverlaps.add(binIterator.peek());
-
- // Bins found; try to match bins with locations.
- Iterator<BAMOverlap> bamOverlapIterator = bamOverlaps.iterator();
-
- while(locationStop >= locationStart) {
- int binStart = lastFilePointer!=null ? lastFilePointer.overlap.start : 0;
- int binStop = lastFilePointer!=null ? lastFilePointer.overlap.stop : 0;
-
- while(binStop < locationStart && bamOverlapIterator.hasNext()) {
- if(lastFilePointer != null && lastFilePointer.locations.size() > 0)
- filePointers.add(lastFilePointer);
-
- lastBAMOverlap = bamOverlapIterator.next();
- lastFilePointer = new FilePointer(lastBAMOverlap);
- binStart = lastFilePointer.overlap.start;
- binStop = lastFilePointer.overlap.stop;
- }
-
- if(locationStart < binStart) {
- // The region starts before the first bin in the sequence. Add the region occurring before the sequence.
- if(lastFilePointer != null && lastFilePointer.locations.size() > 0) {
- filePointers.add(lastFilePointer);
- lastFilePointer = null;
- lastBAMOverlap = null;
- }
-
- final int regionStop = Math.min(locationStop,binStart-1);
-
- GenomeLoc subset = loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,regionStop);
- lastFilePointer = new FilePointer(subset);
-
- locationStart = regionStop + 1;
- }
- else if(locationStart > binStop) {
- // The region starts after the last bin in the sequence. Add the region occurring after the sequence.
- if(lastFilePointer != null && lastFilePointer.locations.size() > 0) {
- filePointers.add(lastFilePointer);
- lastFilePointer = null;
- lastBAMOverlap = null;
- }
-
- GenomeLoc subset = loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,locationStop);
- filePointers.add(new FilePointer(subset));
-
- locationStart = locationStop + 1;
- }
- else {
- if(lastFilePointer == null)
- throw new ReviewedStingException("Illegal state: initializer failed to create cached file pointer.");
-
- // The start of the region overlaps the bin. Add the overlapping subset.
- final int regionStop = Math.min(locationStop,binStop);
- lastFilePointer.addLocation(loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,regionStop));
- locationStart = regionStop + 1;
- }
- }
- }
-
- if(lastFilePointer != null && lastFilePointer.locations.size() > 0)
- filePointers.add(lastFilePointer);
-
- // Lookup the locations for every file pointer in the index.
- for(SAMReaderID id: readerToIndexMap.keySet()) {
- BrowseableBAMIndex index = readerToIndexMap.get(id);
- for(FilePointer filePointer: filePointers)
- filePointer.addFileSpans(id,index.getSpanOverlapping(filePointer.overlap.getBin(id)));
- }
-
- return filePointers;
+ public static IntervalSharder shardOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) {
+ return new IntervalSharder(BAMScheduler.createOverAllReads(dataSource,parser),parser);
}
- private static class BinMergingIterator implements Iterator<BAMOverlap> {
- private PriorityQueue<BinQueueState> binQueue = new PriorityQueue<BinQueueState>();
- private Queue<BAMOverlap> pendingOverlaps = new LinkedList<BAMOverlap>();
-
- public void addReader(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, Iterator<Bin> bins) {
- binQueue.add(new BinQueueState(id,index,referenceSequence,new IntervalSharder.LowestLevelBinFilteringIterator(index,bins)));
- }
-
- public boolean hasNext() {
- return pendingOverlaps.size() > 0 || !binQueue.isEmpty();
- }
-
- public BAMOverlap next() {
- if(!hasNext())
- throw new NoSuchElementException("No elements left in merging iterator");
- if(pendingOverlaps.isEmpty())
- advance();
- return pendingOverlaps.remove();
- }
-
- public void advance() {
- List<ReaderBin> bins = new ArrayList<ReaderBin>();
- int boundsStart, boundsStop;
-
- // Prime the pump
- if(binQueue.isEmpty())
- return;
- bins.add(getNextBin());
- boundsStart = bins.get(0).getStart();
- boundsStop = bins.get(0).getStop();
-
- // Accumulate all the bins that overlap the current bin, in sorted order.
- while(!binQueue.isEmpty() && peekNextBin().getStart() <= boundsStop) {
- ReaderBin bin = getNextBin();
- bins.add(bin);
- boundsStart = Math.min(boundsStart,bin.getStart());
- boundsStop = Math.max(boundsStop,bin.getStop());
- }
-
- List<Pair<Integer,Integer>> range = new ArrayList<Pair<Integer,Integer>>();
- int start = bins.get(0).getStart();
- int stop = bins.get(0).getStop();
- while(start <= boundsStop) {
- // Find the next stopping point.
- for(ReaderBin bin: bins) {
- stop = Math.min(stop,bin.getStop());
- if(start < bin.getStart())
- stop = Math.min(stop,bin.getStart()-1);
- }
-
- range.add(new Pair<Integer,Integer>(start,stop));
- // If the last entry added included the last element, stop.
- if(stop >= boundsStop)
- break;
-
- // Find the next start.
- start = stop + 1;
- for(ReaderBin bin: bins) {
- if(start >= bin.getStart() && start <= bin.getStop())
- break;
- else if(start < bin.getStart()) {
- start = bin.getStart();
- break;
- }
- }
- }
-
- // Add the next series of BAM overlaps to the window.
- for(Pair<Integer,Integer> window: range) {
- BAMOverlap bamOverlap = new BAMOverlap(window.first,window.second);
- for(ReaderBin bin: bins)
- bamOverlap.addBin(bin.id,bin.bin);
- pendingOverlaps.add(bamOverlap);
- }
- }
-
- public void remove() { throw new UnsupportedOperationException("Cannot remove from a merging iterator."); }
-
- private ReaderBin peekNextBin() {
- if(binQueue.isEmpty())
- throw new NoSuchElementException("No more bins are available");
- BinQueueState current = binQueue.peek();
- return new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.peekNextBin());
- }
-
- private ReaderBin getNextBin() {
- if(binQueue.isEmpty())
- throw new NoSuchElementException("No more bins are available");
- BinQueueState current = binQueue.remove();
- ReaderBin readerBin = new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.nextBin());
- if(current.hasNextBin())
- binQueue.add(current);
- return readerBin;
- }
-
+ public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary sequenceDictionary, final GenomeLocParser parser) {
+ return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource,sequenceDictionary,parser),parser);
}
- /**
- * Filters out bins not at the lowest level in the tree.
- */
- private static class LowestLevelBinFilteringIterator implements Iterator<Bin> {
- private BrowseableBAMIndex index;
- private Iterator<Bin> wrappedIterator;
-
- private Bin nextBin;
-
- public LowestLevelBinFilteringIterator(final BrowseableBAMIndex index, Iterator<Bin> iterator) {
- this.index = index;
- this.wrappedIterator = iterator;
- advance();
- }
-
- public boolean hasNext() {
- return nextBin != null;
- }
-
- public Bin next() {
- Bin bin = nextBin;
- advance();
- return bin;
- }
-
- public void remove() { throw new UnsupportedOperationException("Remove operation is not supported"); }
-
- private void advance() {
- nextBin = null;
- while(wrappedIterator.hasNext() && nextBin == null) {
- Bin bin = wrappedIterator.next();
- if(index.getLevelForBin(bin) == AbstractBAMFileIndex.getNumIndexLevels()-1)
- nextBin = bin;
- }
- }
+ public static IntervalSharder shardOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
+ return new IntervalSharder(BAMScheduler.createOverIntervals(dataSource,loci),loci.getGenomeLocParser());
}
-}
-
-class BinQueueState implements Comparable<BinQueueState> {
- private final SAMReaderID id;
- private final BrowseableBAMIndex index;
- private final int referenceSequence;
- private final PeekableIterator<Bin> bins;
- private int firstLocusInCurrentBin;
- private int lastLocusInCurrentBin;
-
- public BinQueueState(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Iterator<Bin> bins) {
- this.id = id;
- this.index = index;
- this.referenceSequence = referenceSequence;
- this.bins = new PeekableIterator<Bin>(bins);
- refreshLocusInBinCache();
- }
-
- public SAMReaderID getReaderID() {
- return id;
- }
-
- public BrowseableBAMIndex getIndex() {
- return index;
- }
-
- public int getReferenceSequence() {
- return referenceSequence;
- }
-
- public boolean hasNextBin() {
- return bins.hasNext();
+ private IntervalSharder(final BAMScheduler scheduler, final GenomeLocParser parser) {
+ wrappedIterator = new PeekableIterator<FilePointer>(scheduler);
+ this.parser = parser;
}
- public Bin peekNextBin() {
- return bins.peek();
+ public boolean hasNext() {
+ return wrappedIterator.hasNext();
}
- public Bin nextBin() {
- Bin nextBin = bins.next();
- refreshLocusInBinCache();
- return nextBin;
- }
-
- public int compareTo(org.broadinstitute.sting.gatk.datasources.reads.BinQueueState other) {
- if(!this.bins.hasNext() && !other.bins.hasNext()) return 0;
- if(!this.bins.hasNext()) return -1;
- if(!other.bins.hasNext()) return 1;
-
- // Both BinQueueStates have next bins. Before proceeding, make sure the bin cache is valid.
- if(this.firstLocusInCurrentBin <= 0 || this.lastLocusInCurrentBin <= 0 ||
- other.firstLocusInCurrentBin <= 0 || other.lastLocusInCurrentBin <= 0) {
- throw new ReviewedStingException("Sharding mechanism error - bin->locus cache is invalid.");
- }
-
- // Straight integer subtraction works here because lhsStart, rhsStart always positive.
- if(this.firstLocusInCurrentBin != other.firstLocusInCurrentBin)
- return this.firstLocusInCurrentBin - other.firstLocusInCurrentBin;
-
- // Straight integer subtraction works here because lhsStop, rhsStop always positive.
- return this.lastLocusInCurrentBin - other.lastLocusInCurrentBin;
+ /**
+ * Accumulate shards where there's no additional cost to processing the next shard in the sequence.
+ * @return The next file pointer to process.
+ */
+ public FilePointer next() {
+ FilePointer current = wrappedIterator.next();
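+ // Fold in subsequent pointers while they share mapped/unmapped status and sit at
+ // zero distance (minus(...) == 0), so contiguous regions become a single shard.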
+ while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0)
+ current = current.combine(parser,wrappedIterator.next());
+ return current;
}
- private void refreshLocusInBinCache() {
- firstLocusInCurrentBin = -1;
- lastLocusInCurrentBin = -1;
- if(bins.hasNext()) {
- Bin bin = bins.peek();
- firstLocusInCurrentBin = index.getFirstLocusInBin(bin);
- lastLocusInCurrentBin = index.getLastLocusInBin(bin);
- }
- }
-}
\ No newline at end of file
+ public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java
new file mode 100644
index 0000000000..585b63457c
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.datasources.reads;
+
+import java.util.Iterator;
+
+/**
+ * Batch granular file pointers into potentially larger shards.
+ */
+public class LocusShardBalancer extends ShardBalancer {
+ /**
+ * Convert iterators of file pointers into balanced iterators of shards.
+ * @return An iterator over balanced shards.
+ */
+ public Iterator<Shard> iterator() {
+ return new Iterator<Shard>() {
+ public boolean hasNext() {
+ return filePointers.hasNext();
+ }
+
+ public Shard next() {
+ FilePointer current = filePointers.next();
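+ // Combine contiguous file pointers (minus(...) == 0) into one larger locus shard.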
+ while(filePointers.hasNext() && current.minus(filePointers.peek()) == 0)
+ current = current.combine(parser,filePointers.next());
+ return new LocusShard(parser,readsDataSource,current.getLocations(),current.fileSpans);
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
+ }
+ };
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java
deleted file mode 100755
index a5ca078534..0000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- * Copyright (c) 2010, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.datasources.reads;
-
-import net.sf.picard.reference.IndexedFastaSequenceFile;
-import net.sf.samtools.SAMFileHeader;
-import net.sf.samtools.SAMFileSpan;
-import net.sf.samtools.SAMSequenceRecord;
-import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.GenomeLocParser;
-import org.broadinstitute.sting.utils.GenomeLocSortedSet;
-
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-
-/**
- * A sharding strategy for loci based on reading of the index.
- */
-public class LocusShardStrategy implements ShardStrategy {
- /**
- * The data source to use when performing this sharding.
- */
- private final SAMDataSource reads;
-
- /**
- * the parser for creating shards
- */
- private GenomeLocParser genomeLocParser;
-
- /**
- * An iterator through the available file pointers.
- */
- private final Iterator<FilePointer> filePointerIterator;
-
- /**
- * Construct the shard strategy from a sequence dictionary, a shard size, and genomeLocs.
- * @param reads Data source from which to load index data.
- * @param locations List of locations for which to load data.
- */
- public LocusShardStrategy(SAMDataSource reads, IndexedFastaSequenceFile reference, GenomeLocParser genomeLocParser, GenomeLocSortedSet locations) {
- this.reads = reads;
- this.genomeLocParser = genomeLocParser;
-
- if(!reads.isEmpty()) {
- GenomeLocSortedSet intervals;
- if(locations == null) {
- // If no locations were passed in, shard the entire BAM file.
- SAMFileHeader header = reads.getHeader();
- intervals = new GenomeLocSortedSet(genomeLocParser);
-
- for(SAMSequenceRecord readsSequenceRecord: header.getSequenceDictionary().getSequences()) {
- // Check this sequence against the reference sequence dictionary.
- // TODO: Do a better job of merging reads + reference.
- SAMSequenceRecord refSequenceRecord = reference.getSequenceDictionary().getSequence(readsSequenceRecord.getSequenceName());
- if(refSequenceRecord != null) {
- final int length = Math.min(readsSequenceRecord.getSequenceLength(),refSequenceRecord.getSequenceLength());
- intervals.add(genomeLocParser.createGenomeLoc(readsSequenceRecord.getSequenceName(),1,length));
- }
- }
- }
- else
- intervals = locations;
-
- if(reads.isLowMemoryShardingEnabled()) {
- /*
- Iterator<FilePointer> filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals);
- List<FilePointer> filePointers = new ArrayList<FilePointer>();
- while(filePointerIterator.hasNext())
- filePointers.add(filePointerIterator.next());
- this.filePointerIterator = filePointers.iterator();
- */
- this.filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals);
- }
- else
- this.filePointerIterator = IntervalSharder.shardIntervals(this.reads,intervals);
- }
- else {
- final int maxShardSize = 100000;
- List<FilePointer> filePointers = new ArrayList<FilePointer>();
- if(locations == null) {
- for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) {
- for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) {
- final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength());
- filePointers.add(new FilePointer(genomeLocParser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)));
- }
- }
- }
- else {
- for(GenomeLoc interval: locations) {
- while(interval.size() > maxShardSize) {
- filePointers.add(new FilePointer(locations.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)));
- interval = locations.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop());
- }
- filePointers.add(new FilePointer(interval));
- }
- }
- filePointerIterator = filePointers.iterator();
- }
-
- }
-
- /**
- * returns true if there are additional shards
- *
- * @return false if we're done processing shards
- */
- public boolean hasNext() {
- return filePointerIterator.hasNext();
- }
-
- public long shardNumber = 0;
-
- /**
- * gets the next Shard
- *
- * @return the next shard
- */
- public LocusShard next() {
- FilePointer nextFilePointer = filePointerIterator.next();
- Map<SAMReaderID,SAMFileSpan> fileSpansBounding = nextFilePointer.fileSpans != null ? nextFilePointer.fileSpans : null;
-
- /*
- System.out.printf("Shard %d: interval = {",++shardNumber);
- for(GenomeLoc locus: nextFilePointer.locations)
- System.out.printf("%s;",locus);
- System.out.printf("}; ");
-
- if(fileSpansBounding == null)
- System.out.printf("no shard data%n");
- else {
- SortedMap<SAMReaderID,SAMFileSpan> sortedSpans = new TreeMap<SAMReaderID,SAMFileSpan>(fileSpansBounding);
- for(Map.Entry<SAMReaderID,SAMFileSpan> entry: sortedSpans.entrySet()) {
- System.out.printf("Shard %d:%s = {%s}%n",shardNumber,entry.getKey().samFile,entry.getValue());
- }
- }
- */
-
- return new LocusShard(genomeLocParser, reads,nextFilePointer.locations,fileSpansBounding);
- }
-
- /** we don't support the remove command */
- public void remove() {
- throw new UnsupportedOperationException("ShardStrategies don't support remove()");
- }
-
- /**
- * makes the IntervalShard iterable, i.e. usable in a for loop.
- *
- * @return This shard strategy, as its own iterator.
- */
- public Iterator<Shard> iterator() {
- return this;
- }
-}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java
deleted file mode 100644
index bf5f33dc34..0000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2011, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.datasources.reads;
-
-import net.sf.picard.util.PeekableIterator;
-import org.broadinstitute.sting.utils.GenomeLocParser;
-import org.broadinstitute.sting.utils.GenomeLocSortedSet;
-
-import java.util.Iterator;
-
-/**
- * Handles the process of aggregating BAM intervals into individual shards.
- */
-public class LowMemoryIntervalSharder implements Iterator<FilePointer> {
- /**
- * The iterator actually laying out the data for BAM scheduling.
- */
- private final PeekableIterator<FilePointer> wrappedIterator;
-
- /**
- * The parser, for interval manipulation.
- */
- private final GenomeLocParser parser;
-
- public LowMemoryIntervalSharder(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
- wrappedIterator = new PeekableIterator<FilePointer>(new BAMScheduler(dataSource,loci));
- parser = loci.getGenomeLocParser();
- }
-
- public boolean hasNext() {
- return wrappedIterator.hasNext();
- }
-
- /**
- * Accumulate shards where there's no additional cost to processing the next shard in the sequence.
- * @return The next file pointer to process.
- */
- public FilePointer next() {
- FilePointer current = wrappedIterator.next();
- while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0)
- current = current.combine(parser,wrappedIterator.next());
- return current;
- }
-
- public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShard.java
deleted file mode 100644
index 278eeb8989..0000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShard.java
+++ /dev/null
@@ -1,34 +0,0 @@
-package org.broadinstitute.sting.gatk.datasources.reads;
-
-import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.GenomeLocParser;
-import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
-
-import java.util.List;
-
-/**
- * A single, monolithic shard bridging all available data.
- * @author mhanna
- * @version 0.1
- */
-public class MonolithicShard extends Shard {
- /**
- * Creates a new monolithic shard of the given type.
- * @param shardType Type of the shard. Must be either read or locus; cannot be intervalic.
- * @param locs Intervals that this monolithic shard should process.
- */
- public MonolithicShard(GenomeLocParser parser, SAMDataSource readsDataSource, ShardType shardType, List<GenomeLoc> locs) {
- super(parser, shardType, locs, readsDataSource, null, false);
- if(shardType != ShardType.LOCUS && shardType != ShardType.READ)
- throw new ReviewedStingException("Invalid shard type for monolithic shard: " + shardType);
- }
-
- /**
- * String representation of this shard.
- * @return "entire genome".
- */
- @Override
- public String toString() {
- return "entire genome";
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShardStrategy.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShardStrategy.java
deleted file mode 100644
index 28b737f283..0000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/MonolithicShardStrategy.java
+++ /dev/null
@@ -1,77 +0,0 @@
-package org.broadinstitute.sting.gatk.datasources.reads;
-
-import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.GenomeLocParser;
-
-import java.util.Iterator;
-import java.util.List;
-import java.util.NoSuchElementException;
-
-/**
- * Create a giant shard representing all the data in the input BAM(s).
- *
- * @author mhanna
- * @version 0.1
- */
-public class MonolithicShardStrategy implements ShardStrategy {
- /**
- * The single shard associated with this sharding strategy.
- */
- private MonolithicShard shard;
-
- /**
- * Create a new shard strategy for shards of the given type.
- * @param shardType The shard type.
- */
- public MonolithicShardStrategy(final GenomeLocParser parser, final SAMDataSource readsDataSource, final Shard.ShardType shardType, final List<GenomeLoc> region) {
- shard = new MonolithicShard(parser,readsDataSource,shardType,region);
- }
-
- /**
- * Convenience for using in a foreach loop. Will NOT create a new, reset instance of the iterator;
- * will only return another copy of the active iterator.
- * @return A copy of this.
- */
- public Iterator<Shard> iterator() {
- return this;
- }
-
- /**
- * Returns true if the monolithic shard has not yet been consumed, or false otherwise.
- * @return True if shard has been consumed, false otherwise.
- */
- public boolean hasNext() {
- return shard != null;
- }
-
- /**
- * Returns the monolithic shard if it has not already been retrieved.
- * @return The monolithic shard.
- * @throws NoSuchElementException if no such data exists.
- */
- public Shard next() {
- if(shard == null)
- throw new NoSuchElementException("Monolithic shard has already been retrived.");
-
- Shard working = shard;
- shard = null;
- return working;
- }
-
- /**
- * Mandated by the interface, but is unsupported in this context. Will throw an exception always.
- */
- public void remove() {
- throw new UnsupportedOperationException("Cannot remove from a shard strategy");
- }
-
- /**
- * Mandated by the interface, but is unsupported in this context. Will throw an exception always.
- * @param size adjust the next size to this
- */
- public void adjustNextShardSize( long size ) {
- throw new UnsupportedOperationException("Cannot adjust the next size of a monolithic shard; there will be no next shard.");
- }
-
-}
-
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java
index 4d9c9092d6..8d73b1b158 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java
@@ -35,15 +35,29 @@
* @version 0.1
*/
public class ReadShard extends Shard {
+ /**
+ * What is the maximum number of reads which should go into a read shard.
+ */
+ public static int MAX_READS = 10000;
+
/**
* The reads making up this shard.
*/
- private final Collection<SAMRecord> reads = new ArrayList<SAMRecord>(ReadShardStrategy.MAX_READS);
+ private final Collection<SAMRecord> reads = new ArrayList<SAMRecord>(MAX_READS);
public ReadShard(GenomeLocParser parser, SAMDataSource readsDataSource, Map<SAMReaderID,SAMFileSpan> fileSpans, List<GenomeLoc> loci, boolean isUnmapped) {
super(parser, ShardType.READ, loci, readsDataSource, fileSpans, isUnmapped);
}
+ /**
+ * Sets the maximum number of reads buffered in a read shard. Implemented as a weirdly static interface
+ * until we know what effect tuning this parameter has.
+ * @param bufferSize New maximum number of reads to buffer in a single shard.
+ */
+ static void setReadBufferSize(final int bufferSize) {
+ MAX_READS = bufferSize;
+ }
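+
+ // Hypothetical tuning sketch (not part of this change):
+ // ReadShard.setReadBufferSize(50000); // bigger shards: fewer shard boundaries, more memory per shard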
+
/**
* Returns true if this shard is meant to buffer reads, rather
* than just holding pointers to their locations.
@@ -66,7 +80,7 @@ public boolean isBufferEmpty() {
* @return True if this shard's buffer is full (and the shard can buffer reads).
*/
public boolean isBufferFull() {
- return reads.size() > ReadShardStrategy.MAX_READS;
+ return reads.size() > ReadShard.MAX_READS;
}
/**
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java
new file mode 100644
index 0000000000..311c7874f9
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.datasources.reads;
+
+import net.sf.samtools.GATKBAMFileSpan;
+import net.sf.samtools.SAMFileSpan;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.NoSuchElementException;
+
+/**
+ * Divide up large file pointers containing reads into more manageable subcomponents.
+ */
+public class ReadShardBalancer extends ShardBalancer {
+ /**
+ * Convert iterators of file pointers into balanced iterators of shards.
+ * @return An iterator over balanced shards.
+ */
+ public Iterator