diff --git a/build.xml b/build.xml
index abe3a32a15..d3e25d4244 100644
--- a/build.xml
+++ b/build.xml
@@ -1,5 +1,5 @@
-
+
+
+
+
+
+
+
+
diff --git a/ivy.xml b/ivy.xml
index 4f41904ba4..06296c6b4a 100644
--- a/ivy.xml
+++ b/ivy.xml
@@ -1,3 +1,26 @@
+
@@ -18,10 +41,9 @@
-
+
-
@@ -40,17 +62,17 @@
-
+
-
+
-
+
diff --git a/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R b/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R
index d5ee3626f4..ae340e688d 100644
--- a/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R
+++ b/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R
@@ -1,6 +1,7 @@
library(gsalib)
-require("ggplot2")
-require("gplots")
+library(ggplot2)
+library(gplots)
+library(tools)
#
# Standard command line switch. Can be loaded interactively for development
@@ -201,4 +202,7 @@ for ( group in gatkReportData ) {
if ( ! is.na(outputPDF) ) {
dev.off()
-}
+ if (exists("compactPDF")) {
+ compactPDF(outputPDF)
+ }
+}
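
Note on the hunk above: `tools::compactPDF()` only exists in newer R releases (it was added in R 2.13), so the `exists()` guard keeps the report working on older installations, merely skipping recompression there. A minimal sketch of the pattern, with a hypothetical output path and a placeholder plot standing in for the report figures:

```r
library(tools)

outputPDF <- "jobreport.pdf"   # hypothetical path

pdf(outputPDF)
plot(1:10)                     # stand-in for the real report plots
dev.off()

# Recompress the finished PDF only when this R version provides compactPDF().
if (exists("compactPDF")) {
  compactPDF(outputPDF)
}
```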
diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R
index 46bbf7eda5..876cf5cbc9 100644
--- a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R
+++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R
@@ -4,11 +4,9 @@
colnames(d) = tableHeader;
for (i in 1:ncol(d)) {
- v = suppressWarnings(as.numeric(d[,i]));
-
- if (length(na.omit(as.numeric(v))) == length(d[,i])) {
- d[,i] = v;
- }
+ # use the general type.convert infrastructure of read.table to convert column data to R types
+ v = type.convert(d[,i])
+ d[,i] = v;
}
usedNames = ls(envir=tableEnv, pattern=tableName);
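
For reference, the difference between the old all-or-nothing `as.numeric` round-trip and `type.convert`: the latter is the same per-column typing routine `read.table` uses, so integer, double, logical, and `NA` values all come out with sensible types, and genuinely textual columns are left alone (as factors, by default). A small sketch on toy data, not tied to the GATKReport format:

```r
d <- data.frame(x = c("1", "2", "3"),
                y = c("0.5", "NA", "2.5"),
                z = c("foo", "bar", "baz"),
                stringsAsFactors = FALSE)

for (i in 1:ncol(d)) {
  # type.convert picks the narrowest fitting type per column and
  # treats the string "NA" as a missing value, as read.table would.
  d[, i] <- type.convert(d[, i])
}
str(d)  # x -> integer, y -> numeric (with NA), z -> factor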
diff --git a/public/c/bwa/build_linux.sh b/public/c/bwa/build_linux.sh
index b3631a28df..8683bb3772 100755
--- a/public/c/bwa/build_linux.sh
+++ b/public/c/bwa/build_linux.sh
@@ -1,5 +1,5 @@
#!/bin/sh
-export BWA_HOME="/humgen/gsa-scr1/hanna/src/bwa-trunk/bwa"
+export BWA_HOME="/humgen/gsa-scr1/hanna/src/bio-bwa/bwa"
export JAVA_INCLUDE="/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/include -I/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/include/linux"
export TARGET_LIB="libbwa.so"
export EXTRA_LIBS="-lc -lz -lstdc++ -lpthread"
diff --git a/public/c/bwa/bwa_gateway.cpp b/public/c/bwa/bwa_gateway.cpp
index 00f5aa5bcd..088ee43bf9 100644
--- a/public/c/bwa/bwa_gateway.cpp
+++ b/public/c/bwa/bwa_gateway.cpp
@@ -233,6 +233,8 @@ void BWA::set_disallow_indel_within_range(int indel_range) { options.indel_end_s
void BWA::set_mismatch_penalty(int penalty) { options.s_mm = penalty; }
void BWA::set_gap_open_penalty(int penalty) { options.s_gapo = penalty; }
void BWA::set_gap_extension_penalty(int penalty) { options.s_gape = penalty; }
+void BWA::set_mode_nonstop() { options.mode |= BWA_MODE_NONSTOP; options.max_top2 = 0x7fffffff; }
+void BWA::set_max_entries_in_queue(int max_entries) { options.max_entries = max_entries; }
/**
* Create a sequence with a set of reasonable initial defaults.
diff --git a/public/c/bwa/bwa_gateway.h b/public/c/bwa/bwa_gateway.h
index 2d26ec6509..62756ec2a1 100644
--- a/public/c/bwa/bwa_gateway.h
+++ b/public/c/bwa/bwa_gateway.h
@@ -60,6 +60,8 @@ class BWA {
void set_mismatch_penalty(int penalty);
void set_gap_open_penalty(int penalty);
void set_gap_extension_penalty(int penalty);
+ void set_mode_nonstop();
+ void set_max_entries_in_queue(int max_entries);
// Perform the alignment
Alignment* generate_single_alignment(const char* bases,
diff --git a/public/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp b/public/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp
index 1ccbef0d41..90d70d4a1b 100644
--- a/public/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp
+++ b/public/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp
@@ -8,11 +8,13 @@
#include "bwa_gateway.h"
#include "org_broadinstitute_sting_alignment_bwa_c_BWACAligner.h"
+typedef void (BWA::*boolean_setter)();
typedef void (BWA::*int_setter)(int value);
typedef void (BWA::*float_setter)(float value);
static jobject convert_to_java_alignment(JNIEnv* env, const jbyte* read_bases, const jsize read_length, const Alignment& alignment);
static jstring get_configuration_file(JNIEnv* env, jobject configuration, const char* field_name);
+static void set_boolean_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, boolean_setter setter);
static void set_int_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, int_setter setter);
static void set_float_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, float_setter setter);
static void throw_config_value_exception(JNIEnv* env, const char* field_name, const char* message);
@@ -100,6 +102,10 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner
if(env->ExceptionCheck()) return;
set_int_configuration_param(env, configuration, "gapExtensionPenalty", bwa, &BWA::set_gap_extension_penalty);
if(env->ExceptionCheck()) return;
+ set_boolean_configuration_param(env, configuration, "nonStopMode", bwa, &BWA::set_mode_nonstop);
+ if(env->ExceptionCheck()) return;
+ set_int_configuration_param(env, configuration, "maxEntriesInQueue", bwa, &BWA::set_max_entries_in_queue);
+ if(env->ExceptionCheck()) return;
}
JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getPaths(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases)
@@ -357,6 +363,36 @@ static jstring get_configuration_file(JNIEnv* env, jobject configuration, const
return path;
}
+static void set_boolean_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, boolean_setter setter) {
+ jclass configuration_class = env->GetObjectClass(configuration);
+ if(configuration_class == NULL) return;
+
+ jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/lang/Boolean;");
+ if(configuration_field == NULL) return;
+
+ jobject boxed_value = env->GetObjectField(configuration,configuration_field);
+ if(env->ExceptionCheck()) return;
+
+ if(boxed_value != NULL) {
+ jclass boolean_box_class = env->FindClass("java/lang/Boolean");
+ if(boolean_box_class == NULL) return;
+
+ jmethodID boolean_extractor = env->GetMethodID(boolean_box_class,"booleanValue", "()Z");
+ if(boolean_extractor == NULL) return;
+
+ jboolean value = env->CallBooleanMethod(boxed_value,boolean_extractor);
+ if(env->ExceptionCheck()) return;
+
+ if(value)
+ (bwa->*setter)();
+
+ env->DeleteLocalRef(boolean_box_class);
+ }
+
+ env->DeleteLocalRef(boxed_value);
+ env->DeleteLocalRef(configuration_class);
+}
+
static void set_int_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, int_setter setter) {
jclass configuration_class = env->GetObjectClass(configuration);
if(configuration_class == NULL) return;
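
The new boolean plumbing above reuses this file's existing pattern of passing pointers to member functions into generic setter helpers. Stripped of the JNI unboxing details, the C++ idiom looks roughly like this (the class and option names here are illustrative only, not the real gateway API):

```cpp
#include <iostream>

class Aligner {
public:
  void set_mode_nonstop()     { nonstop = true; }
  void set_max_entries(int n) { max_entries = n; }
  bool nonstop = false;
  int  max_entries = 0;
};

// Pointer-to-member typedefs, mirroring boolean_setter / int_setter above.
typedef void (Aligner::*flag_setter)();
typedef void (Aligner::*int_setter)(int);

// Generic helpers: given a configuration value, dispatch through the pointer.
static void apply_flag(Aligner* a, bool present, flag_setter setter) {
  if (present)
    (a->*setter)();            // ->* invokes the member through the pointer
}

static void apply_int(Aligner* a, int value, int_setter setter) {
  (a->*setter)(value);
}

int main() {
  Aligner aligner;
  apply_flag(&aligner, true, &Aligner::set_mode_nonstop);
  apply_int(&aligner, 125000, &Aligner::set_max_entries);
  std::cout << aligner.nonstop << " " << aligner.max_entries << "\n";
}
```

This keeps one JNI field-lookup routine per value type (boolean, int, float) instead of one per configuration option.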
diff --git a/public/java/src/net/sf/picard/sam/MergingSamRecordIterator.java b/public/java/src/net/sf/picard/sam/MergingSamRecordIterator.java
deleted file mode 100644
index 4b1c7a9994..0000000000
--- a/public/java/src/net/sf/picard/sam/MergingSamRecordIterator.java
+++ /dev/null
@@ -1,247 +0,0 @@
-/*
- * Copyright (c) 2011, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-package net.sf.picard.sam;
-
-import net.sf.picard.PicardException;
-
-import java.util.*;
-import java.lang.reflect.Constructor;
-
-import net.sf.samtools.*;
-import net.sf.samtools.util.CloseableIterator;
-
-/**
- * Provides an iterator interface for merging multiple underlying iterators into a single
- * iterable stream. The underlying iterators/files must all have the same sort order unless
- * the requested output format is unsorted, in which case any combination is valid.
- */
-public class MergingSamRecordIterator implements CloseableIterator<SAMRecord> {
- private final PriorityQueue<ComparableSamRecordIterator> pq;
- private final SamFileHeaderMerger samHeaderMerger;
- private final Collection<SAMFileReader> readers;
- private final SAMFileHeader.SortOrder sortOrder;
- private final SAMRecordComparator comparator;
-
- private boolean initialized = false;
- private boolean iterationStarted = false;
-
- /**
- * Constructs a new merging iterator with the same set of readers and sort order as
- * provided by the header merger parameter.
- * @param headerMerger The merged header and contents of readers.
- * @param forcePresorted True to assume the inputs are already sorted, skipping the check of each reader's header for appropriate sort order.
- * @deprecated replaced by (SamFileHeaderMerger, Collection, boolean)
- */
- public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, final boolean forcePresorted) {
- this(headerMerger, headerMerger.getReaders(), forcePresorted);
- }
-
- /**
- * Constructs a new merging iterator with the same set of readers and sort order as
- * provided by the header merger parameter.
- * @param headerMerger The merged header and contents of readers.
- * @param assumeSorted false ensures that the iterator checks the headers of the readers for appropriate sort order.
- */
- public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, Collection<SAMFileReader> readers, final boolean assumeSorted) {
- this.samHeaderMerger = headerMerger;
- this.sortOrder = headerMerger.getMergedHeader().getSortOrder();
- this.comparator = getComparator();
- this.readers = readers;
-
- this.pq = new PriorityQueue<ComparableSamRecordIterator>(readers.size());
-
- for (final SAMFileReader reader : readers) {
- if (!assumeSorted && this.sortOrder != SAMFileHeader.SortOrder.unsorted &&
- reader.getFileHeader().getSortOrder() != this.sortOrder){
- throw new PicardException("Files are not compatible with sort order");
- }
- }
- }
-
- /**
- * Add a given SAM file iterator to the merging iterator. Use this to restrict the merged iteration to a given genomic interval,
- * rather than iterating over every read in the backing file or stream.
- * @param reader Reader to add to the merging iterator.
- * @param iterator Iterator traversing over reader contents.
- */
- public void addIterator(final SAMFileReader reader, final CloseableIterator<SAMRecord> iterator) {
- if(iterationStarted)
- throw new PicardException("Cannot add another iterator; iteration has already begun");
- if(!samHeaderMerger.containsHeader(reader.getFileHeader()))
- throw new PicardException("All iterators to be merged must be accounted for in the SAM header merger");
- final ComparableSamRecordIterator comparableIterator = new ComparableSamRecordIterator(reader,iterator,comparator);
- addIfNotEmpty(comparableIterator);
- initialized = true;
- }
-
- private void startIterationIfRequired() {
- if(initialized)
- return;
- for(SAMFileReader reader: readers)
- addIterator(reader,reader.iterator());
- iterationStarted = true;
- }
-
- /**
- * Close down all open iterators.
- */
- public void close() {
- // Iterators not in the priority queue have already been closed; only close down the iterators that are still in the priority queue.
- for(CloseableIterator<SAMRecord> iterator: pq)
- iterator.close();
- }
-
- /** Returns true if any of the underlying iterators has more records, otherwise false. */
- public boolean hasNext() {
- startIterationIfRequired();
- return !this.pq.isEmpty();
- }
-
- /** Returns the next record from the top most iterator during merging. */
- public SAMRecord next() {
- startIterationIfRequired();
-
- final ComparableSamRecordIterator iterator = this.pq.poll();
- final SAMRecord record = iterator.next();
- addIfNotEmpty(iterator);
- record.setHeader(this.samHeaderMerger.getMergedHeader());
-
- // Fix the read group if need be
- if (this.samHeaderMerger.hasReadGroupCollisions()) {
- final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.READ_GROUP_ID);
- if (oldGroupId != null ) {
- final String newGroupId = this.samHeaderMerger.getReadGroupId(iterator.getReader().getFileHeader(),oldGroupId);
- record.setAttribute(ReservedTagConstants.READ_GROUP_ID, newGroupId);
- }
- }
-
- // Fix the program group if need be
- if (this.samHeaderMerger.hasProgramGroupCollisions()) {
- final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.PROGRAM_GROUP_ID);
- if (oldGroupId != null ) {
- final String newGroupId = this.samHeaderMerger.getProgramGroupId(iterator.getReader().getFileHeader(),oldGroupId);
- record.setAttribute(ReservedTagConstants.PROGRAM_GROUP_ID, newGroupId);
- }
- }
-
- // Fix up the sequence indexes if need be
- if (this.samHeaderMerger.hasMergedSequenceDictionary()) {
- if (record.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
- record.setReferenceIndex(this.samHeaderMerger.getMergedSequenceIndex(iterator.getReader().getFileHeader(),record.getReferenceIndex()));
- }
-
- if (record.getReadPairedFlag() && record.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
- record.setMateReferenceIndex(this.samHeaderMerger.getMergedSequenceIndex(iterator.getReader().getFileHeader(),record.getMateReferenceIndex()));
- }
- }
-
- return record;
- }
-
- /**
- * Adds iterator to priority queue. If the iterator has more records it is added
- * otherwise it is closed and not added.
- */
- private void addIfNotEmpty(final ComparableSamRecordIterator iterator) {
- if (iterator.hasNext()) {
- pq.offer(iterator);
- }
- else {
- iterator.close();
- }
- }
-
- /** Unsupported operation. */
- public void remove() {
- throw new UnsupportedOperationException("MergingSamRecordIterator.remove()");
- }
-
- /**
- * Get the right comparator for a given sort order (coordinate, alphabetic). In the
- * case of "unsorted" it will return a comparator that gives an arbitrary but reflexive
- * ordering.
- */
- private SAMRecordComparator getComparator() {
- // For unsorted build a fake comparator that compares based on object ID
- if (this.sortOrder == SAMFileHeader.SortOrder.unsorted) {
- return new SAMRecordComparator() {
- public int fileOrderCompare(final SAMRecord lhs, final SAMRecord rhs) {
- return System.identityHashCode(lhs) - System.identityHashCode(rhs);
- }
-
- public int compare(final SAMRecord lhs, final SAMRecord rhs) {
- return fileOrderCompare(lhs, rhs);
- }
- };
- }
- if (samHeaderMerger.hasMergedSequenceDictionary() && sortOrder.equals(SAMFileHeader.SortOrder.coordinate)) {
- return new MergedSequenceDictionaryCoordinateOrderComparator();
- }
-
- // Otherwise try and figure out what kind of comparator to return and build it
- return this.sortOrder.getComparatorInstance();
- }
-
- /** Returns the merged header that the merging iterator is working from. */
- public SAMFileHeader getMergedHeader() {
- return this.samHeaderMerger.getMergedHeader();
- }
-
- /**
- * Ugh. Basically does a regular coordinate compare, but looks up the sequence indices in the merged
- * sequence dictionary. I hate the fact that this extends SAMRecordCoordinateComparator, but it avoids
- * more copy & paste.
- */
- private class MergedSequenceDictionaryCoordinateOrderComparator extends SAMRecordCoordinateComparator {
-
- public int fileOrderCompare(final SAMRecord samRecord1, final SAMRecord samRecord2) {
- final int referenceIndex1 = getReferenceIndex(samRecord1);
- final int referenceIndex2 = getReferenceIndex(samRecord2);
- if (referenceIndex1 != referenceIndex2) {
- if (referenceIndex1 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
- return 1;
- } else if (referenceIndex2 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
- return -1;
- } else {
- return referenceIndex1 - referenceIndex2;
- }
- }
- if (referenceIndex1 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
- // Both are unmapped.
- return 0;
- }
- return samRecord1.getAlignmentStart() - samRecord2.getAlignmentStart();
- }
-
- private int getReferenceIndex(final SAMRecord samRecord) {
- if (samRecord.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
- return samHeaderMerger.getMergedSequenceIndex(samRecord.getHeader(), samRecord.getReferenceIndex());
- }
- if (samRecord.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
- return samHeaderMerger.getMergedSequenceIndex(samRecord.getHeader(), samRecord.getMateReferenceIndex());
- }
- return SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
- }
- }
-}
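
The class deleted above is a textbook k-way merge: each input iterator is wrapped so it can be ordered by the record it will yield next, the wrappers sit in a priority queue, and `next()` polls the smallest head, advances it, and re-queues it (that is what `addIfNotEmpty()` does). A self-contained sketch of the same pattern over sorted integer iterators — illustrative names, not the Picard API:

```java
import java.util.*;

// Minimal k-way merge over sorted inputs, mirroring the PriorityQueue
// strategy MergingSamRecordIterator used for SAMRecords.
class MergingIntIterator implements Iterator<Integer> {
    private final PriorityQueue<PeekingIterator> pq =
            new PriorityQueue<>(Comparator.comparingInt(PeekingIterator::peek));

    MergingIntIterator(Collection<Iterator<Integer>> inputs) {
        for (Iterator<Integer> it : inputs)
            if (it.hasNext()) pq.offer(new PeekingIterator(it)); // skip empty inputs
    }

    @Override public boolean hasNext() { return !pq.isEmpty(); }

    @Override public Integer next() {
        PeekingIterator top = pq.poll();   // iterator holding the smallest head
        int value = top.next();
        if (top.hasNext()) pq.offer(top);  // re-queue it, keyed on its new head
        return value;
    }

    // Wraps an iterator so the element it will yield next can be inspected.
    private static final class PeekingIterator implements Iterator<Integer> {
        private final Iterator<Integer> inner;
        private Integer head;
        PeekingIterator(Iterator<Integer> inner) { this.inner = inner; head = inner.next(); }
        int peek() { return head; }
        @Override public boolean hasNext() { return head != null; }
        @Override public Integer next() {
            Integer v = head;
            head = inner.hasNext() ? inner.next() : null;
            return v;
        }
    }
}
```

Feeding it `Arrays.asList(Arrays.asList(1, 4, 7).iterator(), Arrays.asList(2, 3, 9).iterator())` yields 1, 2, 3, 4, 7, 9 — the same behavior the real class obtains from a SAMRecordComparator over SAMRecords.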
diff --git a/public/java/src/net/sf/picard/sam/SamFileHeaderMerger.java b/public/java/src/net/sf/picard/sam/SamFileHeaderMerger.java
deleted file mode 100644
index f78cd81dac..0000000000
--- a/public/java/src/net/sf/picard/sam/SamFileHeaderMerger.java
+++ /dev/null
@@ -1,744 +0,0 @@
-/*
- * The MIT License
- *
- * Copyright (c) 2009 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-package net.sf.picard.sam;
-
-import java.util.*;
-
-import net.sf.picard.PicardException;
-import net.sf.samtools.AbstractSAMHeaderRecord;
-import net.sf.samtools.SAMFileHeader;
-import net.sf.samtools.SAMFileReader;
-import net.sf.samtools.SAMProgramRecord;
-import net.sf.samtools.SAMReadGroupRecord;
-import net.sf.samtools.SAMSequenceDictionary;
-import net.sf.samtools.SAMSequenceRecord;
-import net.sf.samtools.util.SequenceUtil;
-
-/**
- * Merges SAMFileHeaders that have the same sequences into a single merged header
- * object while providing read group translation for cases where read groups
- * clash across input headers.
- */
-public class SamFileHeaderMerger {
- //Super Header to construct
- private final SAMFileHeader mergedHeader;
- private Collection<SAMFileReader> readers;
- private final Collection<SAMFileHeader> headers;
-
- //Translation of old group ids to new group ids
- private final Map<SAMFileHeader, Map<String, String>> samReadGroupIdTranslation =
- new IdentityHashMap<SAMFileHeader, Map<String, String>>();
-
- //the read groups from different files use the same group ids
- private boolean hasReadGroupCollisions = false;
-
- //the program records from different files use the same program record ids
- private boolean hasProgramGroupCollisions = false;
-
- //Translation of old program group ids to new program group ids
- private Map<SAMFileHeader, Map<String, String>> samProgramGroupIdTranslation =
- new IdentityHashMap<SAMFileHeader, Map<String, String>>();
-
- private boolean hasMergedSequenceDictionary = false;
-
- // Translation of old sequence dictionary ids to new dictionary ids
- // This is an IdentityHashMap because it can be quite expensive to compute the hashCode for
- // large SAMFileHeaders. It is possible that two input files will have identical headers so that
- // the regular HashMap would fold them together, but the value stored in each of the two
- // Map entries will be the same, so it should not hurt anything.
- private final Map<SAMFileHeader, Map<Integer, Integer>> samSeqDictionaryIdTranslationViaHeader =
- new IdentityHashMap<SAMFileHeader, Map<Integer, Integer>>();
-
- //HeaderRecordFactory that creates SAMReadGroupRecord instances.
- private static final HeaderRecordFactory<SAMReadGroupRecord> READ_GROUP_RECORD_FACTORY = new HeaderRecordFactory<SAMReadGroupRecord>() {
- public SAMReadGroupRecord createRecord(String id, SAMReadGroupRecord srcReadGroupRecord) {
- return new SAMReadGroupRecord(id, srcReadGroupRecord);
- }
- };
-
- //HeaderRecordFactory that creates SAMProgramRecord instances.
- private static final HeaderRecordFactory<SAMProgramRecord> PROGRAM_RECORD_FACTORY = new HeaderRecordFactory<SAMProgramRecord>() {
- public SAMProgramRecord createRecord(String id, SAMProgramRecord srcProgramRecord) {
- return new SAMProgramRecord(id, srcProgramRecord);
- }
- };
-
- //comparator used to sort lists of program group and read group records
- private static final Comparator<AbstractSAMHeaderRecord> RECORD_ID_COMPARATOR = new Comparator<AbstractSAMHeaderRecord>() {
- public int compare(AbstractSAMHeaderRecord o1, AbstractSAMHeaderRecord o2) {
- return o1.getId().compareTo(o2.getId());
- }
- };
-
- /**
- * Create SAMFileHeader with additional information. Required that sequence dictionaries agree.
- *
- * @param readers sam file readers to combine
- * @param sortOrder sort order new header should have
- * @deprecated replaced by SamFileHeaderMerger(Collection, SAMFileHeader.SortOrder, boolean)
- */
- public SamFileHeaderMerger(final Collection<SAMFileReader> readers, final SAMFileHeader.SortOrder sortOrder) {
- this(readers, sortOrder, false);
- }
-
- /**
- * Create SAMFileHeader with additional information.
- *
- * @param readers sam file readers to combine
- * @param sortOrder sort order new header should have
- * @param mergeDictionaries If true, merge sequence dictionaries in new header. If false, require that
- * all input sequence dictionaries be identical.
- * @deprecated replaced by SamFileHeaderMerger(Collection, SAMFileHeader.SortOrder, boolean)
- */
- public SamFileHeaderMerger(final Collection<SAMFileReader> readers, final SAMFileHeader.SortOrder sortOrder, final boolean mergeDictionaries) {
- this(sortOrder, getHeadersFromReaders(readers), mergeDictionaries);
- this.readers = readers;
- }
-
- /**
- * Create SAMFileHeader with additional information. This is the preferred constructor.
- *
- * @param sortOrder sort order new header should have
- * @param headers sam file headers to combine
- * @param mergeDictionaries If true, merge sequence dictionaries in new header. If false, require that
- * all input sequence dictionaries be identical.
- */
- public SamFileHeaderMerger(final SAMFileHeader.SortOrder sortOrder, final Collection<SAMFileHeader> headers, final boolean mergeDictionaries) {
- this.headers = headers;
- this.mergedHeader = new SAMFileHeader();
-
- SAMSequenceDictionary sequenceDictionary;
- try {
- sequenceDictionary = getSequenceDictionary(headers);
- this.hasMergedSequenceDictionary = false;
- }
- catch (SequenceUtil.SequenceListsDifferException pe) {
- if (mergeDictionaries) {
- sequenceDictionary = mergeSequenceDictionaries(headers);
- this.hasMergedSequenceDictionary = true;
- }
- else {
- throw pe;
- }
- }
-
- this.mergedHeader.setSequenceDictionary(sequenceDictionary);
-
- // Set program that creates input alignments
- for (final SAMProgramRecord program : mergeProgramGroups(headers)) {
- this.mergedHeader.addProgramRecord(program);
- }
-
- // Set read groups for merged header
- final List readGroups = mergeReadGroups(headers);
- this.mergedHeader.setReadGroups(readGroups);
- this.mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.none);
-
- this.mergedHeader.setSortOrder(sortOrder);
-
- for (final SAMFileHeader header : headers) {
- for (final String comment : header.getComments()) {
- this.mergedHeader.addComment(comment);
- }
- }
- }
-
- // Utility method for use with the old constructors
- private static List<SAMFileHeader> getHeadersFromReaders(Collection<SAMFileReader> readers) {
- List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>(readers.size());
- for (SAMFileReader reader : readers) {
- headers.add(reader.getFileHeader());
- }
- return headers;
- }
-
-
- /**
- * Checks to see if there are clashes where different readers are using the same read
- * group IDs. If yes, then those IDs that collided are remapped.
- *
- * @param headers headers to combine
- * @return new list of read groups constructed from all the readers
- */
- private List<SAMReadGroupRecord> mergeReadGroups(final Collection<SAMFileHeader> headers) {
- //prepare args for mergeHeaderRecords(..) call
- final HashSet<String> idsThatAreAlreadyTaken = new HashSet<String>();
-
- final List<HeaderRecordAndFileHeader<SAMReadGroupRecord>> readGroupsToProcess = new LinkedList<HeaderRecordAndFileHeader<SAMReadGroupRecord>>();
- for (final SAMFileHeader header : headers) {
- for (final SAMReadGroupRecord readGroup : header.getReadGroups()) {
- //verify that there are no existing id collisions in this input file
- if(!idsThatAreAlreadyTaken.add(readGroup.getId()))
- throw new PicardException("Input file: " + header + " contains more than one RG with the same id (" + readGroup.getId() + ")");
-
- readGroupsToProcess.add(new HeaderRecordAndFileHeader<SAMReadGroupRecord>(readGroup, header));
- }
- idsThatAreAlreadyTaken.clear();
- }
-
- final List<SAMReadGroupRecord> result = new LinkedList<SAMReadGroupRecord>();
-
- hasReadGroupCollisions = mergeHeaderRecords(readGroupsToProcess, READ_GROUP_RECORD_FACTORY, idsThatAreAlreadyTaken, samReadGroupIdTranslation, result);
-
- //sort the result list by record id
- Collections.sort(result, RECORD_ID_COMPARATOR);
-
- return result;
- }
-
-
- /**
- * Checks to see if there are clashes where different readers are using the same program
- * group IDs. If yes, then those IDs that collided are remapped.
- *
- * @param headers headers to combine
- * @return new list of program groups constructed from all the readers
- */
- private List<SAMProgramRecord> mergeProgramGroups(final Collection<SAMFileHeader> headers) {
-
- final List<SAMProgramRecord> overallResult = new LinkedList<SAMProgramRecord>();
-
- //this Set will accumulate all SAMProgramRecord ids that have been encountered so far.
- final HashSet<String> idsThatAreAlreadyTaken = new HashSet<String>();
-
- //need to process all program groups
- List<HeaderRecordAndFileHeader<SAMProgramRecord>> programGroupsLeftToProcess = new LinkedList<HeaderRecordAndFileHeader<SAMProgramRecord>>();
- for (final SAMFileHeader header : headers) {
- for (final SAMProgramRecord programGroup : header.getProgramRecords()) {
- //verify that there are no existing id collisions in this input file
- if(!idsThatAreAlreadyTaken.add(programGroup.getId()))
- throw new PicardException("Input file: " + header + " contains more than one PG with the same id (" + programGroup.getId() + ")");
-
- programGroupsLeftToProcess.add(new HeaderRecordAndFileHeader<SAMProgramRecord>(programGroup, header));
- }
- idsThatAreAlreadyTaken.clear();
- }
-
- //A program group header (let's say ID=2 PN=B PP=1) may have a PP (previous program) attribute which chains it to
- //another program group header (let's say ID=1 PN=A) to indicate that the given file was
- //processed by program A followed by program B. These PP attributes potentially
- //connect headers into one or more tree structures. Merging is done by
- //first merging all headers that don't have PP attributes (eg. tree roots),
- //then updating and merging all headers whose PPs point to the tree-root headers,
- //and so on until all program group headers are processed.
-
- //currentProgramGroups is the list of records to merge next. Start by merging the programGroups that don't have a PP attribute (eg. the tree roots).
- List<HeaderRecordAndFileHeader<SAMProgramRecord>> currentProgramGroups = new LinkedList<HeaderRecordAndFileHeader<SAMProgramRecord>>();
- for(final Iterator<HeaderRecordAndFileHeader<SAMProgramRecord>> programGroupsLeftToProcessIterator = programGroupsLeftToProcess.iterator(); programGroupsLeftToProcessIterator.hasNext(); ) {
- final HeaderRecordAndFileHeader<SAMProgramRecord> pair = programGroupsLeftToProcessIterator.next();
- if(pair.getHeaderRecord().getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG) == null) {
- programGroupsLeftToProcessIterator.remove();
- currentProgramGroups.add(pair);
- }
- }
-
- //merge currentProgramGroups
- while(!currentProgramGroups.isEmpty())
- {
- final List<SAMProgramRecord> currentResult = new LinkedList<SAMProgramRecord>();
-
- hasProgramGroupCollisions |= mergeHeaderRecords(currentProgramGroups, PROGRAM_RECORD_FACTORY, idsThatAreAlreadyTaken, samProgramGroupIdTranslation, currentResult);
-
- //add currentResults to overallResults
- overallResult.addAll(currentResult);
-
- //apply the newly-computed id translations to currentProgramGroups and programGroupsLeftToProcess
- currentProgramGroups = translateIds(currentProgramGroups, samProgramGroupIdTranslation, false);
- programGroupsLeftToProcess = translateIds(programGroupsLeftToProcess, samProgramGroupIdTranslation, true);
-
- //find all records in programGroupsLeftToProcess whose ppId points to a record that was just processed (eg. a record that's in currentProgramGroups),
- //and move them to the list of programGroupsToProcessNext.
- LinkedList<HeaderRecordAndFileHeader<SAMProgramRecord>> programGroupsToProcessNext = new LinkedList<HeaderRecordAndFileHeader<SAMProgramRecord>>();
- for(final Iterator<HeaderRecordAndFileHeader<SAMProgramRecord>> programGroupsLeftToProcessIterator = programGroupsLeftToProcess.iterator(); programGroupsLeftToProcessIterator.hasNext(); ) {
- final HeaderRecordAndFileHeader<SAMProgramRecord> pairLeftToProcess = programGroupsLeftToProcessIterator.next();
- final Object ppIdOfRecordLeftToProcess = pairLeftToProcess.getHeaderRecord().getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG);
- //find what currentProgramGroups this ppId points to (NOTE: they have to come from the same file)
- for(final HeaderRecordAndFileHeader<SAMProgramRecord> justProcessedPair : currentProgramGroups) {
- String idJustProcessed = justProcessedPair.getHeaderRecord().getId();
- if(pairLeftToProcess.getFileHeader() == justProcessedPair.getFileHeader() && ppIdOfRecordLeftToProcess.equals(idJustProcessed)) {
- programGroupsLeftToProcessIterator.remove();
- programGroupsToProcessNext.add(pairLeftToProcess);
- break;
- }
- }
- }
-
- currentProgramGroups = programGroupsToProcessNext;
- }
-
- //verify that all records were processed
- if(!programGroupsLeftToProcess.isEmpty()) {
- StringBuffer errorMsg = new StringBuffer(programGroupsLeftToProcess.size() + " program groups weren't processed. Do their PP ids point to existing PGs? \n");
- for( final HeaderRecordAndFileHeader<SAMProgramRecord> pair : programGroupsLeftToProcess ) {
- SAMProgramRecord record = pair.getHeaderRecord();
- errorMsg.append("@PG ID:"+record.getProgramGroupId()+" PN:"+record.getProgramName()+" PP:"+record.getPreviousProgramGroupId() +"\n");
- }
- throw new PicardException(errorMsg.toString());
- }
-
- //sort the result list by record id
- Collections.sort(overallResult, RECORD_ID_COMPARATOR);
-
- return overallResult;
- }
-
-
- /**
- * Utility method that takes a list of program groups and remaps all their
- * ids (including ppIds if requested) using the given idTranslationTable.
- *
- * NOTE: when remapping, this method creates new SAMProgramRecords and
- * doesn't mutate any records in the programGroups list.
- *
- * @param programGroups The program groups to translate.
- * @param idTranslationTable The translation table.
- * @param translatePpIds Whether ppIds should be translated as well.
- *
- * @return The list of translated records.
- */
- private List<HeaderRecordAndFileHeader<SAMProgramRecord>> translateIds(
- List<HeaderRecordAndFileHeader<SAMProgramRecord>> programGroups,
- Map<SAMFileHeader, Map<String, String>> idTranslationTable,
- boolean translatePpIds) {
-
- //go through programGroups and translate any IDs and PPs based on the idTranslationTable.
- List<HeaderRecordAndFileHeader<SAMProgramRecord>> result = new LinkedList<HeaderRecordAndFileHeader<SAMProgramRecord>>();
- for(final HeaderRecordAndFileHeader<SAMProgramRecord> pair : programGroups ) {
- final SAMProgramRecord record = pair.getHeaderRecord();
- final String id = record.getProgramGroupId();
- final String ppId = (String) record.getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG);
-
- final SAMFileHeader header = pair.getFileHeader();
- final Map<String, String> translations = idTranslationTable.get(header);
-
- //see if one or both ids need to be translated
- SAMProgramRecord translatedRecord = null;
- if(translations != null)
- {
- String translatedId = translations.get( id );
- String translatedPpId = translatePpIds ? translations.get( ppId ) : null;
-
- boolean needToTranslateId = translatedId != null && !translatedId.equals(id);
- boolean needToTranslatePpId = translatedPpId != null && !translatedPpId.equals(ppId);
-
- if(needToTranslateId && needToTranslatePpId) {
- translatedRecord = new SAMProgramRecord(translatedId, record);
- translatedRecord.setAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG, translatedPpId);
- } else if(needToTranslateId) {
- translatedRecord = new SAMProgramRecord(translatedId, record);
- } else if(needToTranslatePpId) {
- translatedRecord = new SAMProgramRecord(id, record);
- translatedRecord.setAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG, translatedPpId);
- }
- }
-
- if(translatedRecord != null) {
- result.add(new HeaderRecordAndFileHeader<SAMProgramRecord>(translatedRecord, header));
- } else {
- result.add(pair); //keep the original record
- }
- }
-
- return result;
- }
-
-
- /**
- * Utility method for merging a List of AbstractSAMHeaderRecords. If it finds
- * records that have identical ids and attributes, it will collapse them
- * into one record. If it finds records that have identical ids but
- * non-identical attributes, this is treated as a collision. When collision happens,
- * the records' ids are remapped, and an old-id to new-id mapping is added to the idTranslationTable.
- *
- * NOTE: Non-collided records also get recorded in the idTranslationTable as
- * old-id to old-id. This way, an idTranslationTable lookup should never return null.
- *
- * @param headerRecords The header records to merge.
- * @param headerRecordFactory Constructs a specific subclass of AbstractSAMHeaderRecord.
- * @param idsThatAreAlreadyTaken If the id of a headerRecord matches an id in this set, it will be treated as a collision, and the headerRecord's id will be remapped.
- * @param idTranslationTable When records collide, their ids are remapped, and an old-id to new-id
- * mapping is added to the idTranslationTable. Non-collided records also get recorded in the idTranslationTable as
- * old-id to old-id. This way, an idTranslationTable lookup should never return null.
- *
- * @param result The list of merged header records.
- *
- * @return True if there were collisions.
- */
- private <RecordType extends AbstractSAMHeaderRecord> boolean mergeHeaderRecords(final List<HeaderRecordAndFileHeader<RecordType>> headerRecords, HeaderRecordFactory<RecordType> headerRecordFactory,
- final HashSet<String> idsThatAreAlreadyTaken, Map<SAMFileHeader, Map<String, String>> idTranslationTable, List<RecordType> result) {
-
- //The outer Map bins the header records by their ids. The nested Map further collapses
- //header records which, in addition to having the same id, also have identical attributes.
- //In other words, each key in the nested map represents one or more
- //header records which have both identical ids and identical attributes. The List of
- //SAMFileHeaders keeps track of which readers these header record(s) came from.
- final Map<String, Map<RecordType, List<SAMFileHeader>>> idToRecord =
- new HashMap<String, Map<RecordType, List<SAMFileHeader>>>();
-
- //Populate the idToRecord and seenIds data structures
- for (final HeaderRecordAndFileHeader<RecordType> pair : headerRecords) {
- final RecordType record = pair.getHeaderRecord();
- final SAMFileHeader header = pair.getFileHeader();
- final String recordId = record.getId();
- Map<RecordType, List<SAMFileHeader>> recordsWithSameId = idToRecord.get(recordId);
- if(recordsWithSameId == null) {
- recordsWithSameId = new LinkedHashMap<RecordType, List<SAMFileHeader>>();
- idToRecord.put(recordId, recordsWithSameId);
- }
-
- List<SAMFileHeader> fileHeaders = recordsWithSameId.get(record);
- if(fileHeaders == null) {
- fileHeaders = new LinkedList<SAMFileHeader>();
- recordsWithSameId.put(record, fileHeaders);
- }
-
- fileHeaders.add(header);
- }
-
- //Resolve any collisions between header records by remapping their ids.
- boolean hasCollisions = false;
- for (final Map.Entry<String, Map<RecordType, List<SAMFileHeader>>> entry : idToRecord.entrySet() )
- {
- final String recordId = entry.getKey();
- final Map<RecordType, List<SAMFileHeader>> recordsWithSameId = entry.getValue();
-
-
- for( Map.Entry<RecordType, List<SAMFileHeader>> recordWithUniqueAttr : recordsWithSameId.entrySet()) {
- final RecordType record = recordWithUniqueAttr.getKey();
- final List<SAMFileHeader> fileHeaders = recordWithUniqueAttr.getValue();
-
- String newId;
- if(!idsThatAreAlreadyTaken.contains(recordId)) {
- //don't remap 1st record. If there are more records
- //with this id, they will be remapped in the 'else'.
- newId = recordId;
- idsThatAreAlreadyTaken.add(recordId);
- } else {
- //there is more than one record with this id.
- hasCollisions = true;
-
- //find a unique newId for this record
- int idx=1;
- while(idsThatAreAlreadyTaken.contains(newId = recordId + "." + Integer.toString(idx++)))
- ;
-
- idsThatAreAlreadyTaken.add( newId );
- }
-
- for(SAMFileHeader fileHeader : fileHeaders) {
- Map<String, String> readerTranslationTable = idTranslationTable.get(fileHeader);
- if(readerTranslationTable == null) {
- readerTranslationTable = new HashMap<String, String>();
- idTranslationTable.put(fileHeader, readerTranslationTable);
- }
- readerTranslationTable.put(recordId, newId);
- }
-
- result.add( headerRecordFactory.createRecord(newId, record) );
- }
- }
-
- return hasCollisions;
- }
-
-
- /**
- * Get the sequences off the SAMFileHeader. Throws a runtime exception if the sequences
- * are different from one another.
- *
- * @param headers headers to pull sequences from
- * @return sequences from files. Each file should have the same sequence
- */
- private SAMSequenceDictionary getSequenceDictionary(final Collection headers) {
- SAMSequenceDictionary sequences = null;
- for (final SAMFileHeader header : headers) {
-
- if (sequences == null) {
- sequences = header.getSequenceDictionary();
- }
- else {
- final SAMSequenceDictionary currentSequences = header.getSequenceDictionary();
- SequenceUtil.assertSequenceDictionariesEqual(sequences, currentSequences);
- }
- }
-
- return sequences;
- }
-
- /**
- * Get the sequences from the SAMFileHeader, and merge the resulting sequence dictionaries.
- *
- * @param headers headers to pull sequences from
- * @return sequences from files. Each file should have the same sequence
- */
- private SAMSequenceDictionary mergeSequenceDictionaries(final Collection headers) {
- SAMSequenceDictionary sequences = new SAMSequenceDictionary();
- for (final SAMFileHeader header : headers) {
- final SAMSequenceDictionary currentSequences = header.getSequenceDictionary();
- sequences = mergeSequences(sequences, currentSequences);
- }
- // second pass, make a map of the original sequence id -> new sequence id
- createSequenceMapping(headers, sequences);
- return sequences;
- }
-
- /**
- * They've asked to merge the sequence headers. What we support right now is finding the sequence name superset.
- *
- * @param mergeIntoDict the result of merging so far. All SAMSequenceRecords in here have been cloned from the originals.
- * @param mergeFromDict A new sequence dictionary to merge into mergeIntoDict.
- * @return A new sequence dictionary that resulting from merging the two inputs.
- */
- private SAMSequenceDictionary mergeSequences(SAMSequenceDictionary mergeIntoDict, SAMSequenceDictionary mergeFromDict) {
-
- // a place to hold the sequences that we haven't found a home for, in the order they appear in mergeFromDict.
- LinkedList<SAMSequenceRecord> holder = new LinkedList<SAMSequenceRecord>();
-
- // Return value will be created from this.
- LinkedList<SAMSequenceRecord> resultingDict = new LinkedList<SAMSequenceRecord>();
- for (final SAMSequenceRecord sequenceRecord : mergeIntoDict.getSequences()) {
- resultingDict.add(sequenceRecord);
- }
-
- // Index into resultingDict of previous SAMSequenceRecord from mergeFromDict that already existed in mergeIntoDict.
- int prevloc = -1;
- // Previous SAMSequenceRecord from mergeFromDict that already existed in mergeIntoDict.
- SAMSequenceRecord previouslyMerged = null;
-
- for (SAMSequenceRecord sequenceRecord : mergeFromDict.getSequences()) {
- // Does it already exist in resultingDict?
- int loc = getIndexOfSequenceName(resultingDict, sequenceRecord.getSequenceName());
- if (loc == -1) {
- // If it doesn't already exist in resultingDict, save it and decide where to insert it later.
- holder.add(sequenceRecord.clone());
- } else if (prevloc > loc) {
- // If sequenceRecord already exists in resultingDict, but prior to the previous one
- // from mergeIntoDict that already existed, cannot merge.
- throw new PicardException("Cannot merge sequence dictionaries because sequence " +
- sequenceRecord.getSequenceName() + " and " + previouslyMerged.getSequenceName() +
- " are in different orders in two input sequence dictionaries.");
- } else {
- // Since sequenceRecord already exists in resultingDict, don't need to add it.
- // Add in all the sequences prior to it that have been held in holder.
- resultingDict.addAll(loc, holder);
- // Remember the index of sequenceRecord so we can check for merge incompatibility.
- prevloc = loc + holder.size();
- previouslyMerged = sequenceRecord;
- holder.clear();
- }
- }
- // Append anything left in holder.
- if (holder.size() != 0) {
- resultingDict.addAll(holder);
- }
- return new SAMSequenceDictionary(resultingDict);
- }
-
- /**
- * Find sequence in list.
- * @param list List to search for the sequence name.
- * @param sequenceName Name to search for.
- * @return Index of SAMSequenceRecord with the given name in list, or -1 if not found.
- */
- private static int getIndexOfSequenceName(final List<SAMSequenceRecord> list, final String sequenceName) {
- for (int i = 0; i < list.size(); ++i) {
- if (list.get(i).getSequenceName().equals(sequenceName)) {
- return i;
- }
- }
- return -1;
- }
-
- /**
- * create the sequence mapping. This map is used to convert the unmerged header sequence IDs to the merged
- * list of sequence IDs.
- * @param headers the collections of headers.
- * @param masterDictionary the superset dictionary we've created.
- */
- private void createSequenceMapping(final Collection headers, SAMSequenceDictionary masterDictionary) {
- LinkedList<String> resultingDictStr = new LinkedList<String>();
- for (SAMSequenceRecord r : masterDictionary.getSequences()) {
- resultingDictStr.add(r.getSequenceName());
- }
- for (final SAMFileHeader header : headers) {
- Map<Integer, Integer> seqMap = new HashMap<Integer, Integer>();
- SAMSequenceDictionary dict = header.getSequenceDictionary();
- for (SAMSequenceRecord rec : dict.getSequences()) {
- seqMap.put(rec.getSequenceIndex(), resultingDictStr.indexOf(rec.getSequenceName()));
- }
- this.samSeqDictionaryIdTranslationViaHeader.put(header, seqMap);
- }
- }
-
-
-
- /**
- * Returns the read group id that should be used for the input read and RG id.
- *
- * @deprecated replaced by getReadGroupId(SAMFileHeader, String)
- * */
- public String getReadGroupId(final SAMFileReader reader, final String originalReadGroupId) {
- return getReadGroupId(reader.getFileHeader(), originalReadGroupId);
- }
-
- /** Returns the read group id that should be used for the input read and RG id. */
- public String getReadGroupId(final SAMFileHeader header, final String originalReadGroupId) {
- return this.samReadGroupIdTranslation.get(header).get(originalReadGroupId);
- }
-
- /**
- * @param reader one of the input files
- * @param originalProgramGroupId a program group ID from the above input file
- * @return new ID from the merged list of program groups in the output file
- * @deprecated replaced by getProgramGroupId(SAMFileHeader, String)
- */
- public String getProgramGroupId(final SAMFileReader reader, final String originalProgramGroupId) {
- return getProgramGroupId(reader.getFileHeader(), originalProgramGroupId);
- }
-
- /**
- * @param header one of the input headers
- * @param originalProgramGroupId a program group ID from the above input file
- * @return new ID from the merged list of program groups in the output file
- */
- public String getProgramGroupId(final SAMFileHeader header, final String originalProgramGroupId) {
- return this.samProgramGroupIdTranslation.get(header).get(originalProgramGroupId);
- }
-
- /** Returns true if there are read group duplicates within the merged headers. */
- public boolean hasReadGroupCollisions() {
- return this.hasReadGroupCollisions;
- }
-
- /** Returns true if there are program group duplicates within the merged headers. */
- public boolean hasProgramGroupCollisions() {
- return hasProgramGroupCollisions;
- }
-
- /** @return if we've merged the sequence dictionaries, return true */
- public boolean hasMergedSequenceDictionary() {
- return hasMergedSequenceDictionary;
- }
-
- /** Returns the merged header that should be written to any output merged file. */
- public SAMFileHeader getMergedHeader() {
- return this.mergedHeader;
- }
-
- /** Returns the collection of readers that this header merger is working with. May return null.
- * @deprecated replaced by getHeaders()
- */
- public Collection<SAMFileReader> getReaders() {
- return this.readers;
- }
-
- /** Returns the collection of headers that this header merger is working with.
- */
- public Collection<SAMFileHeader> getHeaders() {
- return this.headers;
- }
-
- /**
- * Tells whether this header merger contains a given SAM file header. Note that header presence
- * is confirmed / blocked by == equality, rather than actually testing SAMFileHeader.equals(), for
- * reasons of performance.
- * @param header header to check for.
- * @return True if the header exists in this HeaderMerger. False otherwise.
- */
- boolean containsHeader(SAMFileHeader header) {
- for(SAMFileHeader headerMergerHeader: headers) {
- if(headerMergerHeader == header)
- return true;
- }
- return false;
- }
-
- /**
- * returns the new mapping for a specified reader, given its old sequence index
- * @param reader the reader
- * @param oldReferenceSequenceIndex the old sequence (also called reference) index
- * @return the new index value
- * @deprecated replaced by getMergedSequenceIndex(SAMFileHeader, Integer)
- */
- public Integer getMergedSequenceIndex(SAMFileReader reader, Integer oldReferenceSequenceIndex) {
- return this.getMergedSequenceIndex(reader.getFileHeader(), oldReferenceSequenceIndex);
- }
-
- /**
- * Another mechanism for getting the new sequence index, for situations in which the reader is not available.
- * Note that if the SAMRecord has already had its header replaced with the merged header, this won't work.
- * @param header The original header for the input record in question.
- * @param oldReferenceSequenceIndex The original sequence index.
- * @return the new index value that is compatible with the merged sequence index.
- */
- public Integer getMergedSequenceIndex(final SAMFileHeader header, Integer oldReferenceSequenceIndex) {
- final Map<Integer, Integer> mapping = this.samSeqDictionaryIdTranslationViaHeader.get(header);
- if (mapping == null) {
- throw new PicardException("No sequence dictionary mapping available for header: " + header);
- }
-
- final Integer newIndex = mapping.get(oldReferenceSequenceIndex);
- if (newIndex == null) {
- throw new PicardException("No mapping for reference index " + oldReferenceSequenceIndex + " from header: " + header);
- }
-
- return newIndex;
- }
-
-
- /**
- * Implementations of this interface are used by mergeHeaderRecords(..) to instantiate
- * specific subclasses of AbstractSAMHeaderRecord.
- */
- private static interface HeaderRecordFactory<RecordType extends AbstractSAMHeaderRecord> {
-
- /**
- * Constructs a new instance of RecordType.
- * @param id The id of the new record.
- * @param srcRecord Except for the id, the new record will be a copy of this source record.
- */
- public RecordType createRecord(final String id, RecordType srcRecord);
- }
-
- /**
- * Struct that groups together a subclass of AbstractSAMHeaderRecord with the
- * SAMFileHeader that it came from.
- */
- private static class HeaderRecordAndFileHeader<RecordType extends AbstractSAMHeaderRecord> {
- private RecordType headerRecord;
- private SAMFileHeader samFileHeader;
-
- public HeaderRecordAndFileHeader(RecordType headerRecord, SAMFileHeader samFileHeader) {
- this.headerRecord = headerRecord;
- this.samFileHeader = samFileHeader;
- }
-
- public RecordType getHeaderRecord() {
- return headerRecord;
- }
- public SAMFileHeader getFileHeader() {
- return samFileHeader;
- }
- }
-}
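
The collision handling documented in mergeHeaderRecords() above boils down to: the first record with a given ID keeps it, later attribute-distinct records are renamed to `id.1`, `id.2`, and so on, and every input gets an old-to-new translation entry (identity for non-collisions, so lookups never return null). A toy sketch of just that renaming rule — illustrative code, not the Picard API:

```java
import java.util.*;

// Toy version of the renaming rule used by mergeHeaderRecords().
class IdRemapper {
    private final Set<String> taken = new HashSet<>();

    /** Picks a unique id for a record and logs old->new in the per-input table. */
    String assign(String id, Map<String, String> translationForInput) {
        String newId = id;
        if (!taken.add(id)) {                   // id already in use: collision
            int idx = 1;
            do { newId = id + "." + idx++; } while (!taken.add(newId));
        }
        translationForInput.put(id, newId);     // identity mapping when no collision
        return newId;
    }
}
```

Two inputs that both declare read group `rg1` end up with `rg1` and `rg1.1`, and the second input's table maps `rg1 -> rg1.1` — the lookup getReadGroupId() performs when rewriting RG tags during the merge.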
diff --git a/public/java/src/net/sf/samtools/BAMFileReader.java b/public/java/src/net/sf/samtools/BAMFileReader.java
deleted file mode 100644
index 5005b6265f..0000000000
--- a/public/java/src/net/sf/samtools/BAMFileReader.java
+++ /dev/null
@@ -1,762 +0,0 @@
-/*
- * Copyright (c) 2011, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-package net.sf.samtools;
-
-
-import net.sf.samtools.util.*;
-import net.sf.samtools.SAMFileReader.ValidationStringency;
-
-import java.io.*;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.NoSuchElementException;
-
-/**
- * Internal class for reading and querying BAM files.
- */
-class BAMFileReader extends SAMFileReader.ReaderImplementation {
- // True if reading from a File rather than an InputStream
- private boolean mIsSeekable = false;
-
- // For converting bytes into other primitive types
- private BinaryCodec mStream = null;
-
- // Underlying compressed data stream.
- private final BAMInputStream mInputStream;
- private SAMFileHeader mFileHeader = null;
-
- // Populated if the file is seekable and an index exists
- private File mIndexFile;
- private BAMIndex mIndex = null;
- private long mFirstRecordPointer = 0;
- private CloseableIterator<SAMRecord> mCurrentIterator = null;
-
- // If true, all SAMRecords are fully decoded as they are read.
- private final boolean eagerDecode;
-
- // For error-checking.
- private ValidationStringency mValidationStringency;
-
- // For creating BAMRecords
- private SAMRecordFactory samRecordFactory;
-
- /**
- * Use the caching index reader implementation rather than the disk-hit-per-file model.
- */
- private boolean mEnableIndexCaching = false;
-
- /**
- * Use the traditional memory-mapped implementation for BAM file indexes rather than regular I/O.
- */
- private boolean mEnableIndexMemoryMapping = true;
-
- /**
- * Add information about the origin (reader and position) to SAM records.
- */
- private SAMFileReader mFileReader = null;
-
- /**
- * Prepare to read BAM from a stream (not seekable)
- * @param stream source of bytes.
- * @param eagerDecode if true, decode all BAM fields while reading rather than lazily.
- * @param validationStringency Controls how to handle invalid reads or header lines.
- */
- BAMFileReader(final InputStream stream,
- final File indexFile,
- final boolean eagerDecode,
- final ValidationStringency validationStringency,
- final SAMRecordFactory factory)
- throws IOException {
- mIndexFile = indexFile;
- mIsSeekable = false;
- mInputStream = stream instanceof BAMInputStream ? (BAMInputStream)stream : new BlockCompressedInputStream(stream);
- mStream = new BinaryCodec(new DataInputStream((InputStream)mInputStream));
- this.eagerDecode = eagerDecode;
- this.mValidationStringency = validationStringency;
- this.samRecordFactory = factory;
- readHeader(null);
- }
-
- /**
- * Prepare to read BAM from a file (seekable)
- * @param file source of bytes.
- * @param eagerDecode if true, decode all BAM fields while reading rather than lazily.
- * @param validationStringency Controls how to handle invalid reads or header lines.
- */
- BAMFileReader(final File file,
- final File indexFile,
- final boolean eagerDecode,
- final ValidationStringency validationStringency,
- final SAMRecordFactory factory)
- throws IOException {
- this(new BlockCompressedInputStream(file), indexFile!=null ? indexFile : findIndexFile(file), eagerDecode, file.getAbsolutePath(), validationStringency, factory);
- if (mIndexFile != null && mIndexFile.lastModified() < file.lastModified()) {
- System.err.println("WARNING: BAM index file " + mIndexFile.getAbsolutePath() +
- " is older than BAM " + file.getAbsolutePath());
- }
- }
-
- BAMFileReader(final SeekableStream strm,
- final File indexFile,
- final boolean eagerDecode,
- final ValidationStringency validationStringency,
- final SAMRecordFactory factory)
- throws IOException {
- this(strm instanceof BAMInputStream ? (BAMInputStream)strm : new BlockCompressedInputStream(strm),
- indexFile,
- eagerDecode,
- strm.getSource(),
- validationStringency,
- factory);
- }
-
- private BAMFileReader(final BAMInputStream inputStream,
- final File indexFile,
- final boolean eagerDecode,
- final String source,
- final ValidationStringency validationStringency,
- final SAMRecordFactory factory)
- throws IOException {
- mIndexFile = indexFile;
- mIsSeekable = true;
- mInputStream = inputStream;
- mStream = new BinaryCodec(new DataInputStream((InputStream)inputStream));
- this.eagerDecode = eagerDecode;
- this.mValidationStringency = validationStringency;
- this.samRecordFactory = factory;
- readHeader(source);
- mFirstRecordPointer = inputStream.getFilePointer();
- }
-
- /**
- * If true, writes the source of every read into the source SAMRecords.
- * @param enabled true to write source information into each SAMRecord.
- */
- void enableFileSource(final SAMFileReader reader, final boolean enabled) {
- this.mFileReader = enabled ? reader : null;
- }
-
- /**
- * If true, uses the caching version of the index reader.
- * @param enabled true to use the caching version of the index reader.
- */
- public void enableIndexCaching(final boolean enabled) {
- if(mIndex != null)
- throw new SAMException("Unable to turn on index caching; index file has already been loaded.");
- this.mEnableIndexCaching = enabled;
- }
-
- /**
- * If false, disable the use of memory mapping for accessing index files (default behavior is to use memory mapping).
- * This is slower but more scalable when accessing large numbers of BAM files sequentially.
- * @param enabled True to use memory mapping, false to use regular I/O.
- */
- public void enableIndexMemoryMapping(final boolean enabled) {
- if (mIndex != null) {
- throw new SAMException("Unable to change index memory mapping; index file has already been loaded.");
- }
- this.mEnableIndexMemoryMapping = enabled;
- }
-
- @Override void enableCrcChecking(final boolean enabled) {
- this.mInputStream.setCheckCrcs(enabled);
- }
-
- @Override void setSAMRecordFactory(final SAMRecordFactory factory) { this.samRecordFactory = factory; }
-
- /**
- * @return true if this is a BAM file, and has an index
- */
- public boolean hasIndex() {
- return (mIndexFile != null);
- }
-
- /**
- * Retrieves the index for the given file type. Ensure that the index is of the specified type.
- * @return An index of the given type.
- */
- public BAMIndex getIndex() {
- if(mIndexFile == null)
- throw new SAMException("No index is available for this BAM file.");
- if(mIndex == null)
- mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping)
- : new DiskBasedBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping);
- return mIndex;
- }
-
- void close() {
- if (mStream != null) {
- mStream.close();
- }
- if (mIndex != null) {
- mIndex.close();
- }
- mStream = null;
- mFileHeader = null;
- mIndex = null;
- }
-
- SAMFileHeader getFileHeader() {
- return mFileHeader;
- }
-
- /**
- * Set error-checking level for subsequent SAMRecord reads.
- */
- void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) {
- this.mValidationStringency = validationStringency;
- }
-
- SAMFileReader.ValidationStringency getValidationStringency() {
- return this.mValidationStringency;
- }
-
- /**
- * Prepare to iterate through the SAMRecords in file order.
- * Only a single iterator on a BAM file can be extant at a time. If getIterator() or a query method has been called once,
- * that iterator must be closed before getIterator() can be called again.
- * A somewhat peculiar aspect of this method is that if the file is not seekable, a second call to
- * getIterator() begins its iteration where the last one left off. That is the best that can be
- * done in that situation.
- */
- CloseableIterator<SAMRecord> getIterator() {
- if (mStream == null) {
- throw new IllegalStateException("File reader is closed");
- }
- if (mCurrentIterator != null) {
- throw new IllegalStateException("Iteration in progress");
- }
- if (mIsSeekable) {
- try {
- mInputStream.seek(mFirstRecordPointer);
- } catch (IOException exc) {
- throw new RuntimeException(exc.getMessage(), exc);
- }
- }
- mCurrentIterator = new BAMFileIterator();
- return mCurrentIterator;
- }
-
- @Override
- CloseableIterator<SAMRecord> getIterator(final SAMFileSpan chunks) {
- if (mStream == null) {
- throw new IllegalStateException("File reader is closed");
- }
- if (mCurrentIterator != null) {
- throw new IllegalStateException("Iteration in progress");
- }
- if (!(chunks instanceof BAMFileSpan)) {
- throw new IllegalStateException("BAMFileReader cannot handle this type of file span.");
- }
-
- // Create an iterator over the given chunk boundaries.
- mCurrentIterator = new BAMFileIndexIterator(((BAMFileSpan)chunks).toCoordinateArray());
- return mCurrentIterator;
- }
-
- /**
- * Gets an unbounded pointer to the first record in the BAM file. Because the reader doesn't necessarily know
- * when the file ends, the rightmost bound of the file pointer will not end exactly where the file ends. However,
- * the rightmost bound is guaranteed to be after the last read in the file.
- * @return An unbounded pointer to the first record in the BAM file.
- */
- @Override
- SAMFileSpan getFilePointerSpanningReads() {
- return new BAMFileSpan(new Chunk(mFirstRecordPointer,Long.MAX_VALUE));
- }
-
- /**
- * Prepare to iterate through the SAMRecords that match the given interval.
- * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
- * before calling any of the methods that return an iterator.
- *
- * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
- * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
- * matches the specified interval.
- *
- * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
- * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
- *
- * @param sequence Reference sequence sought.
- * @param start Desired SAMRecords must overlap or be contained in the interval specified by start and end.
- * A value of zero implies the start of the reference sequence.
- * @param end A value of zero implies the end of the reference sequence.
- * @param contained If true, the alignments for the SAMRecords must be completely contained in the interval
- * specified by start and end. If false, the SAMRecords need only overlap the interval.
- * @return Iterator for the matching SAMRecords
- */
- CloseableIterator<SAMRecord> query(final String sequence, final int start, final int end, final boolean contained) {
- if (mStream == null) {
- throw new IllegalStateException("File reader is closed");
- }
- if (mCurrentIterator != null) {
- throw new IllegalStateException("Iteration in progress");
- }
- if (!mIsSeekable) {
- throw new UnsupportedOperationException("Cannot query stream-based BAM file");
- }
- mCurrentIterator = createIndexIterator(sequence, start, end, contained? QueryType.CONTAINED: QueryType.OVERLAPPING);
- return mCurrentIterator;
- }
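The contained/overlapping semantics described above reduce to two interval tests, which the filtering iterator later in this file applies record by record. A standalone sketch (method and parameter names are illustrative, not part of the reader):

    // contained: the alignment must lie entirely within [regionStart, regionEnd]
    static boolean isContained(final int alignmentStart, final int alignmentEnd,
                               final int regionStart, final int regionEnd) {
        return alignmentStart >= regionStart && alignmentEnd <= regionEnd;
    }

    // overlapping: the alignment and the region must share at least one position
    static boolean isOverlapping(final int alignmentStart, final int alignmentEnd,
                                 final int regionStart, final int regionEnd) {
        return alignmentEnd >= regionStart && alignmentStart <= regionEnd;
    }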
-
- /**
- * Prepare to iterate through the SAMRecords with the given alignment start.
- * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
- * before calling any of the methods that return an iterator.
- *
- * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
- * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
- * matches the specified interval.
- *
- * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
- * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
- *
- * @param sequence Reference sequence sought.
- * @param start Alignment start sought.
- * @return Iterator for the matching SAMRecords.
- */
- CloseableIterator<SAMRecord> queryAlignmentStart(final String sequence, final int start) {
- if (mStream == null) {
- throw new IllegalStateException("File reader is closed");
- }
- if (mCurrentIterator != null) {
- throw new IllegalStateException("Iteration in progress");
- }
- if (!mIsSeekable) {
- throw new UnsupportedOperationException("Cannot query stream-based BAM file");
- }
- mCurrentIterator = createIndexIterator(sequence, start, -1, QueryType.STARTING_AT);
- return mCurrentIterator;
- }
-
- public CloseableIterator<SAMRecord> queryUnmapped() {
- if (mStream == null) {
- throw new IllegalStateException("File reader is closed");
- }
- if (mCurrentIterator != null) {
- throw new IllegalStateException("Iteration in progress");
- }
- if (!mIsSeekable) {
- throw new UnsupportedOperationException("Cannot query stream-based BAM file");
- }
- try {
- final long startOfLastLinearBin = getIndex().getStartOfLastLinearBin();
- if (startOfLastLinearBin != -1) {
- mInputStream.seek(startOfLastLinearBin);
- } else {
- // No mapped reads in file, just start at the first read in file.
- mInputStream.seek(mFirstRecordPointer);
- }
- mCurrentIterator = new BAMFileIndexUnmappedIterator();
- return mCurrentIterator;
- } catch (IOException e) {
- throw new RuntimeException("IOException seeking to unmapped reads", e);
- }
- }
-
- /**
- * Reads the header from the file or stream
- * @param source Note that this is used only for reporting errors.
- */
- private void readHeader(final String source)
- throws IOException {
-
- final byte[] buffer = new byte[4];
- mStream.readBytes(buffer);
- if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) {
- throw new IOException("Invalid BAM file header");
- }
-
- final int headerTextLength = mStream.readInt();
- final String textHeader = mStream.readString(headerTextLength);
- final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec();
- headerCodec.setValidationStringency(mValidationStringency);
- mFileHeader = headerCodec.decode(new StringLineReader(textHeader),
- source);
-
- final int sequenceCount = mStream.readInt();
- if (mFileHeader.getSequenceDictionary().size() > 0) {
- // It is allowed to have binary sequences but no text sequences, so only validate if both are present
- if (sequenceCount != mFileHeader.getSequenceDictionary().size()) {
- throw new SAMFormatException("Number of sequences in text header (" +
- mFileHeader.getSequenceDictionary().size() +
- ") != number of sequences in binary header (" + sequenceCount + ") for file " + source);
- }
- for (int i = 0; i < sequenceCount; i++) {
- final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(source);
- final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i);
- if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) {
- throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " +
- source);
- }
- if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) {
- throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " +
- source);
- }
- }
- } else {
- // If only binary sequences are present, copy them into mFileHeader
- final List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>(sequenceCount);
- for (int i = 0; i < sequenceCount; i++) {
- sequences.add(readSequenceRecord(source));
- }
- mFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences));
- }
- }
-
- /**
- * Reads a single binary sequence record from the file or stream
- * @param source Note that this is used only for reporting errors.
- */
- private SAMSequenceRecord readSequenceRecord(final String source) {
- final int nameLength = mStream.readInt();
- if (nameLength <= 1) {
- throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + source);
- }
- final String sequenceName = mStream.readString(nameLength - 1);
- // Skip the null terminator
- mStream.readByte();
- final int sequenceLength = mStream.readInt();
- return new SAMSequenceRecord(SAMSequenceRecord.truncateSequenceName(sequenceName), sequenceLength);
- }
-
- /**
- * Iterator for non-indexed sequential iteration through all SAMRecords in file.
- * Starting point of iteration is wherever current file position is when the iterator is constructed.
- */
- private class BAMFileIterator implements CloseableIterator<SAMRecord> {
- private SAMRecord mNextRecord = null;
- private final BAMRecordCodec bamRecordCodec;
- private long samRecordIndex = 0; // Our position in the file, counted in records
-
- BAMFileIterator() {
- this(true);
- }
-
- /**
- * @param advance Trick to enable subclass to do more setup before advancing
- */
- BAMFileIterator(final boolean advance) {
- this.bamRecordCodec = new BAMRecordCodec(getFileHeader(), samRecordFactory);
- this.bamRecordCodec.setInputStream(BAMFileReader.this.mStream.getInputStream());
-
- if (advance) {
- advance();
- }
- }
-
- public void close() {
- if (mCurrentIterator != null && this != mCurrentIterator) {
- throw new IllegalStateException("Attempt to close non-current iterator");
- }
- mCurrentIterator = null;
- }
-
- public boolean hasNext() {
- return (mNextRecord != null);
- }
-
- public SAMRecord next() {
- final SAMRecord result = mNextRecord;
- advance();
- return result;
- }
-
- public void remove() {
- throw new UnsupportedOperationException("Not supported: remove");
- }
-
- void advance() {
- try {
- mNextRecord = getNextRecord();
-
- if (mNextRecord != null) {
- ++this.samRecordIndex;
- // Because some decoding is done lazily, the record needs to remember the validation stringency.
- mNextRecord.setValidationStringency(mValidationStringency);
-
- if (mValidationStringency != ValidationStringency.SILENT) {
- final List<SAMValidationError> validationErrors = mNextRecord.isValid();
- SAMUtils.processValidationErrors(validationErrors,
- this.samRecordIndex, BAMFileReader.this.getValidationStringency());
- }
- }
- if (eagerDecode && mNextRecord != null) {
- mNextRecord.eagerDecode();
- }
- } catch (IOException exc) {
- throw new RuntimeException(exc.getMessage(), exc);
- }
- }
-
- /**
- * Read the next record from the input stream.
- */
- SAMRecord getNextRecord() throws IOException {
- final long startCoordinate = mInputStream.getFilePointer();
- final SAMRecord next = bamRecordCodec.decode();
- final long stopCoordinate = mInputStream.getFilePointer();
-
- if(mFileReader != null && next != null)
- next.setFileSource(new SAMFileSource(mFileReader,new BAMFileSpan(new Chunk(startCoordinate,stopCoordinate))));
-
- return next;
- }
-
- /**
- * @return The record that will be returned by the next call to next()
- */
- protected SAMRecord peek() {
- return mNextRecord;
- }
- }
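The single-iterator contract documented above implies the usual acquire/process/close discipline. A minimal usage sketch at the public API level, assuming 'reader' is an open SAMFileReader:

    final CloseableIterator<SAMRecord> iterator = reader.iterator();
    try {
        while (iterator.hasNext()) {
            final SAMRecord record = iterator.next();
            // ... process the record ...
        }
    } finally {
        iterator.close(); // required before the reader will hand out another iterator
    }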
-
- /**
- * Prepare to iterate through SAMRecords matching the target interval.
- * @param sequence Desired reference sequence.
- * @param start 1-based start of target interval, inclusive.
- * @param end 1-based end of target interval, inclusive.
- * @param queryType contained, overlapping, or starting-at query.
- */
- private CloseableIterator<SAMRecord> createIndexIterator(final String sequence,
- final int start,
- final int end,
- final QueryType queryType) {
- long[] filePointers = null;
-
- // Hit the index to determine the chunk boundaries for the required data.
- final SAMFileHeader fileHeader = getFileHeader();
- final int referenceIndex = fileHeader.getSequenceIndex(sequence);
- if (referenceIndex != -1) {
- final BAMIndex fileIndex = getIndex();
- final BAMFileSpan fileSpan = fileIndex.getSpanOverlapping(referenceIndex, start, end);
- filePointers = fileSpan != null ? fileSpan.toCoordinateArray() : null;
- }
-
- // Create an iterator over the above chunk boundaries.
- final BAMFileIndexIterator iterator = new BAMFileIndexIterator(filePointers);
-
- // Add some preprocessing filters for edge-case reads that don't fit into this
- // query type.
- return new BAMQueryFilteringIterator(iterator,sequence,start,end,queryType);
- }
-
- enum QueryType {CONTAINED, OVERLAPPING, STARTING_AT}
-
- /**
- * Look for BAM index file according to standard naming convention.
- *
- * @param dataFile BAM file name.
- * @return Index file name, or null if not found.
- */
- private static File findIndexFile(final File dataFile) {
- // If input is foo.bam, look for foo.bai
- final String bamExtension = ".bam";
- File indexFile;
- final String fileName = dataFile.getName();
- if (fileName.endsWith(bamExtension)) {
- final String bai = fileName.substring(0, fileName.length() - bamExtension.length()) + BAMIndex.BAMIndexSuffix;
- indexFile = new File(dataFile.getParent(), bai);
- if (indexFile.exists()) {
- return indexFile;
- }
- }
-
- // If foo.bai doesn't exist look for foo.bam.bai
- indexFile = new File(dataFile.getParent(), dataFile.getName() + ".bai");
- if (indexFile.exists()) {
- return indexFile;
- } else {
- return null;
- }
- }
-
- private class BAMFileIndexIterator extends BAMFileIterator {
-
- private long[] mFilePointers = null;
- private int mFilePointerIndex = 0;
- private long mFilePointerLimit = -1;
-
- /**
- * Prepare to iterate through SAMRecords stored in the specified compressed blocks at the given offset.
- * @param filePointers the block / offset combination, stored in chunk format.
- */
- BAMFileIndexIterator(final long[] filePointers) {
- super(false); // delay advance() until after construction
- mFilePointers = filePointers;
- advance();
- }
-
- SAMRecord getNextRecord()
- throws IOException {
- // Advance to next file block if necessary
- while (mInputStream.getFilePointer() >= mFilePointerLimit) {
- if (mFilePointers == null ||
- mFilePointerIndex >= mFilePointers.length) {
- return null;
- }
- final long startOffset = mFilePointers[mFilePointerIndex++];
- final long endOffset = mFilePointers[mFilePointerIndex++];
- mInputStream.seek(startOffset);
- mFilePointerLimit = endOffset;
- }
- // Pull next record from stream
- return super.getNextRecord();
- }
- }
-
- /**
- * A decorating iterator that filters out records that are outside the bounds of the
- * given query parameters.
- */
- private class BAMQueryFilteringIterator implements CloseableIterator<SAMRecord> {
- /**
- * The wrapped iterator.
- */
- private final CloseableIterator<SAMRecord> wrappedIterator;
-
- /**
- * The next record to be returned. Will be null if no such record exists.
- */
- private SAMRecord mNextRecord;
-
- private final int mReferenceIndex;
- private final int mRegionStart;
- private final int mRegionEnd;
- private final QueryType mQueryType;
-
- public BAMQueryFilteringIterator(final CloseableIterator<SAMRecord> iterator, final String sequence, final int start, final int end, final QueryType queryType) {
- this.wrappedIterator = iterator;
- final SAMFileHeader fileHeader = getFileHeader();
- mReferenceIndex = fileHeader.getSequenceIndex(sequence);
- mRegionStart = start;
- if (queryType == QueryType.STARTING_AT) {
- mRegionEnd = mRegionStart;
- } else {
- mRegionEnd = (end <= 0) ? Integer.MAX_VALUE : end;
- }
- mQueryType = queryType;
- mNextRecord = advance();
- }
-
- /**
- * Returns true if a next element exists; false otherwise.
- */
- public boolean hasNext() {
- return mNextRecord != null;
- }
-
- /**
- * Gets the next record from the given iterator.
- * @return The next SAM record in the iterator.
- */
- public SAMRecord next() {
- if(!hasNext())
- throw new NoSuchElementException("BAMQueryFilteringIterator: no next element available");
- final SAMRecord currentRead = mNextRecord;
- mNextRecord = advance();
- return currentRead;
- }
-
- /**
- * Closes down the existing iterator.
- */
- public void close() {
- if (this != mCurrentIterator) {
- throw new IllegalStateException("Attempt to close non-current iterator");
- }
- mCurrentIterator = null;
- }
-
- /**
- * @throws UnsupportedOperationException always.
- */
- public void remove() {
- throw new UnsupportedOperationException("Not supported: remove");
- }
-
- SAMRecord advance() {
- while (true) {
- // Pull next record from stream
- if(!wrappedIterator.hasNext())
- return null;
-
- final SAMRecord record = wrappedIterator.next();
- // If beyond the end of this reference sequence, end iteration
- final int referenceIndex = record.getReferenceIndex();
- if (referenceIndex != mReferenceIndex) {
- if (referenceIndex < 0 ||
- referenceIndex > mReferenceIndex) {
- return null;
- }
- // If before this reference sequence, continue
- continue;
- }
- if (mRegionStart == 0 && mRegionEnd == Integer.MAX_VALUE) {
- // Quick exit to avoid expensive alignment end calculation
- return record;
- }
- final int alignmentStart = record.getAlignmentStart();
- // If read is unmapped but has a coordinate, return it if the coordinate is within
- // the query region, regardless of whether the mapped mate will be returned.
- final int alignmentEnd;
- if (mQueryType == QueryType.STARTING_AT) {
- alignmentEnd = -1;
- } else {
- alignmentEnd = (record.getAlignmentEnd() != SAMRecord.NO_ALIGNMENT_START?
- record.getAlignmentEnd(): alignmentStart);
- }
-
- if (alignmentStart > mRegionEnd) {
- // If scanned beyond target region, end iteration
- return null;
- }
- // Filter for overlap with region
- if (mQueryType == QueryType.CONTAINED) {
- if (alignmentStart >= mRegionStart && alignmentEnd <= mRegionEnd) {
- return record;
- }
- } else if (mQueryType == QueryType.OVERLAPPING) {
- if (alignmentEnd >= mRegionStart && alignmentStart <= mRegionEnd) {
- return record;
- }
- } else {
- if (alignmentStart == mRegionStart) {
- return record;
- }
- }
- }
- }
- }
-
- private class BAMFileIndexUnmappedIterator extends BAMFileIterator {
- private BAMFileIndexUnmappedIterator() {
- while (this.hasNext() && peek().getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
- advance();
- }
- }
- }
-
-}
diff --git a/public/java/src/net/sf/samtools/GATKChunk.java b/public/java/src/net/sf/samtools/GATKChunk.java
index c48567f6e2..42f0871316 100644
--- a/public/java/src/net/sf/samtools/GATKChunk.java
+++ b/public/java/src/net/sf/samtools/GATKChunk.java
@@ -40,6 +40,10 @@ public GATKChunk(final long start, final long stop) {
super(start,stop);
}
+ public GATKChunk(final long blockStart, final int blockOffsetStart, final long blockEnd, final int blockOffsetEnd) {
+ super(blockStart << 16 | blockOffsetStart,blockEnd << 16 | blockOffsetEnd);
+ }
+
public GATKChunk(final Chunk chunk) {
super(chunk.getChunkStart(),chunk.getChunkEnd());
}
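The new four-argument constructor packs its arguments the way BGZF virtual file pointers are laid out: the upper 48 bits hold the byte address of a compressed block and the lower 16 bits hold the offset into that block once uncompressed. A sketch of the round trip, with illustrative names:

    static long makeVirtualPointer(final long blockAddress, final int blockOffset) {
        return blockAddress << 16 | blockOffset; // mirrors the constructor above
    }
    static long blockAddressOf(final long pointer) { return pointer >>> 16; }
    static int  blockOffsetOf(final long pointer)  { return (int) (pointer & 0xFFFF); }

Because the two fields share one long, ordinary arithmetic on these pointers is meaningless; they are opaque tokens for seek().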
diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/AutoIndexGatherFunction.scala b/public/java/src/net/sf/samtools/PicardNamespaceUtils.java
similarity index 65%
rename from public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/AutoIndexGatherFunction.scala
rename to public/java/src/net/sf/samtools/PicardNamespaceUtils.java
index 7fb96e0741..b645f8fdce 100644
--- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/AutoIndexGatherFunction.scala
+++ b/public/java/src/net/sf/samtools/PicardNamespaceUtils.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011, The Broad Institute
+ * Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
@@ -22,15 +22,18 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-package org.broadinstitute.sting.queue.extensions.gatk
-
-import org.broadinstitute.sting.queue.function.scattergather.GatherFunction
-import org.broadinstitute.sting.queue.function.InProcessFunction
+package net.sf.samtools;
/**
- * A no-op for index files that were automatically generated during the gather step.
- * TODO: Allow graph to know that this isn't needed, and/or that one gather job can actually gather N-outputs, and/or look more into generic source->sinks.
+ * Utils that insist on being in the same package as Picard.
*/
-class AutoIndexGatherFunction extends InProcessFunction with GatherFunction {
- def run() {}
+public class PicardNamespaceUtils {
+ /**
+ * Private constructor only. Do not instantiate.
+ */
+ private PicardNamespaceUtils() {}
+
+ public static void setFileSource(final SAMRecord read, final SAMFileSource fileSource) {
+ read.setFileSource(fileSource);
+ }
}
diff --git a/public/java/src/net/sf/samtools/util/BAMInputStream.java b/public/java/src/net/sf/samtools/util/BAMInputStream.java
deleted file mode 100644
index d825c23d51..0000000000
--- a/public/java/src/net/sf/samtools/util/BAMInputStream.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2011, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package net.sf.samtools.util;
-
-import java.io.IOException;
-
-/**
- * An input stream formulated for use reading BAM files. Supports seeking to a BGZF virtual file pointer.
- */
-public interface BAMInputStream {
- /**
- * Seek to the given position in the file. Note that pos is a special virtual file pointer,
- * not an actual byte offset.
- *
- * @param pos virtual file pointer
- */
- public void seek(final long pos) throws IOException;
-
- /**
- * @return virtual file pointer that can be passed to seek() to return to the current position. This is
- * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
- * the two.
- */
- public long getFilePointer();
-
- /**
- * Determines whether or not the inflater will re-calculate the CRC on the decompressed data
- * and check it against the value stored in the GZIP header. CRC checking is an expensive
- * operation and should be used accordingly.
- */
- public void setCheckCrcs(final boolean check);
-
- public int read() throws java.io.IOException;
-
- public int read(byte[] bytes) throws java.io.IOException;
-
- public int read(byte[] bytes, int i, int i1) throws java.io.IOException;
-
- public long skip(long l) throws java.io.IOException;
-
- public int available() throws java.io.IOException;
-
- public void close() throws java.io.IOException;
-
- public void mark(int i);
-
- public void reset() throws java.io.IOException;
-
- public boolean markSupported();
-}
diff --git a/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java b/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java
deleted file mode 100755
index fae2fc89b4..0000000000
--- a/public/java/src/net/sf/samtools/util/BlockCompressedInputStream.java
+++ /dev/null
@@ -1,483 +0,0 @@
-/*
- * The MIT License
- *
- * Copyright (c) 2009 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-package net.sf.samtools.util;
-
-
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.RandomAccessFile;
-import java.net.URL;
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-import java.util.Arrays;
-
-import net.sf.samtools.FileTruncatedException;
-
-/*
- * Utility class for reading BGZF block compressed files. The caller can treat this file like any other InputStream.
- * It probably is not necessary to wrap this stream in a buffering stream, because there is internal buffering.
- * The advantage of BGZF over conventional GZip format is that BGZF allows for seeking without having to read the
- * entire file up to the location being sought. Note that seeking is only possible if the ctor(File) is used.
- *
- * c.f. http://samtools.sourceforge.net/SAM1.pdf for details of BGZF format
- */
-public class BlockCompressedInputStream extends InputStream implements BAMInputStream {
- private InputStream mStream = null;
- private SeekableStream mFile = null;
- private byte[] mFileBuffer = null;
- private byte[] mCurrentBlock = null;
- private int mCurrentOffset = 0;
- private long mBlockAddress = 0;
- private int mLastBlockLength = 0;
- private final BlockGunzipper blockGunzipper = new BlockGunzipper();
-
-
- /**
- * Note that seek() is not supported if this ctor is used.
- */
- public BlockCompressedInputStream(final InputStream stream) {
- mStream = IOUtil.toBufferedStream(stream);
- mFile = null;
- }
-
- /**
- * Use this ctor if you wish to call seek()
- */
- public BlockCompressedInputStream(final File file)
- throws IOException {
- mFile = new SeekableFileStream(file);
- mStream = null;
-
- }
-
- public BlockCompressedInputStream(final URL url) {
- mFile = new SeekableBufferedStream(new SeekableHTTPStream(url));
- mStream = null;
- }
-
- /**
- * For providing some arbitrary data source. No additional buffering is
- * provided, so if the underlying source is not buffered, wrap it in a
- * SeekableBufferedStream before passing to this ctor.
- */
- public BlockCompressedInputStream(final SeekableStream strm) {
- mFile = strm;
- mStream = null;
- }
-
- /**
- * Determines whether or not the inflater will re-calculate the CRC on the decompressed data
- * and check it against the value stored in the GZIP header. CRC checking is an expensive
- * operation and should be used accordingly.
- */
- public void setCheckCrcs(final boolean check) {
- this.blockGunzipper.setCheckCrcs(check);
- }
-
- /**
- * @return the number of bytes that can be read (or skipped over) from this input stream without blocking by the
- * next caller of a method for this input stream. The next caller might be the same thread or another thread.
- * Note that although the next caller can read this many bytes without blocking, the available() method call itself
- * may block in order to fill an internal buffer if it has been exhausted.
- */
- public int available()
- throws IOException {
- if (mCurrentBlock == null || mCurrentOffset == mCurrentBlock.length) {
- readBlock();
- }
- if (mCurrentBlock == null) {
- return 0;
- }
- return mCurrentBlock.length - mCurrentOffset;
- }
-
- /**
- * Closes the underlying InputStream or RandomAccessFile
- */
- public void close()
- throws IOException {
- if (mFile != null) {
- mFile.close();
- mFile = null;
- } else if (mStream != null) {
- mStream.close();
- mStream = null;
- }
- // Encourage garbage collection
- mFileBuffer = null;
- mCurrentBlock = null;
- }
-
- /**
- * Reads the next byte of data from the input stream. The value byte is returned as an int in the range 0 to 255.
- * If no byte is available because the end of the stream has been reached, the value -1 is returned.
- * This method blocks until input data is available, the end of the stream is detected, or an exception is thrown.
- *
- * @return the next byte of data, or -1 if the end of the stream is reached.
- */
- public int read()
- throws IOException {
- return (available() > 0) ? mCurrentBlock[mCurrentOffset++] : -1;
- }
-
- /**
- * Reads some number of bytes from the input stream and stores them into the buffer array b. The number of bytes
- * actually read is returned as an integer. This method blocks until input data is available, end of file is detected,
- * or an exception is thrown.
- *
- * read(buf) has the same effect as read(buf, 0, buf.length).
- *
- * @param buffer the buffer into which the data is read.
- * @return the total number of bytes read into the buffer, or -1 is there is no more data because the end of
- * the stream has been reached.
- */
- public int read(final byte[] buffer)
- throws IOException {
- return read(buffer, 0, buffer.length);
- }
-
- private volatile ByteArrayOutputStream buf = null;
- private static final byte eol = '\n';
- private static final byte eolCr = '\r';
-
- /**
- * Reads a whole line. A line is considered to be terminated by either a line feed ('\n'),
- * carriage return ('\r') or carriage return followed by a line feed ("\r\n").
- *
- * @return A String containing the contents of the line, excluding the line terminating
- * character, or null if the end of the stream has been reached
- *
- * @exception IOException If an I/O error occurs
- *
- */
- public String readLine() throws IOException {
- int available = available();
- if (available == 0) {
- return null;
- }
- if(null == buf){ // lazy initialisation
- buf = new ByteArrayOutputStream(8192);
- }
- buf.reset();
- boolean done = false;
- boolean foundCr = false; // \r found flag
- while (!done) {
- int linetmpPos = mCurrentOffset;
- int bCnt = 0;
- while((available-- > 0)){
- final byte c = mCurrentBlock[linetmpPos++];
- if(c == eol){ // found \n
- done = true;
- break;
- } else if(foundCr){ // previous char was \r
- --linetmpPos; // current char is not \n so put it back
- done = true;
- break;
- } else if(c == eolCr){ // found \r
- foundCr = true;
- continue; // no ++bCnt
- }
- ++bCnt;
- }
- if(mCurrentOffset < linetmpPos){
- buf.write(mCurrentBlock, mCurrentOffset, bCnt);
- mCurrentOffset = linetmpPos;
- }
- available = available();
- if(available == 0){
- // EOF
- done = true;
- }
- }
- return buf.toString();
- }
-
- /**
- * Reads up to len bytes of data from the input stream into an array of bytes. An attempt is made to read
- * as many as len bytes, but a smaller number may be read. The number of bytes actually read is returned as an integer.
- *
- * This method blocks until input data is available, end of file is detected, or an exception is thrown.
- *
- * @param buffer buffer into which data is read.
- * @param offset the start offset in array b at which the data is written.
- * @param length the maximum number of bytes to read.
- * @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of
- * the stream has been reached.
- */
- public int read(final byte[] buffer, int offset, int length)
- throws IOException {
- final int originalLength = length;
- while (length > 0) {
- final int available = available();
- if (available == 0) {
- // Signal EOF to caller
- if (originalLength == length) {
- return -1;
- }
- break;
- }
- final int copyLength = Math.min(length, available);
- System.arraycopy(mCurrentBlock, mCurrentOffset, buffer, offset, copyLength);
- mCurrentOffset += copyLength;
- offset += copyLength;
- length -= copyLength;
- }
- return originalLength - length;
- }
-
- /**
- * Seek to the given position in the file. Note that pos is a special virtual file pointer,
- * not an actual byte offset.
- *
- * @param pos virtual file pointer
- */
- public void seek(final long pos)
- throws IOException {
- if (mFile == null) {
- throw new IOException("Cannot seek on stream based file");
- }
- // Decode virtual file pointer
- // Upper 48 bits is the byte offset into the compressed stream of a block.
- // Lower 16 bits is the byte offset into the uncompressed stream inside the block.
- final long compressedOffset = BlockCompressedFilePointerUtil.getBlockAddress(pos);
- final int uncompressedOffset = BlockCompressedFilePointerUtil.getBlockOffset(pos);
- final int available;
- if (mBlockAddress == compressedOffset && mCurrentBlock != null) {
- available = mCurrentBlock.length;
- } else {
- mFile.seek(compressedOffset);
- mBlockAddress = compressedOffset;
- mLastBlockLength = 0;
- readBlock();
- available = available();
- }
- if (uncompressedOffset > available ||
- (uncompressedOffset == available && !eof())) {
- throw new IOException("Invalid file pointer: " + pos);
- }
- mCurrentOffset = uncompressedOffset;
- }
-
- private boolean eof() throws IOException {
- if (mFile.eof()) {
- return true;
- }
- // If the last remaining block is the size of the EMPTY_GZIP_BLOCK, this is the same as being at EOF.
- return (mFile.length() - (mBlockAddress + mLastBlockLength) == BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
- }
-
- /**
- * @return virtual file pointer that can be passed to seek() to return to the current position. This is
- * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
- * the two.
- */
- public long getFilePointer() {
- if (mCurrentOffset == mCurrentBlock.length) {
- // If current offset is at the end of the current block, file pointer should point
- // to the beginning of the next block.
- return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress + mLastBlockLength, 0);
- }
- return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress, mCurrentOffset);
- }
-
- public static long getFileBlock(final long bgzfOffset) {
- return BlockCompressedFilePointerUtil.getBlockAddress(bgzfOffset);
- }
-
- /**
- * @param stream Must be at start of file. Throws RuntimeException if !stream.markSupported().
- * @return true if the given file looks like a valid BGZF file.
- */
- public static boolean isValidFile(final InputStream stream)
- throws IOException {
- if (!stream.markSupported()) {
- throw new RuntimeException("Cannot test non-buffered stream");
- }
- stream.mark(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
- final byte[] buffer = new byte[BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH];
- final int count = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
- stream.reset();
- return count == BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH && isValidBlockHeader(buffer);
- }
-
- private static boolean isValidBlockHeader(final byte[] buffer) {
- return (buffer[0] == BlockCompressedStreamConstants.GZIP_ID1 &&
- (buffer[1] & 0xFF) == BlockCompressedStreamConstants.GZIP_ID2 &&
- (buffer[3] & BlockCompressedStreamConstants.GZIP_FLG) != 0 &&
- buffer[10] == BlockCompressedStreamConstants.GZIP_XLEN &&
- buffer[12] == BlockCompressedStreamConstants.BGZF_ID1 &&
- buffer[13] == BlockCompressedStreamConstants.BGZF_ID2);
- }
-
- private void readBlock()
- throws IOException {
-
- if (mFileBuffer == null) {
- mFileBuffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE];
- }
- int count = readBytes(mFileBuffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
- if (count == 0) {
- // Handle case where there is no empty gzip block at end.
- mCurrentOffset = 0;
- mBlockAddress += mLastBlockLength;
- mCurrentBlock = new byte[0];
- return;
- }
- if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) {
- throw new IOException("Premature end of file");
- }
- final int blockLength = unpackInt16(mFileBuffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1;
- if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > mFileBuffer.length) {
- throw new IOException("Unexpected compressed block length: " + blockLength);
- }
- final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH;
- count = readBytes(mFileBuffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, remaining);
- if (count != remaining) {
- throw new FileTruncatedException("Premature end of file");
- }
- inflateBlock(mFileBuffer, blockLength);
- mCurrentOffset = 0;
- mBlockAddress += mLastBlockLength;
- mLastBlockLength = blockLength;
- }
-
- private void inflateBlock(final byte[] compressedBlock, final int compressedLength)
- throws IOException {
- final int uncompressedLength = unpackInt32(compressedBlock, compressedLength-4);
- byte[] buffer = mCurrentBlock;
- mCurrentBlock = null;
- if (buffer == null || buffer.length != uncompressedLength) {
- try {
- buffer = new byte[uncompressedLength];
- } catch (NegativeArraySizeException e) {
- throw new RuntimeException("BGZF file has invalid uncompressedLength: " + uncompressedLength, e);
- }
- }
- blockGunzipper.unzipBlock(buffer, compressedBlock, compressedLength);
- mCurrentBlock = buffer;
- }
-
- private int readBytes(final byte[] buffer, final int offset, final int length)
- throws IOException {
- if (mFile != null) {
- return readBytes(mFile, buffer, offset, length);
- } else if (mStream != null) {
- return readBytes(mStream, buffer, offset, length);
- } else {
- return 0;
- }
- }
-
- private static int readBytes(final SeekableStream file, final byte[] buffer, final int offset, final int length)
- throws IOException {
- int bytesRead = 0;
- while (bytesRead < length) {
- final int count = file.read(buffer, offset + bytesRead, length - bytesRead);
- if (count <= 0) {
- break;
- }
- bytesRead += count;
- }
- return bytesRead;
- }
-
- private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length)
- throws IOException {
- int bytesRead = 0;
- while (bytesRead < length) {
- final int count = stream.read(buffer, offset + bytesRead, length - bytesRead);
- if (count <= 0) {
- break;
- }
- bytesRead += count;
- }
- return bytesRead;
- }
-
- private int unpackInt16(final byte[] buffer, final int offset) {
- return ((buffer[offset] & 0xFF) |
- ((buffer[offset+1] & 0xFF) << 8));
- }
-
- private int unpackInt32(final byte[] buffer, final int offset) {
- return ((buffer[offset] & 0xFF) |
- ((buffer[offset+1] & 0xFF) << 8) |
- ((buffer[offset+2] & 0xFF) << 16) |
- ((buffer[offset+3] & 0xFF) << 24));
- }
-
- public enum FileTermination {HAS_TERMINATOR_BLOCK, HAS_HEALTHY_LAST_BLOCK, DEFECTIVE}
-
- public static FileTermination checkTermination(final File file)
- throws IOException {
- final long fileSize = file.length();
- if (fileSize < BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length) {
- return FileTermination.DEFECTIVE;
- }
- final RandomAccessFile raFile = new RandomAccessFile(file, "r");
- try {
- raFile.seek(fileSize - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
- byte[] buf = new byte[BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length];
- raFile.readFully(buf);
- if (Arrays.equals(buf, BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK)) {
- return FileTermination.HAS_TERMINATOR_BLOCK;
- }
- final int bufsize = (int)Math.min(fileSize, BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE);
- buf = new byte[bufsize];
- raFile.seek(fileSize - bufsize);
- raFile.read(buf);
- for (int i = buf.length - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length;
- i >= 0; --i) {
- if (!preambleEqual(BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE,
- buf, i, BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length)) {
- continue;
- }
- final ByteBuffer byteBuffer = ByteBuffer.wrap(buf, i + BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length, 4);
- byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
- final int totalBlockSizeMinusOne = byteBuffer.getShort() & 0xFFFF;
- if (buf.length - i == totalBlockSizeMinusOne + 1) {
- return FileTermination.HAS_HEALTHY_LAST_BLOCK;
- } else {
- return FileTermination.DEFECTIVE;
- }
- }
- return FileTermination.DEFECTIVE;
- } finally {
- raFile.close();
- }
- }
-
- private static boolean preambleEqual(final byte[] preamble, final byte[] buf, final int startOffset, final int length) {
- for (int i = 0; i < length; ++i) {
- if (preamble[i] != buf[i + startOffset]) {
- return false;
- }
- }
- return true;
- }
-}
-
-
diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java
index 73441cb6a4..e453c7f8a9 100644
--- a/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java
+++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java
@@ -41,4 +41,14 @@ public class BWAConfiguration {
* What is the scoring penalty for a gap extension?
*/
public Integer gapExtensionPenalty = null;
+
+ /**
+ * Enter bwa's 'non-stop' mode (equivalent to bwa aln -N parameter).
+ */
+ public Boolean nonStopMode = false;
+
+ /**
+ * Set the max queue size that bwa will use when searching for matches (equivalent to bwa aln -m parameter).
+ */
+ public Integer maxEntriesInQueue = null;
}
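Since both new options are plain public fields, wiring them up is direct assignment. A hypothetical usage sketch (the values are illustrative, not recommended defaults):

    final BWAConfiguration configuration = new BWAConfiguration();
    configuration.nonStopMode = true;          // like 'bwa aln -N': keep searching past the second-best hit
    configuration.maxEntriesInQueue = 2000000; // like 'bwa aln -m': cap the size of the search queue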
diff --git a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
index a399867fa7..a999593413 100755
--- a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
+++ b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
@@ -139,11 +139,11 @@ public class AnalyzeCovariates extends CommandLineProgram {
*/
@Argument(fullName="max_histogram_value", shortName="maxHist", required = false, doc="If supplied, this value will be the max value of the histogram plots")
private int MAX_HISTOGRAM_VALUE = 0;
+
@Hidden
@Argument(fullName="do_indel_quality", shortName="indels", required = false, doc="If supplied, do indel quality plotting")
private boolean DO_INDEL_QUALITY = false;
-
/////////////////////////////
// Private Member Variables
/////////////////////////////
@@ -274,7 +274,6 @@ private void addCSVData(String line) {
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
// Add that datum to all the collapsed tables which will be used in the sequential calculation
dataManager.addToAllTables( key, datum, IGNORE_QSCORES_LESS_THAN );
-
}
private void writeDataTables() {
@@ -341,7 +340,7 @@ private void callRScripts() {
// for each covariate
for( int iii = 1; iii < requestedCovariates.size(); iii++ ) {
- Covariate cov = requestedCovariates.get(iii);
+ final Covariate cov = requestedCovariates.get(iii);
final File outputFile = new File(OUTPUT_DIR, readGroup + "." + cov.getClass().getSimpleName()+ ".dat");
if (DO_INDEL_QUALITY) {
RScriptExecutor executor = new RScriptExecutor();
@@ -349,7 +348,7 @@ private void callRScripts() {
// The second argument is the name of the covariate in order to make the plots look nice
executor.addArgs(outputFile, cov.getClass().getSimpleName().split("Covariate")[0]);
executor.exec();
- } else {
+ } else {
if( iii == 1 ) {
// Analyze reported quality
RScriptExecutor executor = new RScriptExecutor();
diff --git a/public/java/src/org/broadinstitute/sting/commandline/Gather.java b/public/java/src/org/broadinstitute/sting/commandline/Gather.java
index 59c3f50cbf..d452f708e0 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/Gather.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/Gather.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2010, The Broad Institute
+ * Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
@@ -34,5 +34,6 @@
@Retention(RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
public @interface Gather {
- Class value();
+ Class value() default Gather.class;
+ boolean enabled() default true;
}
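With both elements defaulted, an output field can name a gatherer explicitly, inherit the default, or opt out of gathering altogether; the latter presumably takes over for no-op gatherers like the AutoIndexGatherFunction removed above. A hypothetical pair of Queue output fields:

    @Gather(BamGatherFunction.class) // explicit gatherer (class name assumed for illustration)
    public File outputBam;

    @Gather(enabled = false)         // new form: schedule no gather job for this output
    public File outputBamIndex;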
diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
index 32002e0936..e5aaf23385 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
@@ -35,9 +35,12 @@
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
import org.broadinstitute.sting.gatk.walkers.Walker;
-import org.broadinstitute.sting.utils.classloader.JVMUtils;
+import org.broadinstitute.sting.utils.crypt.CryptUtils;
+import org.broadinstitute.sting.utils.crypt.GATKKey;
+import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.ListFileUtils;
+import java.security.PublicKey;
import java.util.*;
/**
@@ -78,6 +81,9 @@ protected int execute() throws Exception {
Walker<?,?> walker = engine.getWalkerByName(getAnalysisName());
try {
+ // Make sure a valid GATK user key is present, if required.
+ authorizeGATKRun();
+
engine.setArguments(getArgumentCollection());
// File lists can require a bit of additional expansion. These are set explicitly by the engine.
@@ -130,6 +136,28 @@ protected int execute() throws Exception {
return 0;
}
+ /**
+ * Authorizes this run of the GATK by checking for a valid GATK user key, if required.
+ * Currently, a key is required only if running with the -et NO_ET or -et STDOUT options.
+ */
+ private void authorizeGATKRun() {
+ if ( getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.NO_ET ||
+ getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.STDOUT ) {
+ if ( getArgumentCollection().gatkKeyFile == null ) {
+ throw new UserException("Running with the -et NO_ET or -et STDOUT option requires a GATK Key file. " +
+ "Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home " +
+ "for more information and instructions on how to obtain a key.");
+ }
+ else {
+ PublicKey gatkPublicKey = CryptUtils.loadGATKDistributedPublicKey();
+ GATKKey gatkUserKey = new GATKKey(gatkPublicKey, getArgumentCollection().gatkKeyFile);
+
+ if ( ! gatkUserKey.isValid() ) {
+ throw new UserException.KeySignatureVerificationException(getArgumentCollection().gatkKeyFile);
+ }
+ }
+ }
+ }
/**
* Generate the GATK run report for this walker using the current GATKEngine, if -et is enabled.
diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
index b4d337d8df..9c59ffe9a9 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
@@ -25,6 +25,8 @@
package org.broadinstitute.sting.gatk;
+import net.sf.picard.PicardException;
+import net.sf.samtools.SAMException;
import org.broad.tribble.TribbleException;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.ArgumentCollection;
@@ -95,7 +97,11 @@ public static void main(String[] argv) {
// We can generate Tribble Exceptions in weird places when e.g. VCF genotype fields are
// lazy loaded, so they aren't caught elsewhere and made into User Exceptions
exitSystemWithUserError(e);
- } catch (net.sf.samtools.SAMException e) {
+ } catch(PicardException e) {
+ // TODO: Should Picard exceptions be, in general, UserExceptions or ReviewedStingExceptions?
+ exitSystemWithError(e);
+ }
+ catch (SAMException e) {
checkForTooManyOpenFilesProblem(e.getMessage());
exitSystemWithSamError(e);
} catch (Throwable t) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
index f954d76501..50ef4653b9 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
@@ -53,6 +53,7 @@
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
+import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import java.io.File;
import java.util.*;
@@ -179,10 +180,18 @@ public void setReferenceMetaDataFiles(Collection referenceMetaDataFi
*/
private static final long GATK_RANDOM_SEED = 47382911L;
private static Random randomGenerator = new Random(GATK_RANDOM_SEED);
-
public static Random getRandomGenerator() { return randomGenerator; }
public static void resetRandomGenerator() { randomGenerator.setSeed(GATK_RANDOM_SEED); }
public static void resetRandomGenerator(long seed) { randomGenerator.setSeed(seed); }
+
+ /**
+ * Base Quality Score Recalibration helper object
+ */
+ private BaseRecalibration baseRecalibration = null;
+ public BaseRecalibration getBaseRecalibration() { return baseRecalibration; }
+ public boolean hasBaseRecalibration() { return baseRecalibration != null; }
+ public void setBaseRecalibration(File recalFile) { baseRecalibration = new BaseRecalibration(recalFile); }
+
/**
* Actually run the GATK with the specified walker.
*
@@ -205,6 +214,10 @@ public Object execute() {
if (this.getArguments().nonDeterministicRandomSeed)
resetRandomGenerator(System.currentTimeMillis());
+ // if the user specified an input BQSR recalibration table then enable on the fly recalibration
+ if (this.getArguments().BQSR_RECAL_FILE != null)
+ setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE);
+
// Determine how the threads should be divided between CPU vs. IO.
determineThreadAllocation();
@@ -224,7 +237,7 @@ public Object execute() {
// create temp directories as necessary
initializeTempDirectory();
- // create the output streams "
+ // create the output streams
initializeOutputStreams(microScheduler.getOutputTracker());
Iterable<Shard> shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals);
@@ -450,7 +463,15 @@ protected Iterable getShardStrategy(SAMDataSource readsDataSource, Refere
return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer());
else
return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer());
- }
+ }
+ else if(walker instanceof ActiveRegionWalker) {
+ if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
+ throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
+ if(intervals == null)
+ return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer());
+ else
+ return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new LocusShardBalancer());
+ }
else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) {
// Apply special validation to read pair walkers.
if(walker instanceof ReadPairWalker) {
@@ -749,6 +770,7 @@ private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection
getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF,
getWalkerBAQQualityMode(),
refReader,
+ getBaseRecalibration(),
argCollection.defaultBaseQualities);
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java
index daa8ff60db..db22886ce1 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java
@@ -7,6 +7,7 @@
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.utils.baq.BAQ;
+import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import java.util.Collection;
/**
@@ -27,23 +28,20 @@
* information about how they should be downsampled, sorted, and filtered.
*/
public class ReadProperties {
- private Collection<SAMReaderID> readers = null;
- private SAMFileHeader header = null;
- private SAMFileReader.ValidationStringency validationStringency = SAMFileReader.ValidationStringency.STRICT;
- private DownsamplingMethod downsamplingMethod = null;
- private ValidationExclusion exclusionList = null;
- private Collection<ReadFilter> supplementalFilters = null;
- private boolean includeReadsWithDeletionAtLoci = false;
- private boolean useOriginalBaseQualities = false;
- private boolean generateExtendedEvents = false;
- private BAQ.CalculationMode cmode = BAQ.CalculationMode.OFF;
- private BAQ.QualityMode qmode = BAQ.QualityMode.DONT_MODIFY;
- IndexedFastaSequenceFile refReader = null; // read for BAQ, if desired
- private byte defaultBaseQualities;
-
- // do we want to generate additional piles of "extended" events (indels)
-// immediately after the reference base such event is associated with?
-
+ private final Collection<SAMReaderID> readers;
+ private final SAMFileHeader header;
+ private final SAMFileReader.ValidationStringency validationStringency;
+ private final DownsamplingMethod downsamplingMethod;
+ private final ValidationExclusion exclusionList;
+ private final Collection<ReadFilter> supplementalFilters;
+ private final boolean includeReadsWithDeletionAtLoci;
+ private final boolean useOriginalBaseQualities;
+ private final boolean generateExtendedEvents;
+ private final BAQ.CalculationMode cmode;
+ private final BAQ.QualityMode qmode;
+ private final IndexedFastaSequenceFile refReader; // read for BAQ, if desired
+ private final BaseRecalibration bqsrApplier;
+ private final byte defaultBaseQualities;
/**
* Return true if the walker wants to see reads that contain deletions when looking at locus pileups
@@ -126,6 +124,8 @@ public IndexedFastaSequenceFile getRefReader() {
return refReader;
}
+ public BaseRecalibration getBQSRApplier() { return bqsrApplier; }
+
/**
* @return Default base quality value to fill reads missing base quality information.
*/
@@ -165,8 +165,9 @@ public ReadProperties( Collection samFiles,
boolean includeReadsWithDeletionAtLoci,
boolean generateExtendedEvents,
BAQ.CalculationMode cmode,
- BAQ.QualityMode qmode,
+ BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
+ BaseRecalibration bqsrApplier,
byte defaultBaseQualities) {
this.readers = samFiles;
this.header = header;
@@ -180,6 +181,7 @@ public ReadProperties( Collection samFiles,
this.cmode = cmode;
this.qmode = qmode;
this.refReader = refReader;
+ this.bqsrApplier = bqsrApplier;
this.defaultBaseQualities = defaultBaseQualities;
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
index 08d2c1ad15..02d211a0cb 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
@@ -65,9 +65,12 @@ public GATKArgumentCollection() {
@Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false)
public Integer readBufferSize = null;
- @Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? Standard is the default, can be verbose or NO_ET so nothing is posted to the run repository", required = false)
+ @Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? STANDARD is the default, can be NO_ET so nothing is posted to the run repository. Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home for details.", required = false)
public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.STANDARD;
+ @Argument(fullName = "gatk_key", shortName = "K", doc="GATK Key file. Required if running with -et NO_ET. Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home for details.", required = false)
+ public File gatkKeyFile = null;
+
@Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false)
public List<String> readFilters = new ArrayList<String>();
@@ -75,6 +78,7 @@ public GATKArgumentCollection() {
* Using this option one can instruct the GATK engine to traverse over only part of the genome. This argument can be specified multiple times.
* One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals).
* Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf).
+ * To traverse only the completely unmapped reads in the BAM file (i.e. those without a reference contig), use -L unmapped.
*/
@Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
public List<IntervalBinding<Feature>> intervals = null;
@@ -185,6 +189,15 @@ public static DownsamplingMethod getDefaultDownsamplingMethod() {
@Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false)
public Boolean useOriginalBaseQualities = false;
+ /**
+ * After the header, data records occur one per line until the end of the file. The first several items on a line are the
+ * values of the individual covariates and will change depending on which covariates were specified at runtime. The last
+ * three items are the data: the number of observations for this combination of covariates, the number of reference mismatches,
+ * and the raw empirical quality score calculated by phred-scaling the mismatch rate.
+ */
+ @Input(fullName="BQSR", shortName="BQSR", required=false, doc="Filename for the input covariates table recalibration .csv file which enables on the fly base quality score recalibration")
+ public File BQSR_RECAL_FILE = null; // BUGBUG: need a better argument name once we decide how BQSRs v1 and v2 will live in the code base simultaneously
+
@Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false)
public byte defaultBaseQualities = -1;
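The raw empirical quality mentioned in the BQSR table description above is the standard phred transformation Q = -10 * log10(error rate), with the error rate estimated as mismatches / observations. A self-contained sketch of that arithmetic (illustrative only, not the recalibrator's exact code path; the mismatch floor of 1 is an assumption to avoid infinite qualities):

    public final class EmpiricalQualityExample {
        /** Phred-scale the observed mismatch rate: Q = -10 * log10(mismatches / observations). */
        public static double empiricalQuality(final long observations, final long mismatches) {
            if (observations <= 0)
                throw new IllegalArgumentException("observations must be positive");
            // Floor the mismatch count at 1 so a perfect bin does not yield an infinite quality.
            final double errorRate = Math.max(mismatches, 1L) / (double) observations;
            return -10.0 * Math.log10(errorRate);
        }

        public static void main(String[] args) {
            // 10 mismatches in 10000 observations -> error rate 1e-3 -> Q30.
            System.out.println(empiricalQuality(10000, 10));
        }
    }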
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java
index a6731ee184..d1a2e7519b 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java
@@ -23,7 +23,7 @@
*/
/**
- * A LocusView over which the user can iterate.
+ * A LocusView over which the user can iterate.
*/
public class AllLocusView extends LocusView {
@@ -47,12 +47,13 @@ public class AllLocusView extends LocusView {
/**
* Create a new queue of locus contexts.
+ *
* @param provider
*/
- public AllLocusView(LocusShardDataProvider provider) {
- super( provider );
+ public AllLocusView(LocusShardDataProvider provider) {
+ super(provider);
// Seed the state tracking members with the first possible seek position and the first possible locus context.
- locusIterator = new GenomeLocusIterator(genomeLocParser,provider.getLocus());
+ locusIterator = new GenomeLocusIterator(genomeLocParser, provider.getLocus());
}
public boolean hasNext() {
@@ -63,7 +64,7 @@ public boolean hasNext() {
public AlignmentContext next() {
advance();
- if(nextPosition == null)
+ if (nextPosition == null)
throw new NoSuchElementException("No next is available in the all locus view");
// Flag to the iterator that no data is waiting in the queue to be processed.
@@ -72,7 +73,7 @@ public AlignmentContext next() {
AlignmentContext currentLocus;
// If actual data is present, return it. Otherwise, return empty data.
- if( nextLocus != null && nextLocus.getLocation().equals(nextPosition) )
+ if (nextLocus != null && nextLocus.getLocation().equals(nextPosition))
currentLocus = nextLocus;
else
currentLocus = createEmptyLocus(nextPosition);
@@ -82,15 +83,15 @@ public AlignmentContext next() {
private void advance() {
// Already at the next element? Don't move forward.
- if(atNextElement)
+ if (atNextElement)
return;
// Out of elements?
- if(nextPosition == null && !locusIterator.hasNext())
- return;
+ if (nextPosition == null && !locusIterator.hasNext())
+ return;
// If nextLocus has been consumed, clear it out to make room for the next incoming locus.
- if(nextPosition != null && nextLocus != null && !nextLocus.getLocation().isPast(nextPosition)) {
+ if (nextPosition != null && nextLocus != null && !nextLocus.getLocation().isPast(nextPosition)) {
nextLocus = null;
// Determine the next locus. The trick is that we may have more than one alignment context at the same
@@ -98,9 +99,9 @@ private void advance() {
// is still at the current position, we do not increment current position and wait for next call to next() to return
// that context. If we know that next context is past the current position, we are done with current
// position
- if(hasNextLocus()) {
+ if (hasNextLocus()) {
nextLocus = nextLocus();
- if(nextPosition.equals(nextLocus.getLocation())) {
+ if (nextPosition.equals(nextLocus.getLocation())) {
atNextElement = true;
return;
}
@@ -108,7 +109,7 @@ private void advance() {
}
// No elements left in queue? Clear out the position state tracker and return.
- if(!locusIterator.hasNext()) {
+ if (!locusIterator.hasNext()) {
nextPosition = null;
return;
}
@@ -119,9 +120,9 @@ private void advance() {
// Crank the iterator to (if possible) or past the next context. Be careful not to hold a reference to nextLocus
// while using the hasNextLocus() / nextLocus() machinery; this will cause us to use more memory than is optimal.
- while(nextLocus == null || nextLocus.getLocation().isBefore(nextPosition)) {
+ while (nextLocus == null || nextLocus.getLocation().isBefore(nextPosition)) {
nextLocus = null;
- if(!hasNextLocus())
+ if (!hasNextLocus())
break;
nextLocus = nextLocus();
}
@@ -129,12 +130,15 @@ private void advance() {
/**
* Creates a blank locus context at the specified location.
+ *
* @param site Site at which to create the blank locus context.
* @return empty context.
*/
private final static List<GATKSAMRecord> EMPTY_PILEUP_READS = Collections.emptyList();
private final static List<Integer> EMPTY_PILEUP_OFFSETS = Collections.emptyList();
- private AlignmentContext createEmptyLocus( GenomeLoc site ) {
- return new AlignmentContext(site,new ReadBackedPileupImpl(site, EMPTY_PILEUP_READS, EMPTY_PILEUP_OFFSETS));
+ private final static List<Boolean> EMPTY_DELETION_STATUS = Collections.emptyList();
+
+ private AlignmentContext createEmptyLocus(GenomeLoc site) {
+ return new AlignmentContext(site, new ReadBackedPileupImpl(site, EMPTY_PILEUP_READS, EMPTY_PILEUP_OFFSETS));
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java
index f9ed0cb747..a3ce6dd278 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java
@@ -25,9 +25,14 @@
*/
/**
- * A queue of locus context entries.
+ * The two goals of the LocusView are as follows:
+ * 1) To provide a 'trigger track' iteration interface so that TraverseLoci can easily switch
+ * between iterating over all bases in a region, only covered bases in a region covered by
+ * reads, only bases in a region covered by RODs, or any other sort of trigger track
+ * implementation one can think of.
+ * 2) To manage the copious number of iterators that have to be jointly pulled through the
+ * genome to make a locus traversal function.
*/
-
public abstract class LocusView extends LocusIterator implements View {
/**
* The locus bounding this view.
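The "trigger track" described in the LocusView javadoc is, at its core, a pluggable policy for deciding at which loci the walker fires. A distilled sketch of that contract with hypothetical names (the real subclasses, such as AllLocusView and CoveredLocusView, additionally manage shard bounds and the underlying read iterators):

    // Hypothetical distillation of the trigger-track contract.
    interface TriggerTrack<T> {
        /** Is there another locus at which the traversal should fire? */
        boolean hasNext();
        /** Advance to that locus and return its context. */
        T next();
    }

    // Policy A: fire at every reference position in the shard (AllLocusView's behavior).
    // Policy B: fire only at positions covered by reads (CoveredLocusView's behavior).

TraverseLoci only talks to the interface, so swapping policies requires no change to the traversal itself.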
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMAccessPlan.java
similarity index 58%
rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java
rename to public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMAccessPlan.java
index 0a6173c1e4..1649713658 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderPosition.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMAccessPlan.java
@@ -27,8 +27,10 @@
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.GATKChunk;
+import net.sf.samtools.util.BlockCompressedFilePointerUtil;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import java.util.LinkedList;
import java.util.List;
/**
@@ -38,7 +40,7 @@
* Time: 10:47 PM
* To change this template use File | Settings | File Templates.
*/
-class SAMReaderPosition {
+class BAMAccessPlan {
private final SAMReaderID reader;
private final BlockInputStream inputStream;
@@ -51,7 +53,7 @@ class SAMReaderPosition {
private long nextBlockAddress;
- SAMReaderPosition(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) {
+ BAMAccessPlan(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) {
this.reader = reader;
this.inputStream = inputStream;
@@ -84,11 +86,45 @@ public int getFirstOffsetInBlock() {
}
/**
- * Retrieves the last offset of interest in the block returned by getBlockAddress().
- * @return First block of interest in this segment.
+ * Gets the spans overlapping the given block; used to copy the contents of the block into the circular buffer.
+ * @param blockAddress Block address for which to search.
+ * @param filePosition Block address at which to terminate the last chunk if the last chunk goes beyond this span.
+ * @return list of chunks containing that block.
*/
- public int getLastOffsetInBlock() {
- return (nextBlockAddress == positionIterator.peek().getBlockEnd()) ? positionIterator.peek().getBlockOffsetEnd() : 65536;
+ public List<GATKChunk> getSpansOverlappingBlock(long blockAddress, long filePosition) {
+ List<GATKChunk> spansOverlapping = new LinkedList<GATKChunk>();
+ // While the position iterator overlaps the given block, pull out spans to report.
+ while(positionIterator.hasNext() && positionIterator.peek().getBlockStart() <= blockAddress) {
+ // Create a span over as much of the block as is covered by this chunk.
+ int blockOffsetStart = (blockAddress == positionIterator.peek().getBlockStart()) ? positionIterator.peek().getBlockOffsetStart() : 0;
+
+ // Calculate the end of this span. If the span extends past this block, cap it using the current file position.
+ long blockEnd;
+ int blockOffsetEnd;
+ if(blockAddress < positionIterator.peek().getBlockEnd()) {
+ blockEnd = filePosition;
+ blockOffsetEnd = 0;
+ }
+ else {
+ blockEnd = positionIterator.peek().getBlockEnd();
+ blockOffsetEnd = positionIterator.peek().getBlockOffsetEnd();
+ }
+
+ GATKChunk newChunk = new GATKChunk(blockAddress,blockOffsetStart,blockEnd,blockOffsetEnd);
+
+ if(newChunk.getChunkStart() <= newChunk.getChunkEnd())
+ spansOverlapping.add(new GATKChunk(blockAddress,blockOffsetStart,blockEnd,blockOffsetEnd));
+
+ // If the value currently stored in the position iterator ends past the current block, we must be done. Abort.
+ if(!positionIterator.hasNext() || positionIterator.peek().getBlockEnd() > blockAddress)
+ break;
+
+ // If the position iterator ends before the block ends, pull the position iterator forward.
+ if(positionIterator.peek().getBlockEnd() <= blockAddress)
+ positionIterator.next();
+ }
+
+ return spansOverlapping;
}
public void reset() {
@@ -111,20 +147,16 @@ private void initialize() {
* @param filePosition The current position within the file.
*/
void advancePosition(final long filePosition) {
- nextBlockAddress = filePosition >> 16;
+ nextBlockAddress = BlockCompressedFilePointerUtil.getBlockAddress(filePosition);
// Check the current file position against the iterator; if the iterator is before the current file position,
// draw the iterator forward. Remember when performing the check that coordinates are half-open!
- while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek())) {
+ while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek()))
positionIterator.next();
- // If the block iterator has shot past the file pointer, bring the file pointer flush with the start of the current block.
- if(positionIterator.hasNext() && filePosition < positionIterator.peek().getChunkStart()) {
- nextBlockAddress = positionIterator.peek().getBlockStart();
- //System.out.printf("SAMReaderPosition: next block address advanced to %d%n",nextBlockAddress);
- break;
- }
- }
+ // If the block iterator has shot past the file pointer, bring the file pointer flush with the start of the current block.
+ if(positionIterator.hasNext() && filePosition < positionIterator.peek().getChunkStart())
+ nextBlockAddress = positionIterator.peek().getBlockStart();
// If we've shot off the end of the block pointer, notify consumers that iteration is complete.
if(!positionIterator.hasNext())
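Both the old `filePosition >> 16` arithmetic and its replacement, BlockCompressedFilePointerUtil.getBlockAddress(), decode the same BAM virtual file offset layout: the upper 48 bits of the 64-bit pointer are the compressed BGZF block's byte address in the file, and the lower 16 bits are the offset within that block's uncompressed data. A self-contained sketch of the encoding:

    // BAM virtual file offsets: 48-bit block address, 16-bit intra-block offset.
    public final class VirtualOffsetExample {
        static long encode(final long blockAddress, final int blockOffset) {
            return (blockAddress << 16) | (blockOffset & 0xFFFF);
        }

        static long blockAddress(final long virtualOffset) {
            return virtualOffset >>> 16; // unsigned shift: the address occupies the high 48 bits
        }

        static int blockOffset(final long virtualOffset) {
            return (int) (virtualOffset & 0xFFFF);
        }

        public static void main(String[] args) {
            final long pointer = encode(1048576L, 42);
            System.out.println(blockAddress(pointer) + " " + blockOffset(pointer)); // 1048576 42
        }
    }

Using the Picard utility instead of inline shifts keeps the encoding in one place, which matters because the same layout is assumed by makeFilePointer() in BlockInputStream below.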
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java
index 657c70aaa3..1d8879d512 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java
@@ -407,7 +407,14 @@ public BAMScheduleEntry next() {
position(currentPosition);
// Read data.
- read(binHeader);
+ int binHeaderBytesRead = read(binHeader);
+
+ // Make sure we read in a complete bin header:
+ if ( binHeaderBytesRead < INT_SIZE_IN_BYTES * 3 ) {
+ throw new ReviewedStingException(String.format("Unable to read a complete bin header from BAM schedule file %s for BAM file %s. " +
+ "The BAM schedule file is likely incomplete/corrupt.",
+ scheduleFile.getAbsolutePath(), reader.getSamFilePath()));
+ }
// Decode contents.
binHeader.flip();
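The new length check is needed because FileChannel.read() may legally return fewer bytes than requested, or -1 at end-of-file; a fixed-size record read must verify it got everything. A generic sketch of the defensive pattern (hypothetical helper, not the BAMSchedule code):

    import java.io.IOException;
    import java.nio.ByteBuffer;
    import java.nio.channels.FileChannel;

    final class ExactReader {
        /** Read until the buffer is full, failing loudly if EOF arrives early. */
        static void readFully(final FileChannel channel, final ByteBuffer buffer) throws IOException {
            while (buffer.hasRemaining()) {
                if (channel.read(buffer) < 0)
                    throw new IOException("Premature end-of-file: expected " + buffer.limit()
                            + " bytes but only read " + buffer.position());
            }
            buffer.flip(); // make the bytes just read available for decoding
        }
    }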
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
index bcb726607f..fdc3d2aa73 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
@@ -34,6 +34,8 @@
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.*;
@@ -245,7 +247,14 @@ private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final GenomeLoc curr
// This will ensure that if the two sets of contigs don't quite match (b36 male vs female ref, hg19 Epstein-Barr), then
// we'll be using the correct contig index for the BAMs.
// TODO: Warning: assumes all BAMs use the same sequence dictionary! Get around this with contig aliasing.
- final int currentContigIndex = dataSource.getHeader().getSequence(currentLocus.getContig()).getSequenceIndex();
+ SAMSequenceRecord currentContigSequenceRecord = dataSource.getHeader().getSequence(currentLocus.getContig());
+ if ( currentContigSequenceRecord == null ) {
+ throw new UserException(String.format("Contig %s not present in sequence dictionary for merged BAM header: %s",
+ currentLocus.getContig(),
+ ReadUtils.prettyPrintSequenceRecords(dataSource.getHeader().getSequenceDictionary())));
+ }
+
+ final int currentContigIndex = currentContigSequenceRecord.getSequenceIndex();
// Stale reference sequence or first invocation. (Re)create the binTreeIterator.
if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentContigIndex) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java
index f468d20204..d75e91bf3a 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BGZFBlockLoadingDispatcher.java
@@ -44,12 +44,12 @@ public class BGZFBlockLoadingDispatcher {
private final ExecutorService threadPool;
- private final Queue<SAMReaderPosition> inputQueue;
+ private final Queue<BAMAccessPlan> inputQueue;
public BGZFBlockLoadingDispatcher(final int numThreads, final int numFileHandles) {
threadPool = Executors.newFixedThreadPool(numThreads);
fileHandleCache = new FileHandleCache(numFileHandles);
- inputQueue = new LinkedList<SAMReaderPosition>();
+ inputQueue = new LinkedList<BAMAccessPlan>();
threadPool.execute(new BlockLoader(this,fileHandleCache,true));
}
@@ -58,7 +58,7 @@ public BGZFBlockLoadingDispatcher(final int numThreads, final int numFileHandles
* Initiates a request for a new block load.
* @param readerPosition Position at which to load.
*/
- void queueBlockLoad(final SAMReaderPosition readerPosition) {
+ void queueBlockLoad(final BAMAccessPlan readerPosition) {
synchronized(inputQueue) {
inputQueue.add(readerPosition);
inputQueue.notify();
@@ -69,7 +69,7 @@ void queueBlockLoad(final SAMReaderPosition readerPosition) {
* Claims the next work request from the queue.
* @return The next work request, or null if none is available.
*/
- SAMReaderPosition claimNextWorkRequest() {
+ BAMAccessPlan claimNextWorkRequest() {
synchronized(inputQueue) {
while(inputQueue.isEmpty()) {
try {
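queueBlockLoad() and claimNextWorkRequest() form a classic monitor-based hand-off: producers enqueue under the lock and notify(); the consumer waits in a loop while the queue is empty. A minimal generic sketch of the same pattern (illustrative, not the dispatcher itself):

    import java.util.LinkedList;
    import java.util.Queue;

    final class WorkQueue<T> {
        private final Queue<T> queue = new LinkedList<T>();

        /** Producer side: enqueue a request and wake a waiting consumer. */
        synchronized void submit(final T request) {
            queue.add(request);
            notify();
        }

        /** Consumer side: block until a request is available, then claim it. */
        synchronized T claim() throws InterruptedException {
            while (queue.isEmpty())
                wait(); // releases the monitor until a producer calls notify()
            return queue.remove();
        }
    }

The while-loop around wait() guards against spurious wakeups, exactly as the dispatcher's own loop does.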
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java
index cb37bad312..fda5d818c6 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java
@@ -26,24 +26,21 @@
import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.GATKChunk;
-import net.sf.samtools.util.BAMInputStream;
-import net.sf.samtools.util.BlockCompressedFilePointerUtil;
import net.sf.samtools.util.BlockCompressedInputStream;
-import net.sf.samtools.util.RuntimeEOFException;
-import net.sf.samtools.util.SeekableStream;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.IOException;
+import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Arrays;
-import java.util.Iterator;
import java.util.LinkedList;
+import java.util.List;
/**
* Presents decompressed blocks to the SAMFileReader.
*/
-public class BlockInputStream extends SeekableStream implements BAMInputStream {
+public class BlockInputStream extends InputStream {
/**
* Mechanism for triggering block loads.
*/
@@ -65,9 +62,9 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
private Throwable error;
/**
- * Current position.
+ * Current accessPlan.
*/
- private SAMReaderPosition position;
+ private BAMAccessPlan accessPlan;
/**
* A stream of compressed data blocks.
@@ -94,11 +91,6 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
*/
private final BlockCompressedInputStream validatingInputStream;
- /**
- * Has the buffer been filled since last request?
- */
- private boolean bufferFilled = false;
-
/**
* Create a new block presenting input stream with a dedicated buffer.
* @param dispatcher the block loading messenger.
@@ -118,7 +110,7 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream {
this.dispatcher = dispatcher;
// TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream.
- this.position = new SAMReaderPosition(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE)));
+ this.accessPlan = new BAMAccessPlan(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE)));
// The block offsets / block positions guarantee that the ending offset/position in the data structure maps to
// the point in the file just following the last read. These two arrays should never be empty; initializing
@@ -151,7 +143,7 @@ public long getFilePointer() {
synchronized(lock) {
// Find the current block within the input stream.
int blockIndex;
- for(blockIndex = 0; blockIndex+1 < blockOffsets.size() && buffer.position() >= blockOffsets.get(blockIndex + 1); blockIndex++)
+ for(blockIndex = 0; blockIndex+1 < blockOffsets.size() && buffer.position() > blockOffsets.get(blockIndex+1); blockIndex++)
;
filePointer = blockPositions.get(blockIndex) + (buffer.position()-blockOffsets.get(blockIndex));
}
@@ -164,51 +156,8 @@ public long getFilePointer() {
return filePointer;
}
- public void seek(long target) {
- //System.out.printf("Thread %s, BlockInputStream %s: seeking to block %d, offset %d%n",Thread.currentThread().getId(),this,BlockCompressedFilePointerUtil.getBlockAddress(target),BlockCompressedFilePointerUtil.getBlockOffset(target));
- synchronized(lock) {
- clearBuffers();
-
- // Ensure that the position filled in by submitAccessPlan() is in sync with the seek target just specified.
- position.advancePosition(target);
-
- // If the position advances past the end of the target, that must mean that we seeked to a point at the end
- // of one of the chunk list's subregions. Make a note of our current position and punt on loading any data.
- if(target < position.getBlockAddress() << 16) {
- blockOffsets.clear();
- blockOffsets.add(0);
- blockPositions.clear();
- blockPositions.add(target);
- }
- else {
- waitForBufferFill();
- // A buffer fill will load the relevant data from the shard, but the buffer position still needs to be
- // advanced as appropriate.
- Iterator blockOffsetIterator = blockOffsets.descendingIterator();
- Iterator blockPositionIterator = blockPositions.descendingIterator();
- while(blockOffsetIterator.hasNext() && blockPositionIterator.hasNext()) {
- final int blockOffset = blockOffsetIterator.next();
- final long blockPosition = blockPositionIterator.next();
- if((blockPosition >> 16) == (target >> 16) && (blockPosition&0xFFFF) < (target&0xFFFF)) {
- buffer.position(blockOffset + (int)(target&0xFFFF)-(int)(blockPosition&0xFFFF));
- break;
- }
- }
- }
-
- if(validatingInputStream != null) {
- try {
- validatingInputStream.seek(target);
- }
- catch(IOException ex) {
- throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
- }
- }
- }
- }
-
private void clearBuffers() {
- this.position.reset();
+ this.accessPlan.reset();
// Buffer semantics say that outside of a lock, buffer should always be prepared for reading.
// Indicate no data to be read.
@@ -225,29 +174,41 @@ private void clearBuffers() {
public boolean eof() {
synchronized(lock) {
// TODO: Handle multiple empty BGZF blocks at end of the file.
- return position != null && (position.getBlockAddress() < 0 || position.getBlockAddress() >= length);
+ return accessPlan != null && (accessPlan.getBlockAddress() < 0 || accessPlan.getBlockAddress() >= length);
}
}
- public void setCheckCrcs(final boolean check) {
- // TODO: Implement
- }
-
/**
- * Submits a new access plan for the given dataset.
- * @param position The next seek point for BAM data in this reader.
+ * Submits a new access plan for the given dataset and seeks to the given point.
+ * @param accessPlan The next seek point for BAM data in this reader.
*/
- public void submitAccessPlan(final SAMReaderPosition position) {
+ public void submitAccessPlan(final BAMAccessPlan accessPlan) {
//System.out.printf("Thread %s: submitting access plan for block at position: %d%n",Thread.currentThread().getId(),position.getBlockAddress());
- synchronized(lock) {
- // Assume that the access plan is going to tell us to start where we are and move forward.
- // If this isn't the case, we'll soon receive a seek request and the buffer will be forced to reset.
- if(this.position != null && position.getBlockAddress() < this.position.getBlockAddress())
- position.advancePosition(this.position.getBlockAddress() << 16);
+ this.accessPlan = accessPlan;
+ accessPlan.reset();
+
+ clearBuffers();
+
+ // Pull the iterator past any oddball chunks at the beginning of the shard (chunkEnd < chunkStart, empty chunks, etc).
+ // TODO: Don't pass these empty chunks in.
+ accessPlan.advancePosition(makeFilePointer(accessPlan.getBlockAddress(),0));
+
+ if(accessPlan.getBlockAddress() >= 0) {
+ waitForBufferFill();
+ }
+
+ if(validatingInputStream != null) {
+ try {
+ validatingInputStream.seek(makeFilePointer(accessPlan.getBlockAddress(),0));
+ }
+ catch(IOException ex) {
+ throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
+ }
}
- this.position = position;
+
}
+
private void compactBuffer() {
// Compact buffer to maximize storage space.
int bytesToRemove = 0;
@@ -286,27 +247,14 @@ private void compactBuffer() {
* Push contents of incomingBuffer into the end of this buffer.
* MUST be called from a thread that is NOT the reader thread.
* @param incomingBuffer The data being pushed into this input stream.
- * @param position target position for the data.
+ * @param accessPlan target access plan for the data.
* @param filePosition the current position of the file pointer
*/
- public void copyIntoBuffer(final ByteBuffer incomingBuffer, final SAMReaderPosition position, final long filePosition) {
+ public void copyIntoBuffer(final ByteBuffer incomingBuffer, final BAMAccessPlan accessPlan, final long filePosition) {
synchronized(lock) {
try {
- compactBuffer();
- // Open up the buffer for more reading.
- buffer.limit(buffer.capacity());
-
- // Advance the position to take the most recent read into account.
- final long lastBlockAddress = position.getBlockAddress();
- final int blockOffsetStart = position.getFirstOffsetInBlock();
- final int blockOffsetEnd = position.getLastOffsetInBlock();
-
- // Where did this read end? It either ended in the middle of a block (for a bounding chunk) or it ended at the start of the next block.
- final long endOfRead = (blockOffsetEnd < incomingBuffer.remaining()) ? (lastBlockAddress << 16) | blockOffsetEnd : filePosition << 16;
-
- byte[] validBytes = null;
if(validatingInputStream != null) {
- validBytes = new byte[incomingBuffer.remaining()];
+ byte[] validBytes = new byte[incomingBuffer.remaining()];
byte[] currentBytes = new byte[incomingBuffer.remaining()];
int pos = incomingBuffer.position();
@@ -317,7 +265,7 @@ public void copyIntoBuffer(final ByteBuffer incomingBuffer, final SAMReaderPosit
incomingBuffer.position(pos);
long currentFilePointer = validatingInputStream.getFilePointer();
- validatingInputStream.seek(lastBlockAddress << 16);
+ validatingInputStream.seek(makeFilePointer(accessPlan.getBlockAddress(), 0));
validatingInputStream.read(validBytes);
validatingInputStream.seek(currentFilePointer);
@@ -325,33 +273,41 @@ public void copyIntoBuffer(final ByteBuffer incomingBuffer, final SAMReaderPosit
throw new ReviewedStingException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this));
}
- this.position = position;
- position.advancePosition(filePosition << 16);
+ compactBuffer();
+ // Open up the buffer for more reading.
+ buffer.limit(buffer.capacity());
+
+ // Get the spans overlapping this particular block...
+ List<GATKChunk> spansOverlapping = accessPlan.getSpansOverlappingBlock(accessPlan.getBlockAddress(),filePosition);
+
+ // ...and advance the block
+ this.accessPlan = accessPlan;
+ accessPlan.advancePosition(makeFilePointer(filePosition, 0));
- if(buffer.remaining() < incomingBuffer.remaining()) {
- //System.out.printf("Thread %s: waiting for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n",Thread.currentThread().getId(),buffer.remaining(),incomingBuffer.remaining());
+ if(buffer.remaining() < incomingBuffer.remaining())
lock.wait();
- //System.out.printf("Thread %s: waited for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n", Thread.currentThread().getId(), buffer.remaining(), incomingBuffer.remaining());
- }
- // Remove the last position in the list and add in the last read position, in case the two are different.
- blockOffsets.removeLast();
- blockOffsets.add(buffer.position());
- blockPositions.removeLast();
- blockPositions.add(lastBlockAddress << 16 | blockOffsetStart);
+ final int bytesInIncomingBuffer = incomingBuffer.limit();
+
+ for(GATKChunk spanOverlapping: spansOverlapping) {
+ // Clear out the endcap tracking state and add in the starting position for this transfer.
+ blockOffsets.removeLast();
+ blockOffsets.add(buffer.position());
+ blockPositions.removeLast();
+ blockPositions.add(spanOverlapping.getChunkStart());
- // Stream the buffer into the data stream.
- incomingBuffer.position(blockOffsetStart);
- incomingBuffer.limit(Math.min(incomingBuffer.limit(),blockOffsetEnd));
- buffer.put(incomingBuffer);
+ // Stream the buffer into the data stream.
+ incomingBuffer.limit((spanOverlapping.getBlockEnd() > spanOverlapping.getBlockStart()) ? bytesInIncomingBuffer : spanOverlapping.getBlockOffsetEnd());
+ incomingBuffer.position(spanOverlapping.getBlockOffsetStart());
+ buffer.put(incomingBuffer);
- // Then, add the last position read to the very end of the list, just past the end of the last buffer.
- blockOffsets.add(buffer.position());
- blockPositions.add(endOfRead);
+ // Add the endcap for this transfer.
+ blockOffsets.add(buffer.position());
+ blockPositions.add(spanOverlapping.getChunkEnd());
+ }
// Set up the buffer for reading.
buffer.flip();
- bufferFilled = true;
lock.notify();
}
@@ -447,12 +403,8 @@ public int read(byte[] bytes, final int offset, final int length) {
if(remaining < length)
return length - remaining;
- // Otherwise, if at eof(), return -1.
- else if(eof())
- return -1;
-
- // Otherwise, we must've hit a bug in the system.
- throw new ReviewedStingException("BUG: read returned no data, but eof() reports false.");
+ // Otherwise, return -1.
+ return -1;
}
public void close() {
@@ -472,20 +424,26 @@ public String getSource() {
private void waitForBufferFill() {
synchronized(lock) {
- bufferFilled = false;
if(buffer.remaining() == 0 && !eof()) {
//System.out.printf("Thread %s is waiting for a buffer fill from position %d to buffer %s%n",Thread.currentThread().getId(),position.getBlockAddress(),this);
- dispatcher.queueBlockLoad(position);
+ dispatcher.queueBlockLoad(accessPlan);
try {
lock.wait();
}
catch(InterruptedException ex) {
throw new ReviewedStingException("Interrupt occurred waiting for buffer to fill",ex);
}
-
- if(bufferFilled && buffer.remaining() == 0)
- throw new RuntimeEOFException("No more data left in InputStream");
}
}
}
+
+ /**
+ * Create an encoded BAM file pointer given the address of a BGZF block and an offset.
+ * @param blockAddress Physical address on disk of a BGZF block.
+ * @param blockOffset Offset into the uncompressed data stored in the BGZF block.
+ * @return 64-bit pointer encoded according to the BAM spec.
+ */
+ public static long makeFilePointer(final long blockAddress, final int blockOffset) {
+ return blockAddress << 16 | blockOffset;
+ }
}
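With seek() removed and the superclass changed from SeekableStream to plain InputStream, the class now only needs to honor the basic InputStream contract: read() returns -1 at end of stream rather than throwing, and read(byte[], int, int) may return fewer bytes than requested. A tiny conforming sketch for reference:

    import java.io.IOException;
    import java.io.InputStream;

    /** Minimal InputStream over a byte array, honoring the -1-at-EOF contract. */
    final class ByteArrayStream extends InputStream {
        private final byte[] data;
        private int position = 0;

        ByteArrayStream(final byte[] data) { this.data = data; }

        @Override
        public int read() throws IOException {
            // Mask to 0..255; reserve -1 exclusively for end-of-stream.
            return position < data.length ? (data[position++] & 0xFF) : -1;
        }
    }

Random access is instead achieved by submitting a fresh BAMAccessPlan, which resets the buffers and schedules the right blocks for loading.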
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java
index ab42998026..81a37e53ca 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockLoader.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011, The Broad Institute
+ * Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
@@ -70,29 +70,29 @@ public BlockLoader(final BGZFBlockLoadingDispatcher dispatcher, final FileHandle
public void run() {
for(;;) {
- SAMReaderPosition readerPosition = null;
+ BAMAccessPlan accessPlan = null;
try {
- readerPosition = dispatcher.claimNextWorkRequest();
- FileInputStream inputStream = fileHandleCache.claimFileInputStream(readerPosition.getReader());
+ accessPlan = dispatcher.claimNextWorkRequest();
+ FileInputStream inputStream = fileHandleCache.claimFileInputStream(accessPlan.getReader());
- long blockAddress = readerPosition.getBlockAddress();
+ //long blockAddress = readerPosition.getBlockAddress();
//System.out.printf("Thread %s: BlockLoader: copying bytes from %s at position %d into %s%n",Thread.currentThread().getId(),inputStream,blockAddress,readerPosition.getInputStream());
- ByteBuffer compressedBlock = readBGZFBlock(inputStream,readerPosition.getBlockAddress());
+ ByteBuffer compressedBlock = readBGZFBlock(inputStream,accessPlan.getBlockAddress());
long nextBlockAddress = position(inputStream);
- fileHandleCache.releaseFileInputStream(readerPosition.getReader(),inputStream);
+ fileHandleCache.releaseFileInputStream(accessPlan.getReader(),inputStream);
ByteBuffer block = decompress ? decompressBGZFBlock(compressedBlock) : compressedBlock;
int bytesCopied = block.remaining();
- BlockInputStream bamInputStream = readerPosition.getInputStream();
- bamInputStream.copyIntoBuffer(block,readerPosition,nextBlockAddress);
+ BlockInputStream bamInputStream = accessPlan.getInputStream();
+ bamInputStream.copyIntoBuffer(block,accessPlan,nextBlockAddress);
//System.out.printf("Thread %s: BlockLoader: copied %d bytes from %s at position %d into %s%n",Thread.currentThread().getId(),bytesCopied,inputStream,blockAddress,readerPosition.getInputStream());
}
catch(Throwable error) {
- if(readerPosition != null && readerPosition.getInputStream() != null)
- readerPosition.getInputStream().reportException(error);
+ if(accessPlan != null && accessPlan.getInputStream() != null)
+ accessPlan.getInputStream().reportException(error);
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java
index 244438a593..2bf75b0357 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java
@@ -25,6 +25,7 @@
import net.sf.samtools.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.File;
import java.io.FileInputStream;
@@ -349,7 +350,18 @@ private long[] readLongs(final int count) {
private void read(final ByteBuffer buffer) {
try {
- fileChannel.read(buffer);
+ int bytesExpected = buffer.limit();
+ int bytesRead = fileChannel.read(buffer);
+
+ // We have a rigid expectation here to read in exactly the number of bytes we've limited
+ // our buffer to -- if we read in fewer bytes than this, or encounter EOF (-1), the index
+ // must be truncated or otherwise corrupt:
+ if ( bytesRead < bytesExpected ) {
+ throw new UserException.MalformedFile(mFile, String.format("Premature end-of-file while reading BAM index file %s. " +
+ "It's likely that this file is truncated or corrupt -- " +
+ "Please try re-indexing the corresponding BAM file.",
+ mFile));
+ }
}
catch(IOException ex) {
throw new ReviewedStingException("Index: unable to read bytes from index file " + mFile);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java
new file mode 100644
index 0000000000..4005f1c321
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.datasources.reads;
+
+import net.sf.samtools.SAMRecord;
+import net.sf.samtools.util.CloseableIterator;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+
+import java.util.List;
+import java.util.NoSuchElementException;
+
+/**
+ * High efficiency filtering iterator designed to filter out reads only included
+ * in the query results due to the granularity of the BAM index.
+ *
+ * Built into the BAM index is a notion of 16kbase granularity -- an index query for
+ * two regions contained within a 16kbase chunk (say, chr1:5-10 and chr1:11-20) will
+ * return exactly the same regions within the BAM file. This iterator is optimized
+ * to subtract out reads which do not at all overlap the interval list passed to the
+ * constructor.
+ *
+ * Example:
+ * interval list: chr20:6-10
+ * Reads that would pass through the filter: chr20:6-10, chr20:1-15, chr20:1-7, chr20:8-15.
+ * Reads that would be discarded by the filter: chr20:1-5, chr20:11-15.
+ */
+class IntervalOverlapFilteringIterator implements CloseableIterator<SAMRecord> {
+ /**
+ * The wrapped iterator.
+ */
+ private CloseableIterator<SAMRecord> iterator;
+
+ /**
+ * The next read, queued up and ready to go.
+ */
+ private SAMRecord nextRead;
+
+ /**
+ * Rather than using the straight genomic bounds, filter out all mapped reads, keeping only the unmapped ones.
+ */
+ private boolean keepOnlyUnmappedReads;
+
+ /**
+ * Custom representation of interval bounds.
+ * Makes it simpler to track current position.
+ */
+ private int[] intervalContigIndices;
+ private int[] intervalStarts;
+ private int[] intervalEnds;
+
+ /**
+ * Position within the interval list.
+ */
+ private int currentBound = 0;
+
+ public IntervalOverlapFilteringIterator(CloseableIterator<SAMRecord> iterator, List<GenomeLoc> intervals) {
+ this.iterator = iterator;
+
+ // Look at the interval list to detect whether we should worry about unmapped reads.
+ // If we find a mix of mapped/unmapped intervals, throw an exception.
+ boolean foundMappedIntervals = false;
+ for(GenomeLoc location: intervals) {
+ if(! GenomeLoc.isUnmapped(location))
+ foundMappedIntervals = true;
+ keepOnlyUnmappedReads |= GenomeLoc.isUnmapped(location);
+ }
+
+
+ if(foundMappedIntervals) {
+ if(keepOnlyUnmappedReads)
+ throw new ReviewedStingException("Tried to apply IntervalOverlapFilteringIterator to a mixed of mapped and unmapped intervals. Please apply this filter to only mapped or only unmapped reads");
+ this.intervalContigIndices = new int[intervals.size()];
+ this.intervalStarts = new int[intervals.size()];
+ this.intervalEnds = new int[intervals.size()];
+ int i = 0;
+ for(GenomeLoc interval: intervals) {
+ intervalContigIndices[i] = interval.getContigIndex();
+ intervalStarts[i] = interval.getStart();
+ intervalEnds[i] = interval.getStop();
+ i++;
+ }
+ }
+
+ advance();
+ }
+
+ public boolean hasNext() {
+ return nextRead != null;
+ }
+
+ public SAMRecord next() {
+ if(nextRead == null)
+ throw new NoSuchElementException("No more reads left in this iterator.");
+ SAMRecord currentRead = nextRead;
+ advance();
+ return currentRead;
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException("Cannot remove from an IntervalOverlapFilteringIterator");
+ }
+
+
+ public void close() {
+ iterator.close();
+ }
+
+ private void advance() {
+ nextRead = null;
+
+ if(!iterator.hasNext())
+ return;
+
+ SAMRecord candidateRead = iterator.next();
+ while(nextRead == null && (keepOnlyUnmappedReads || currentBound < intervalStarts.length)) {
+ if(!keepOnlyUnmappedReads) {
+ // Mapped read filter; check against GenomeLoc-derived bounds.
+ if(readEndsOnOrAfterStartingBound(candidateRead)) {
+ // This read ends after the current interval begins.
+ // Promising, but this read must be checked against the ending bound.
+ if(readStartsOnOrBeforeEndingBound(candidateRead)) {
+ // Yes, this read is within both bounds. This must be our next read.
+ nextRead = candidateRead;
+ break;
+ }
+ else {
+ // Oops, we're past the end bound. Increment the current bound and try again.
+ currentBound++;
+ continue;
+ }
+ }
+ }
+ else {
+ // Found an unmapped read. We're done.
+ if(candidateRead.getReadUnmappedFlag()) {
+ nextRead = candidateRead;
+ break;
+ }
+ }
+
+ // No more reads available. Stop the search.
+ if(!iterator.hasNext())
+ break;
+
+ // No reasonable read found; advance the iterator.
+ candidateRead = iterator.next();
+ }
+ }
+
+ /**
+ * Check whether the read ends on or after the start of the current bound. If the read is unmapped but placed, its
+ * end will be distorted, so rely only on the alignment start.
+ * @param read The read to position-check.
+ * @return True if the read ends on or after the start of the current bound. False otherwise.
+ */
+ private boolean readEndsOnOrAfterStartingBound(final SAMRecord read) {
+ return
+ // Read ends on a later contig, or...
+ read.getReferenceIndex() > intervalContigIndices[currentBound] ||
+ // Read ends on this contig...
+ (read.getReferenceIndex() == intervalContigIndices[currentBound] &&
+ // either after this location, or...
+ (read.getAlignmentEnd() >= intervalStarts[currentBound] ||
+ // read is unmapped but positioned and alignment start is on or after this start point.
+ (read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound])));
+ }
+
+ /**
+ * Check whether the read lies before the end of the current bound.
+ * @param read The read to position-check.
+ * @return True if the read starts on or before the end of the current bound. False otherwise.
+ */
+ private boolean readStartsOnOrBeforeEndingBound(final SAMRecord read) {
+ return
+ // Read starts on a prior contig, or...
+ read.getReferenceIndex() < intervalContigIndices[currentBound] ||
+ // Read starts on this contig and the alignment start is registered before this end point.
+ (read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]);
+ }
+}
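The two bound checks above implement plain closed-interval overlap against a sorted interval list. As a worked instance of the class javadoc's example (interval chr20:6-10): read chr20:1-7 passes because its end (7) >= the interval start (6) and its start (1) <= the interval end (10); read chr20:11-15 fails the second test. A standalone sketch of that test:

    /** Illustrative closed-interval overlap test matching the javadoc's example. */
    final class OverlapExample {
        static boolean overlaps(final int readStart, final int readEnd,
                                final int intervalStart, final int intervalEnd) {
            return readEnd >= intervalStart && readStart <= intervalEnd;
        }

        public static void main(String[] args) {
            System.out.println(overlaps(1, 7, 6, 10));   // true:  chr20:1-7  vs chr20:6-10
            System.out.println(overlaps(11, 15, 6, 10)); // false: chr20:11-15 vs chr20:6-10
        }
    }

The production iterator splits the same test across two methods so it can advance currentBound when a read falls past the current interval's end.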
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java
index 8d73b1b158..96b55674ae 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java
@@ -36,7 +36,7 @@
*/
public class ReadShard extends Shard {
/**
- * What is the maximum number of reads which should go into a read shard.
+ * What is the maximum number of reads per BAM file which should go into a read shard.
*/
public static int MAX_READS = 10000;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
index b215763b53..bf7afe4f0d 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
@@ -39,7 +39,6 @@
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.iterators.*;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
-import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.SimpleTimer;
@@ -47,6 +46,8 @@
import org.broadinstitute.sting.utils.baq.BAQSamIterator;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.recalibration.BQSRSamIterator;
+import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;
import java.io.File;
@@ -202,6 +203,7 @@ public SAMDataSource(
BAQ.CalculationMode.OFF,
BAQ.QualityMode.DONT_MODIFY,
null, // no BAQ
+ null, // no BQSR
(byte) -1);
}
@@ -238,6 +240,7 @@ public SAMDataSource(
BAQ.CalculationMode cmode,
BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
+ BaseRecalibration bqsrApplier,
byte defaultBaseQualities) {
this.readMetrics = new ReadMetrics();
this.genomeLocParser = genomeLocParser;
@@ -310,6 +313,7 @@ public SAMDataSource(
cmode,
qmode,
refReader,
+ bqsrApplier,
defaultBaseQualities);
// cache the read group id (original) -> read group id (merged)
@@ -555,7 +559,7 @@ private void initializeReaderPositions(SAMReaders readers) {
*/
private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) {
// Set up merging to dynamically merge together multiple BAMs.
- MergingSamRecordIterator mergingIterator = readers.createMergingIterator();
+ Map> iteratorMap = new HashMap>();
for(SAMReaderID id: getReaderIDs()) {
CloseableIterator<SAMRecord> iterator = null;
@@ -567,15 +571,23 @@ private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean en
if(threadAllocation.getNumIOThreads() > 0) {
BlockInputStream inputStream = readers.getInputStream(id);
- inputStream.submitAccessPlan(new SAMReaderPosition(id,inputStream,(GATKBAMFileSpan)shard.getFileSpans().get(id)));
+ inputStream.submitAccessPlan(new BAMAccessPlan(id, inputStream, (GATKBAMFileSpan) shard.getFileSpans().get(id)));
+ BAMRecordCodec codec = new BAMRecordCodec(getHeader(id),factory);
+ codec.setInputStream(inputStream);
+ iterator = new BAMCodecIterator(inputStream,readers.getReader(id),codec);
}
- iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id));
+ else {
+ iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id));
+ }
+
iterator = new MalformedBAMErrorReformatingIterator(id.samFile, iterator);
if(shard.getGenomeLocs().size() > 0)
iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
- mergingIterator.addIterator(readers.getReader(id),iterator);
+ iteratorMap.put(readers.getReader(id), iterator);
}
+ MergingSamRecordIterator mergingIterator = readers.createMergingIterator(iteratorMap);
+
return applyDecoratingIterators(shard.getReadMetrics(),
enableVerification,
readProperties.useOriginalBaseQualities(),
@@ -586,9 +598,53 @@ private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean en
readProperties.getBAQCalculationMode(),
readProperties.getBAQQualityMode(),
readProperties.getRefReader(),
+ readProperties.getBQSRApplier(),
readProperties.defaultBaseQualities());
}
+ private class BAMCodecIterator implements CloseableIterator<SAMRecord> {
+ private final BlockInputStream inputStream;
+ private final SAMFileReader reader;
+ private final BAMRecordCodec codec;
+ private SAMRecord nextRead;
+
+ private BAMCodecIterator(final BlockInputStream inputStream, final SAMFileReader reader, final BAMRecordCodec codec) {
+ this.inputStream = inputStream;
+ this.reader = reader;
+ this.codec = codec;
+ advance();
+ }
+
+ public boolean hasNext() {
+ return nextRead != null;
+ }
+
+ public SAMRecord next() {
+ if(!hasNext())
+ throw new NoSuchElementException("Unable to retrieve next record from BAMCodecIterator; input stream is empty");
+ SAMRecord currentRead = nextRead;
+ advance();
+ return currentRead;
+ }
+
+ public void close() {
+ // NO-OP.
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException("Unable to remove from BAMCodecIterator");
+ }
+
+ private void advance() {
+ final long startCoordinate = inputStream.getFilePointer();
+ nextRead = codec.decode();
+ final long stopCoordinate = inputStream.getFilePointer();
+
+ if(reader != null && nextRead != null)
+ PicardNamespaceUtils.setFileSource(nextRead,new SAMFileSource(reader,new GATKBAMFileSpan(new GATKChunk(startCoordinate,stopCoordinate))));
+ }
+ }
+
/**
* Filter reads based on user-specified criteria.
*
@@ -612,9 +668,10 @@ protected StingSAMIterator applyDecoratingIterators(ReadMetrics readMetrics,
BAQ.CalculationMode cmode,
BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
+ BaseRecalibration bqsrApplier,
byte defaultBaseQualities) {
- if ( useOriginalBaseQualities || defaultBaseQualities >= 0 )
- // only wrap if we are replacing the original qualitiies or using a default base quality
+ if (useOriginalBaseQualities || defaultBaseQualities >= 0)
+ // only wrap if we are replacing the original qualities or using a default base quality
wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities);
// NOTE: this (and other filtering) should be done before on-the-fly sorting
@@ -627,6 +684,9 @@ protected StingSAMIterator applyDecoratingIterators(ReadMetrics readMetrics,
if (!noValidationOfReadOrder && enableVerification)
wrappedIterator = new VerifyingSamIterator(genomeLocParser,wrappedIterator);
+ if (bqsrApplier != null)
+ wrappedIterator = new BQSRSamIterator(wrappedIterator, bqsrApplier);
+
if (cmode != BAQ.CalculationMode.OFF)
wrappedIterator = new BAQSamIterator(refReader, wrappedIterator, cmode, qmode);
@@ -796,8 +856,13 @@ public String getReadGroupId(final SAMReaderID readerID, final String originalRe
return headerMerger.getReadGroupId(header,originalReadGroupID);
}
- public MergingSamRecordIterator createMergingIterator() {
- return new MergingSamRecordIterator(headerMerger,readers.values(),true);
+ /**
+ * Creates a new merging iterator from the given map, with the given header.
+ * @param iteratorMap A map of readers to iterators.
+ * @return An iterator which will merge those individual iterators.
+ */
+ public MergingSamRecordIterator createMergingIterator(final Map> iteratorMap) {
+ return new MergingSamRecordIterator(headerMerger,iteratorMap,true);
}
/**
@@ -863,12 +928,9 @@ public ReaderInitializer(final SAMReaderID readerID) {
public ReaderInitializer call() {
final File indexFile = findIndexFile(readerID.samFile);
try {
- if (threadAllocation.getNumIOThreads() > 0) {
+ if (threadAllocation.getNumIOThreads() > 0)
blockInputStream = new BlockInputStream(dispatcher,readerID,false);
- reader = new SAMFileReader(blockInputStream,indexFile,false);
- }
- else
- reader = new SAMFileReader(readerID.samFile,indexFile,false);
+ reader = new SAMFileReader(readerID.samFile,indexFile,false);
} catch ( RuntimeIOException e ) {
if ( e.getCause() != null && e.getCause() instanceof FileNotFoundException )
throw new UserException.CouldNotReadInputFile(readerID.samFile, e);
@@ -927,167 +989,6 @@ public SAMRecord next() {
*/
private class ReadGroupMapping extends HashMap<String,String> {}
- /**
- * Filters out reads that do not overlap the current GenomeLoc.
- * Note the custom implementation: BAM index querying returns all reads that could
- * possibly overlap the given region (and quite a few extras). In order not to drag
- * down performance, this implementation is highly customized to its task.
- */
- private class IntervalOverlapFilteringIterator implements CloseableIterator {
- /**
- * The wrapped iterator.
- */
- private CloseableIterator iterator;
-
- /**
- * The next read, queued up and ready to go.
- */
- private SAMRecord nextRead;
-
- /**
- * Rather than using the straight genomic bounds, use filter out only mapped reads.
- */
- private boolean keepOnlyUnmappedReads;
-
- /**
- * Custom representation of interval bounds.
- * Makes it simpler to track current position.
- */
- private int[] intervalContigIndices;
- private int[] intervalStarts;
- private int[] intervalEnds;
-
- /**
- * Position within the interval list.
- */
- private int currentBound = 0;
-
- public IntervalOverlapFilteringIterator(CloseableIterator iterator, List intervals) {
- this.iterator = iterator;
-
- // Look at the interval list to detect whether we should worry about unmapped reads.
- // If we find a mix of mapped/unmapped intervals, throw an exception.
- boolean foundMappedIntervals = false;
- for(GenomeLoc location: intervals) {
- if(! GenomeLoc.isUnmapped(location))
- foundMappedIntervals = true;
- keepOnlyUnmappedReads |= GenomeLoc.isUnmapped(location);
- }
-
-
- if(foundMappedIntervals) {
- if(keepOnlyUnmappedReads)
- throw new ReviewedStingException("Tried to apply IntervalOverlapFilteringIterator to a mixed of mapped and unmapped intervals. Please apply this filter to only mapped or only unmapped reads");
- this.intervalContigIndices = new int[intervals.size()];
- this.intervalStarts = new int[intervals.size()];
- this.intervalEnds = new int[intervals.size()];
- int i = 0;
- for(GenomeLoc interval: intervals) {
- intervalContigIndices[i] = interval.getContigIndex();
- intervalStarts[i] = interval.getStart();
- intervalEnds[i] = interval.getStop();
- i++;
- }
- }
-
- advance();
- }
-
- public boolean hasNext() {
- return nextRead != null;
- }
-
- public SAMRecord next() {
- if(nextRead == null)
- throw new NoSuchElementException("No more reads left in this iterator.");
- SAMRecord currentRead = nextRead;
- advance();
- return currentRead;
- }
-
- public void remove() {
- throw new UnsupportedOperationException("Cannot remove from an IntervalOverlapFilteringIterator");
- }
-
-
- public void close() {
- iterator.close();
- }
-
- private void advance() {
- nextRead = null;
-
- if(!iterator.hasNext())
- return;
-
- SAMRecord candidateRead = iterator.next();
- while(nextRead == null && (keepOnlyUnmappedReads || currentBound < intervalStarts.length)) {
- if(!keepOnlyUnmappedReads) {
- // Mapped read filter; check against GenomeLoc-derived bounds.
- if(readEndsOnOrAfterStartingBound(candidateRead)) {
- // This read ends after the current interval begins.
- // Promising, but this read must be checked against the ending bound.
- if(readStartsOnOrBeforeEndingBound(candidateRead)) {
- // Yes, this read is within both bounds. This must be our next read.
- nextRead = candidateRead;
- break;
- }
- else {
- // Oops, we're past the end bound. Increment the current bound and try again.
- currentBound++;
- continue;
- }
- }
- }
- else {
- // Found an unmapped read. We're done.
- if(candidateRead.getReadUnmappedFlag()) {
- nextRead = candidateRead;
- break;
- }
- }
-
- // No more reads available. Stop the search.
- if(!iterator.hasNext())
- break;
-
- // No reasonable read found; advance the iterator.
- candidateRead = iterator.next();
- }
- }
-
- /**
- * Check whether the read lies after the start of the current bound. If the read is unmapped but placed, its
- * end will be distorted, so rely only on the alignment start.
- * @param read The read to position-check.
- * @return True if the read starts after the current bounds. False otherwise.
- */
- private boolean readEndsOnOrAfterStartingBound(final SAMRecord read) {
- return
- // Read ends on a later contig, or...
- read.getReferenceIndex() > intervalContigIndices[currentBound] ||
- // Read ends of this contig...
- (read.getReferenceIndex() == intervalContigIndices[currentBound] &&
- // either after this location, or...
- (read.getAlignmentEnd() >= intervalStarts[currentBound] ||
- // read is unmapped but positioned and alignment start is on or after this start point.
- (read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound])));
- }
-
- /**
- * Check whether the read lies before the end of the current bound.
- * @param read The read to position-check.
- * @return True if the read starts after the current bounds. False otherwise.
- */
- private boolean readStartsOnOrBeforeEndingBound(final SAMRecord read) {
- return
- // Read starts on a prior contig, or...
- read.getReferenceIndex() < intervalContigIndices[currentBound] ||
- // Read starts on this contig and the alignment start is registered before this end point.
- (read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]);
- }
- }
-
/**
* Locates the index file alongside the given BAM, if present.
 * TODO: This is currently a hatchet job that reaches into Picard and pulls out its index file locator. Replace with something more permanent.
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java
index 39e1bdc726..433c7d82fb 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java
@@ -11,7 +11,7 @@
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
-import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.threading.ThreadPoolMonitor;
import java.util.Collection;
@@ -102,7 +102,7 @@ public Object execute( Walker walker, Iterable shardStrategy ) {
while (isShardTraversePending() || isTreeReducePending()) {
// Check for errors during execution.
if(hasTraversalErrorOccurred())
- throw new ReviewedStingException("An error has occurred during the traversal.",getTraversalError());
+ throw getTraversalError();
// Too many files sitting around taking up space? Merge them.
if (isMergeLimitExceeded())
@@ -345,10 +345,15 @@ private synchronized boolean hasTraversalErrorOccurred() {
return error != null;
}
- private synchronized Throwable getTraversalError() {
+ private synchronized StingException getTraversalError() {
if(!hasTraversalErrorOccurred())
throw new ReviewedStingException("User has attempted to retrieve a traversal error when none exists");
- return error;
+
+ // If the error is already a StingException, pass it along as is. Otherwise, wrap it.
+ if(error instanceof StingException)
+ return (StingException)error;
+ else
+        return new ReviewedStingException("An error occurred during the traversal.", error);
}
/**
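
The change above converts getTraversalError() from blind wrapping to wrap-or-rethrow: an error that is already a StingException (a UserException, for instance) propagates with its type intact, while anything else is wrapped in a ReviewedStingException. A minimal, self-contained sketch of the same pattern, using hypothetical exception names rather than the GATK classes:

class BaseException extends RuntimeException {
    BaseException(String msg, Throwable cause) { super(msg, cause); }
}

// Stand-in for ReviewedStingException; not the actual GATK class.
class InternalException extends BaseException {
    InternalException(String msg, Throwable cause) { super(msg, cause); }
}

final class TraversalErrors {
    // Preserve the original type when it already carries the right semantics;
    // otherwise wrap it so callers always receive a BaseException.
    static BaseException wrapOrPass(Throwable error) {
        if (error instanceof BaseException)
            return (BaseException) error;
        return new InternalException("An error occurred during the traversal.", error);
    }
}

The payoff is that execute() can simply rethrow the result of getTraversalError() without flattening user-facing errors into internal ones.
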
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
index ff5e1064bd..16487054bc 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
@@ -10,6 +10,7 @@
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
import org.broadinstitute.sting.gatk.io.OutputTracker;
+import org.broadinstitute.sting.gatk.traversals.TraverseActiveRegions;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.SampleUtils;
@@ -55,7 +56,6 @@ public Object execute(Walker walker, Iterable shardStrategy) {
traversalEngine.startTimersIfNecessary();
if(shard.getShardType() == Shard.ShardType.LOCUS) {
- LocusWalker lWalker = (LocusWalker)walker;
WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(),
getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine));
for(WindowMaker.WindowMakerIterator iterator: windowMaker) {
@@ -77,6 +77,12 @@ public Object execute(Walker walker, Iterable shardStrategy) {
done = walker.isDone();
}
+ // Special function call to empty out the work queue. Ugly for now but will be cleaned up when we eventually push this functionality more into the engine
+ if( traversalEngine instanceof TraverseActiveRegions ) {
+ final Object result = ((TraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit());
+ accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator
+ }
+
Object result = accumulator.finishTraversal();
printOnTraversalDone(result);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java
index d013db7e84..5080997084 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java
@@ -128,6 +128,8 @@ protected MicroScheduler(GenomeAnalysisEngine engine, Walker walker, SAMDataSour
traversalEngine = new TraverseDuplicates();
} else if (walker instanceof ReadPairWalker) {
traversalEngine = new TraverseReadPairs();
+ } else if (walker instanceof ActiveRegionWalker) {
+ traversalEngine = new TraverseActiveRegions();
} else {
throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type.");
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java
index d1f5d80daf..da11d36ddd 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java
@@ -17,9 +17,21 @@
import java.util.NoSuchElementException;
/**
- * Buffer shards of data which may or may not contain multiple loci into
- * iterators of all data which cover an interval. Its existence is an homage
- * to Mark's stillborn WindowMaker, RIP 2009.
+ * Transforms an iterator of reads which overlap the given interval list into an iterator of covered single-base loci
+ * completely contained within the interval list. To do this, it creates a LocusIteratorByState which will emit a single-bp
+ * locus for every base covered by the read iterator, then uses the WindowMakerIterator.advance() to filter down that stream of
+ * loci to only those covered by the given interval list.
+ *
+ * Example:
+ * Incoming stream of reads: A:chr20:1-5, B:chr20:2-6, C:chr20:2-7, D:chr20:3-8, E:chr20:5-10
+ * Incoming intervals: chr20:3-7
+ *
+ * Locus iterator by state will produce the following stream of data:
+ * chr20:1 {A}, chr20:2 {A,B,C}, chr20:3 {A,B,C,D}, chr20:4 {A,B,C,D}, chr20:5 {A,B,C,D,E},
+ * chr20:6 {B,C,D,E}, chr20:7 {C,D,E}, chr20:8 {D,E}, chr20:9 {E}, chr20:10 {E}
+ *
+ * WindowMakerIterator will then filter the incoming stream, emitting the following stream:
+ * chr20:3 {A,B,C,D}, chr20:4 {A,B,C,D}, chr20:5 {A,B,C,D,E}, chr20:6 {B,C,D,E}, chr20:7 {C,D,E}
*
* @author mhanna
* @version 0.1
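
The new javadoc example above reduces to a per-locus containment test against the interval list. A minimal sketch of that test, with hypothetical Locus and Interval classes standing in for AlignmentContext and GenomeLoc (these are not the GATK types):

import java.util.List;

// Hypothetical stand-ins for GenomeLoc and AlignmentContext.
final class Interval {
    final String contig; final int start; final int stop;
    Interval(String contig, int start, int stop) { this.contig = contig; this.start = start; this.stop = stop; }
}

final class Locus {
    final String contig; final int pos;
    Locus(String contig, int pos) { this.contig = contig; this.pos = pos; }
}

final class WindowFilter {
    // Emit a locus only if some interval contains it -- the filtering step
    // WindowMakerIterator.advance() applies to the LocusIteratorByState stream.
    static boolean covered(Locus locus, List<Interval> intervals) {
        for (Interval iv : intervals) {
            if (iv.contig.equals(locus.contig) && locus.pos >= iv.start && locus.pos <= iv.stop)
                return true;
        }
        return false;
    }
}

Applied to the documented stream chr20:1..10 with the interval chr20:3-7, covered() keeps exactly the loci chr20:3 through chr20:7.
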
diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java
index 75e787e05a..a47c61d0b8 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java
@@ -49,9 +49,13 @@
import java.util.*;
-/** Iterator that traverses a SAM File, accumulating information on a per-locus basis */
+/**
+ * Iterator that traverses a SAM File, accumulating information on a per-locus basis
+ */
public class LocusIteratorByState extends LocusIterator {
- /** our log, which we want to capture anything from this class */
+ /**
+ * our log, which we want to capture anything from this class
+ */
private static Logger logger = Logger.getLogger(LocusIteratorByState.class);
// -----------------------------------------------------------------------------------------------------------------
@@ -70,16 +74,15 @@ public class LocusIteratorByState extends LocusIterator {
static private class SAMRecordState {
SAMRecord read;
- int readOffset = -1; // how far are we offset from the start of the read bases?
- int genomeOffset = -1; // how far are we offset from the alignment start on the genome?
+ int readOffset = -1; // how far are we offset from the start of the read bases?
+ int genomeOffset = -1; // how far are we offset from the alignment start on the genome?
Cigar cigar = null;
int cigarOffset = -1;
CigarElement curElement = null;
int nCigarElements = 0;
- // how far are we into a single cigarElement
- int cigarElementCounter = -1;
+ int cigarElementCounter = -1; // how far are we into a single cigarElement
// The logical model for generating extended events is as follows: the "record state" implements the traversal
// along the reference; thus stepForwardOnGenome() returns on every and only on actual reference bases. This
@@ -89,17 +92,19 @@ static private class SAMRecordState {
// stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended
// events immediately preceding the current reference base).
- boolean generateExtendedEvents = true; // should we generate an additional, special pile for indels between the ref bases?
- // the only purpose of this flag is to shield away a few additional lines of code
- // when extended piles are not needed, it may not be even worth it...
- byte[] insertedBases = null; // remember full inserted sequence if we are generating piles of extended events (indels)
- int eventLength = -1; // will be set to the length of insertion/deletion if we are generating piles of extended events
- byte eventDelayedFlag = 0; // will be set to non-0 if there was an event (indel) right before the
- // current base on the ref. We use a counter-like variable here since clearing the indel event is
- // delayed by one base, so we need to remember how long ago we have seen the actual event
- int eventStart = -1; // where on the read the extended event starts (i.e. the last position on the read prior to the
- // event, or -1 if alignment starts with an insertion); this one is easy to recompute on the fly,
- // we cache it here mainly for convenience
+ boolean generateExtendedEvents = true; // should we generate an additional, special pile for indels between the ref bases?
+ // the only purpose of this flag is to shield away a few additional lines of code
+ // when extended piles are not needed, it may not be even worth it...
+
+ byte[] insertedBases = null; // remember full inserted sequence if we are generating piles of extended events (indels)
+ int eventLength = -1; // will be set to the length of insertion/deletion if we are generating piles of extended events
+ byte eventDelayedFlag = 0; // will be set to non-0 if there was an event (indel) right before the
+ // current base on the ref. We use a counter-like variable here since clearing the indel event is
+ // delayed by one base, so we need to remember how long ago we have seen the actual event
+
+ int eventStart = -1; // where on the read the extended event starts (i.e. the last position on the read prior to the
+ // event, or -1 if alignment starts with an insertion); this one is easy to recompute on the fly,
+ // we cache it here mainly for convenience
public SAMRecordState(SAMRecord read, boolean extended) {
@@ -111,23 +116,31 @@ public SAMRecordState(SAMRecord read, boolean extended) {
//System.out.printf("Creating a SAMRecordState: %s%n", this);
}
- public SAMRecord getRead() { return read; }
+ public SAMRecord getRead() {
+ return read;
+ }
/**
* What is our current offset in the read's bases that aligns us with the reference genome?
*
* @return
*/
- public int getReadOffset() { return readOffset; }
+ public int getReadOffset() {
+ return readOffset;
+ }
/**
* What is the current offset w.r.t. the alignment state that aligns us to the readOffset?
*
* @return
*/
- public int getGenomeOffset() { return genomeOffset; }
+ public int getGenomeOffset() {
+ return genomeOffset;
+ }
- public int getGenomePosition() { return read.getAlignmentStart() + getGenomeOffset(); }
+ public int getGenomePosition() {
+ return read.getAlignmentStart() + getGenomeOffset();
+ }
public GenomeLoc getLocation(GenomeLocParser genomeLocParser) {
return genomeLocParser.createGenomeLoc(read.getReferenceName(), getGenomePosition());
@@ -137,52 +150,66 @@ public CigarOperator getCurrentCigarOperator() {
return curElement.getOperator();
}
- /** Returns true if we just stepped over insertion/into a deletion prior to the last return from stepForwardOnGenome.
+ /**
+ * Returns true if we just stepped over insertion/into a deletion prior to the last return from stepForwardOnGenome.
*
* @return
*/
public boolean hadIndel() {
- return ( eventLength > 0 );
+ return (eventLength > 0);
}
- public int getEventLength() { return eventLength; }
+ public int getEventLength() {
+ return eventLength;
+ }
- public byte[] getEventBases() { return insertedBases; }
+ public byte[] getEventBases() {
+ return insertedBases;
+ }
- public int getReadEventStartOffset() { return eventStart; }
+ public int getReadEventStartOffset() {
+ return eventStart;
+ }
public String toString() {
return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement);
}
+ public CigarElement peekForwardOnGenome() {
+ return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement );
+ }
+
public CigarOperator stepForwardOnGenome() {
// we enter this method with readOffset = index of the last processed base on the read
// (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion
- if ( curElement == null || ++cigarElementCounter > curElement.getLength() ) {
+ if (curElement == null || ++cigarElementCounter > curElement.getLength()) {
cigarOffset++;
- if ( cigarOffset < nCigarElements ) {
+ if (cigarOffset < nCigarElements) {
curElement = cigar.getCigarElement(cigarOffset);
cigarElementCounter = 0;
// next line: guards against cigar elements of length 0; when new cigar element is retrieved,
// we reenter in order to re-check cigarElementCounter against curElement's length
return stepForwardOnGenome();
} else {
+ if (curElement != null && curElement.getOperator() == CigarOperator.D)
+                    throw new UserException.MalformedBAM(read, "Read ends with deletion. Cigar: " + read.getCigarString());
+
// Reads that contain indels model the genomeOffset as the following base in the reference. Because
// we fall into this else block only when indels end the read, increment genomeOffset such that the
// current offset of this read is the next ref base after the end of the indel. This position will
// model a point on the reference somewhere after the end of the read.
genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here:
- // we do step forward on the ref, and by returning null we also indicate that we are past the read end.
+ // we do step forward on the ref, and by returning null we also indicate that we are past the read end.
- if ( generateExtendedEvents && eventDelayedFlag > 0 ) {
+ if (generateExtendedEvents && eventDelayedFlag > 0) {
// if we had an indel right before the read ended (i.e. insertion was the last cigar element),
// we keep it until next reference base; then we discard it and this will allow the LocusIterator to
// finally discard this read
eventDelayedFlag--;
- if ( eventDelayedFlag == 0 ) {
+ if (eventDelayedFlag == 0) {
eventLength = -1; // reset event when we are past it
insertedBases = null;
eventStart = -1;
@@ -193,34 +220,37 @@ public CigarOperator stepForwardOnGenome() {
}
}
-
boolean done = false;
switch (curElement.getOperator()) {
- case H : // ignore hard clips
- case P : // ignore pads
+ case H: // ignore hard clips
+ case P: // ignore pads
cigarElementCounter = curElement.getLength();
break;
- case I : // insertion w.r.t. the reference
- if ( generateExtendedEvents ) {
+ case I: // insertion w.r.t. the reference
+ if (generateExtendedEvents) {
// we see insertions only once, when we step right onto them; the position on the read is scrolled
// past the insertion right after that
- if ( eventDelayedFlag > 1 ) throw new UserException.MalformedBAM(read, "Adjacent I/D events in read "+read.getReadName());
- insertedBases = Arrays.copyOfRange(read.getReadBases(),readOffset+1,readOffset+1+curElement.getLength());
- eventLength = curElement.getLength() ;
+ if (eventDelayedFlag > 1)
+ throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString()));
+ insertedBases = Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + curElement.getLength());
+ eventLength = curElement.getLength();
eventStart = readOffset;
eventDelayedFlag = 2; // insertion causes re-entry into stepForwardOnGenome, so we set the delay to 2
// System.out.println("Inserted "+(new String (insertedBases)) +" after "+readOffset);
} // continue onto the 'S' case !
- case S : // soft clip
+ case S: // soft clip
cigarElementCounter = curElement.getLength();
readOffset += curElement.getLength();
break;
- case D : // deletion w.r.t. the reference
- if ( generateExtendedEvents ) {
- if ( cigarElementCounter == 1) {
+ case D: // deletion w.r.t. the reference
+ if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
+ throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString());
+ if (generateExtendedEvents) {
+ if (cigarElementCounter == 1) {
// generate an extended event only if we just stepped into the deletion (i.e. don't
// generate the event at every deleted position on the ref, that's what cigarElementCounter==1 is for!)
- if ( eventDelayedFlag > 1 ) throw new UserException.MalformedBAM(read, "Adjacent I/D events in read "+read.getReadName());
+ if (eventDelayedFlag > 1)
+ throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString()));
eventLength = curElement.getLength();
eventDelayedFlag = 2; // deletion on the ref causes an immediate return, so we have to delay by 1 only
eventStart = readOffset;
@@ -232,26 +262,27 @@ public CigarOperator stepForwardOnGenome() {
genomeOffset++;
done = true;
break;
- case N : // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
+ case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
genomeOffset++;
done = true;
break;
- case M :
+ case M:
readOffset++;
genomeOffset++;
done = true;
break;
- default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator());
+ default:
+ throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator());
}
- if ( generateExtendedEvents ) {
- if ( eventDelayedFlag > 0 && done ) {
- // if we did make a successful step on the ref, decrement delayed flag. If, upon the decrementthe,
+ if (generateExtendedEvents) {
+ if (eventDelayedFlag > 0 && done) {
+                // if we did make a successful step on the ref, decrement the delayed flag. If, upon decrementing,
// the flag is 1, we are standing on the reference base right after the indel (so we have to keep it).
// Otherwise, we are away from the previous indel and have to clear our memories...
eventDelayedFlag--; // when we notice an indel, we set delayed flag to 2, so now
- // if eventDelayedFlag == 1, an indel occured right before the current base
- if ( eventDelayedFlag == 0 ) {
+                // if eventDelayedFlag == 1, an indel occurred right before the current base
+ if (eventDelayedFlag == 0) {
eventLength = -1; // reset event when we are past it
insertedBases = null;
eventStart = -1;
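
One way to summarize the bookkeeping in the switch above: M consumes both a read base and a reference base, I and S consume read bases only, D and N consume reference bases only, and H and P consume neither. The sketch below applies whole cigar elements at once for brevity (stepForwardOnGenome() itself advances M/D/N one base per call) and uses plain chars in place of the net.sf.samtools CigarOperator enum:

// Illustrative only: element-at-a-time version of the offset bookkeeping
// performed base-by-base in stepForwardOnGenome().
final class CigarWalk {
    int readOffset = -1;   // offset of the last consumed base on the read
    int genomeOffset = -1; // offset from the alignment start on the reference

    void apply(char op, int length) {
        switch (op) {
            case 'H': case 'P':   // hard clips and pads consume nothing
                break;
            case 'I': case 'S':   // insertions and soft clips consume read bases only
                readOffset += length;
                break;
            case 'D': case 'N':   // deletions and reference skips consume reference bases only
                genomeOffset += length;
                break;
            case 'M':             // (mis)matches consume both
                readOffset += length;
                genomeOffset += length;
                break;
            default:
                throw new IllegalStateException("Unknown cigar op: " + op);
        }
    }
}
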
@@ -274,15 +305,15 @@ public CigarOperator stepForwardOnGenome() {
//
// -----------------------------------------------------------------------------------------------------------------
- public LocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples ) {
+ public LocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples) {
this.readInfo = readInformation;
this.genomeLocParser = genomeLocParser;
this.samples = new ArrayList(samples);
- this.readStates = new ReadStateManager(samIterator,readInformation.getDownsamplingMethod());
+ this.readStates = new ReadStateManager(samIterator, readInformation.getDownsamplingMethod());
// currently the GATK expects this LocusIteratorByState to accept empty sample lists, when
// there's no read data. So we need to throw this error only when samIterator.hasNext() is true
- if ( this.samples.isEmpty() && samIterator.hasNext() ) {
+ if (this.samples.isEmpty() && samIterator.hasNext()) {
throw new IllegalArgumentException("samples list must not be empty");
}
}
@@ -322,7 +353,7 @@ private GenomeLoc getLocation() {
// -----------------------------------------------------------------------------------------------------------------
public AlignmentContext next() {
lazyLoadNextAlignmentContext();
- if(!hasNext())
+ if (!hasNext())
throw new NoSuchElementException("LocusIteratorByState: out of elements.");
AlignmentContext currentAlignmentContext = nextAlignmentContext;
nextAlignmentContext = null;
@@ -334,7 +365,7 @@ public AlignmentContext next() {
* nextAlignmentContext MUST BE null in order for this method to advance to the next entry.
*/
private void lazyLoadNextAlignmentContext() {
- while(nextAlignmentContext == null && readStates.hasNext()) {
+ while (nextAlignmentContext == null && readStates.hasNext()) {
// this call will set hasExtendedEvents to true if it picks up a read with indel right before the current position on the ref:
readStates.collectPendingReads();
@@ -350,14 +381,14 @@ private void lazyLoadNextAlignmentContext() {
// In this case, the subsequent call to next() will emit the normal pileup at the current base
// and shift the position.
if (readInfo.generateExtendedEvents() && hasExtendedEvents) {
- Map fullExtendedEventPileup = new HashMap();
+ Map fullExtendedEventPileup = new HashMap();
// get current location on the reference and decrement it by 1: the indels we just stepped over
// are associated with the *previous* reference base
- GenomeLoc loc = genomeLocParser.incPos(getLocation(),-1);
+ GenomeLoc loc = genomeLocParser.incPos(getLocation(), -1);
boolean hasBeenSampled = false;
- for(final String sample: samples) {
+ for (final String sample : samples) {
Iterator iterator = readStates.iterator(sample);
List indelPile = new ArrayList(readStates.size(sample));
hasBeenSampled |= loc.getStart() <= readStates.getDownsamplingExtent(sample);
@@ -368,103 +399,117 @@ private void lazyLoadNextAlignmentContext() {
nMQ0Reads = 0;
int maxDeletionLength = 0;
- while(iterator.hasNext()) {
- SAMRecordState state = iterator.next();
- if ( state.hadIndel() ) {
+ while (iterator.hasNext()) {
+ final SAMRecordState state = iterator.next();
+ final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
+ final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
+ final int readOffset = state.getReadOffset(); // the base offset on this read
+ final int eventStartOffset = state.getReadEventStartOffset(); // this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began.
+ final int eventLength = state.getEventLength();
+
+ if (op == CigarOperator.N) // N's are never added to any pileup
+ continue;
+
+ if (state.hadIndel()) { // this read has an indel associated with the previous position on the ref
size++;
- if ( state.getEventBases() == null ) {
+ ExtendedEventPileupElement pileupElement;
+ if (state.getEventBases() == null) { // Deletion event
nDeletions++;
- maxDeletionLength = Math.max(maxDeletionLength,state.getEventLength());
+ maxDeletionLength = Math.max(maxDeletionLength, state.getEventLength());
+ pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength);
}
- else nInsertions++;
- indelPile.add ( new ExtendedEventPileupElement((GATKSAMRecord) state.getRead(), state.getReadEventStartOffset(), state.getEventLength(), state.getEventBases()) );
-
- } else {
- // HACK: The readahead mechanism for LocusIteratorByState will effectively read past the current position
- // and add in extra reads that start after this indel. Skip these reads.
- // My belief at this moment after empirically looking at read->ref alignment is that, in a cigar string
- // like 1I76M, the first insertion is between alignment start-1 and alignment start, so we shouldn't be
- // filtering these out.
- // TODO: UPDATE! Eric tells me that we *might* want reads adjacent to the pileup in the pileup. Strike this block.
- //if(state.getRead().getAlignmentStart() > loc.getStart())
- // continue;
-
- if ( state.getCurrentCigarOperator() != CigarOperator.N ) {
- // this read has no indel associated with the previous position on the ref;
- // we count this read in only if it has actual bases, not N span...
- if ( state.getCurrentCigarOperator() != CigarOperator.D || readInfo.includeReadsWithDeletionAtLoci() ) {
-
- // if cigar operator is D but the read has no extended event reported (that's why we ended
- // up in this branch), it means that we are currently inside a deletion that started earlier;
- // we count such reads (with a longer deletion spanning over a deletion at the previous base we are
- // about to report) only if includeReadsWithDeletionAtLoci is true.
- size++;
- indelPile.add ( new ExtendedEventPileupElement((GATKSAMRecord) state.getRead(), state.getReadOffset()-1, -1) // length=-1 --> noevent
- );
- }
+ else { // Insertion event
+ nInsertions++;
+ pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength, state.getEventBases());
}
+ if (read.getMappingQuality() == 0)
+ nMQ0Reads++;
+
+ indelPile.add(pileupElement);
}
- if ( state.getRead().getMappingQuality() == 0 ) {
- nMQ0Reads++;
+
+                    // this read has no indel associated with the previous position, so add it
+                    // to the pileup as a NOEVENT: either a deletion that didn't start here
+                    // (therefore, not an extended event) or a (mis)match, which we add as a no-event.
+ else if (op != CigarOperator.D || readInfo.includeReadsWithDeletionAtLoci()) {
+ size++;
+ indelPile.add(new ExtendedEventPileupElement((GATKSAMRecord) state.getRead(), readOffset));
+ if (read.getMappingQuality() == 0)
+ nMQ0Reads++;
}
}
- if( indelPile.size() != 0 ) fullExtendedEventPileup.put(sample,new ReadBackedExtendedEventPileupImpl(loc,indelPile,size,maxDeletionLength,nInsertions,nDeletions,nMQ0Reads));
+
+ if (indelPile.size() != 0)
+ fullExtendedEventPileup.put(sample, new ReadBackedExtendedEventPileupImpl(loc, indelPile, size, maxDeletionLength, nInsertions, nDeletions, nMQ0Reads));
}
- hasExtendedEvents = false; // we are done with extended events prior to current ref base
-// System.out.println("Indel(s) at "+loc);
-// for ( ExtendedEventPileupElement pe : indelPile ) { if ( pe.isIndel() ) System.out.println(" "+pe.toString()); }
+ hasExtendedEvents = false; // we are done with extended events prior to current ref base
nextAlignmentContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc, fullExtendedEventPileup), hasBeenSampled);
- } else {
+ }
+ else { // this is a regular event pileup (not extended)
GenomeLoc location = getLocation();
- Map fullPileup = new HashMap();
-
+ Map fullPileup = new HashMap();
boolean hasBeenSampled = false;
- for(final String sample: samples) {
+ for (final String sample : samples) {
Iterator iterator = readStates.iterator(sample);
List pile = new ArrayList(readStates.size(sample));
hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample);
- size = 0;
- nDeletions = 0;
- nMQ0Reads = 0;
+ size = 0; // number of elements in this sample's pileup
+ nDeletions = 0; // number of deletions in this sample's pileup
+ nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)
- while(iterator.hasNext()) {
- SAMRecordState state = iterator.next();
- if ( state.getCurrentCigarOperator() != CigarOperator.D && state.getCurrentCigarOperator() != CigarOperator.N ) {
- if ( filterBaseInRead((GATKSAMRecord) state.getRead(), location.getStart()) ) {
- //discarded_bases++;
- //printStatus("Adaptor bases", discarded_adaptor_bases);
- continue;
- } else {
- //observed_bases++;
- pile.add(new PileupElement((GATKSAMRecord) state.getRead(), state.getReadOffset()));
+ while (iterator.hasNext()) {
+ final SAMRecordState state = iterator.next(); // state object with the read/offset information
+ final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
+ final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
+ final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element
+ final CigarOperator nextOp = nextElement.getOperator();
+ final int readOffset = state.getReadOffset(); // the base offset on this read
+
+ int nextElementLength = nextElement.getLength();
+
+ if (op == CigarOperator.N) // N's are never added to any pileup
+ continue;
+
+ if (op == CigarOperator.D) {
+ if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
+ pile.add(new PileupElement(read, readOffset, true, nextOp == CigarOperator.D, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()),
+                                    null, nextOp == CigarOperator.D ? nextElementLength : -1));
size++;
+ nDeletions++;
+ if (read.getMappingQuality() == 0)
+ nMQ0Reads++;
}
- } else if ( readInfo.includeReadsWithDeletionAtLoci() && state.getCurrentCigarOperator() != CigarOperator.N ) {
- size++;
- pile.add(new PileupElement((GATKSAMRecord) state.getRead(), -1));
- nDeletions++;
}
-
- if ( state.getRead().getMappingQuality() == 0 ) {
- nMQ0Reads++;
+ else {
+ if (!filterBaseInRead(read, location.getStart())) {
+ String insertedBaseString = null;
+ if (nextOp == CigarOperator.I) {
+ insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + nextElement.getLength()));
+ }
+ pile.add(new PileupElement(read, readOffset, false, nextOp == CigarOperator.D, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()),
+                                        insertedBaseString, nextElementLength));
+ size++;
+ if (read.getMappingQuality() == 0)
+ nMQ0Reads++;
+ }
}
}
- if( pile.size() != 0 )
- fullPileup.put(sample,new ReadBackedPileupImpl(location,pile,size,nDeletions,nMQ0Reads));
+ if (pile.size() != 0) // if this pileup added at least one base, add it to the full pileup
+ fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads));
}
- updateReadStates(); // critical - must be called after we get the current state offsets and location
- // if we got reads with non-D/N over the current position, we are done
- if ( !fullPileup.isEmpty() ) nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location,fullPileup),hasBeenSampled);
+ updateReadStates(); // critical - must be called after we get the current state offsets and location
+ if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done
+ nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled);
}
}
}
// fast testing of position
private boolean readIsPastCurrentPosition(SAMRecord read) {
- if ( readStates.isEmpty() )
+ if (readStates.isEmpty())
return false;
else {
SAMRecordState state = readStates.getFirst();
@@ -485,20 +530,18 @@ private static boolean filterBaseInRead(GATKSAMRecord rec, long pos) {
}
private void updateReadStates() {
- for(final String sample: samples) {
+ for (final String sample : samples) {
Iterator it = readStates.iterator(sample);
- while ( it.hasNext() ) {
+ while (it.hasNext()) {
SAMRecordState state = it.next();
CigarOperator op = state.stepForwardOnGenome();
- if ( state.hadIndel() && readInfo.generateExtendedEvents() ) hasExtendedEvents = true;
- else {
+ if (state.hadIndel() && readInfo.generateExtendedEvents())
+ hasExtendedEvents = true;
+ else if (op == null) {
// we discard the read only when we are past its end AND indel at the end of the read (if any) was
        // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe
// as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag.
- if ( op == null ) { // we've stepped off the end of the object
- //if (DEBUG) logger.debug(String.format(" removing read %s at %d", state.getRead().getReadName(), state.getRead().getAlignmentStart()));
- it.remove();
- }
+ it.remove(); // we've stepped off the end of the object
}
}
}
@@ -508,20 +551,20 @@ public void remove() {
throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
}
- private class ReadStateManager {
+ private class ReadStateManager {
private final PeekableIterator iterator;
private final DownsamplingMethod downsamplingMethod;
private final SamplePartitioner samplePartitioner;
- private final Map readStatesBySample = new HashMap();
+ private final Map readStatesBySample = new HashMap();
private final int targetCoverage;
private int totalReadStates = 0;
public ReadStateManager(Iterator source, DownsamplingMethod downsamplingMethod) {
this.iterator = new PeekableIterator(source);
this.downsamplingMethod = downsamplingMethod.type != null ? downsamplingMethod : DownsamplingMethod.NONE;
- switch(this.downsamplingMethod.type) {
+ switch (this.downsamplingMethod.type) {
case BY_SAMPLE:
- if(downsamplingMethod.toCoverage == null)
+ if (downsamplingMethod.toCoverage == null)
throw new UserException.BadArgumentValue("dcov", "Downsampling coverage (-dcov) must be specified when downsampling by sample");
this.targetCoverage = downsamplingMethod.toCoverage;
break;
@@ -529,10 +572,10 @@ public ReadStateManager(Iterator source, DownsamplingMethod downsampl
this.targetCoverage = Integer.MAX_VALUE;
}
- Map readSelectors = new HashMap();
- for(final String sample: samples) {
- readStatesBySample.put(sample,new PerSampleReadStateManager());
- readSelectors.put(sample,downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null,targetCoverage) : new AllReadsSelector());
+ Map readSelectors = new HashMap();
+ for (final String sample : samples) {
+ readStatesBySample.put(sample, new PerSampleReadStateManager());
+ readSelectors.put(sample, downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null, targetCoverage) : new AllReadsSelector());
}
samplePartitioner = new SamplePartitioner(readSelectors);
@@ -541,6 +584,7 @@ public ReadStateManager(Iterator source, DownsamplingMethod downsampl
/**
     * Returns an iterator over all the reads associated with the given sample. Note that remove() is implemented
* for this iterator; if present, total read states will be decremented.
+ *
* @param sample The sample.
* @return Iterator over the reads associated with that sample.
*/
@@ -569,6 +613,7 @@ public boolean isEmpty() {
/**
* Retrieves the total number of reads in the manager across all samples.
+ *
* @return Total number of reads over all samples.
*/
public int size() {
@@ -577,6 +622,7 @@ public int size() {
/**
* Retrieves the total number of reads in the manager in the given sample.
+ *
* @param sample The sample.
* @return Total number of reads in the given sample.
*/
@@ -587,6 +633,7 @@ public int size(final String sample) {
/**
* The extent of downsampling; basically, the furthest base out which has 'fallen
* victim' to the downsampler.
+ *
* @param sample Sample, downsampled independently.
* @return Integer stop of the furthest undownsampled region.
*/
@@ -595,9 +642,9 @@ public int getDownsamplingExtent(final String sample) {
}
public SAMRecordState getFirst() {
- for(final String sample: samples) {
+ for (final String sample : samples) {
PerSampleReadStateManager reads = readStatesBySample.get(sample);
- if(!reads.isEmpty())
+ if (!reads.isEmpty())
return reads.peek();
}
return null;
@@ -608,19 +655,18 @@ public boolean hasNext() {
}
public void collectPendingReads() {
- if(!iterator.hasNext())
+ if (!iterator.hasNext())
return;
- if(readStates.size() == 0) {
+ if (readStates.size() == 0) {
int firstContigIndex = iterator.peek().getReferenceIndex();
int firstAlignmentStart = iterator.peek().getAlignmentStart();
- while(iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) {
+ while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) {
samplePartitioner.submitRead(iterator.next());
}
- }
- else {
+ } else {
// Fast fail in the case that the read is past the current position.
- if(readIsPastCurrentPosition(iterator.peek()))
+ if (readIsPastCurrentPosition(iterator.peek()))
return;
while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) {
@@ -629,7 +675,7 @@ public void collectPendingReads() {
}
samplePartitioner.complete();
- for(final String sample: samples) {
+ for (final String sample : samples) {
ReadSelector aggregator = samplePartitioner.getSelectedReads(sample);
Collection newReads = new ArrayList(aggregator.getSelectedReads());
@@ -638,21 +684,20 @@ public void collectPendingReads() {
int numReads = statesBySample.size();
int downsamplingExtent = aggregator.getDownsamplingExtent();
- if(numReads+newReads.size()<=targetCoverage || downsamplingMethod.type==DownsampleType.NONE) {
+ if (numReads + newReads.size() <= targetCoverage || downsamplingMethod.type == DownsampleType.NONE) {
long readLimit = aggregator.getNumReadsSeen();
- addReadsToSample(statesBySample,newReads,readLimit);
+ addReadsToSample(statesBySample, newReads, readLimit);
statesBySample.specifyNewDownsamplingExtent(downsamplingExtent);
- }
- else {
+ } else {
int[] counts = statesBySample.getCountsPerAlignmentStart();
int[] updatedCounts = new int[counts.length];
- System.arraycopy(counts,0,updatedCounts,0,counts.length);
+ System.arraycopy(counts, 0, updatedCounts, 0, counts.length);
boolean readPruned = true;
- while(numReads+newReads.size()>targetCoverage && readPruned) {
+ while (numReads + newReads.size() > targetCoverage && readPruned) {
readPruned = false;
- for(int alignmentStart=updatedCounts.length-1;numReads+newReads.size()>targetCoverage&&alignmentStart>=0;alignmentStart--) {
- if(updatedCounts[alignmentStart] > 1) {
+ for (int alignmentStart = updatedCounts.length - 1; numReads + newReads.size() > targetCoverage && alignmentStart >= 0; alignmentStart--) {
+ if (updatedCounts[alignmentStart] > 1) {
updatedCounts[alignmentStart]--;
numReads--;
readPruned = true;
@@ -660,7 +705,7 @@ public void collectPendingReads() {
}
}
- if(numReads == targetCoverage) {
+ if (numReads == targetCoverage) {
updatedCounts[0]--;
numReads--;
}
@@ -668,18 +713,18 @@ public void collectPendingReads() {
BitSet toPurge = new BitSet(readStates.size());
int readOffset = 0;
- for(int i = 0; i < updatedCounts.length; i++) {
+ for (int i = 0; i < updatedCounts.length; i++) {
int n = counts[i];
int k = updatedCounts[i];
- for(Integer purgedElement: MathUtils.sampleIndicesWithoutReplacement(n,n-k))
- toPurge.set(readOffset+purgedElement);
+ for (Integer purgedElement : MathUtils.sampleIndicesWithoutReplacement(n, n - k))
+ toPurge.set(readOffset + purgedElement);
readOffset += counts[i];
}
- downsamplingExtent = Math.max(downsamplingExtent,statesBySample.purge(toPurge));
-
- addReadsToSample(statesBySample,newReads,targetCoverage-numReads);
+ downsamplingExtent = Math.max(downsamplingExtent, statesBySample.purge(toPurge));
+
+ addReadsToSample(statesBySample, newReads, targetCoverage - numReads);
statesBySample.specifyNewDownsamplingExtent(downsamplingExtent);
}
}
@@ -688,23 +733,25 @@ public void collectPendingReads() {
/**
* Add reads with the given sample name to the given hanger entry.
+ *
* @param readStates The list of read states to add this collection of reads.
- * @param reads Reads to add. Selected reads will be pulled from this source.
- * @param maxReads Maximum number of reads to add.
+ * @param reads Reads to add. Selected reads will be pulled from this source.
+ * @param maxReads Maximum number of reads to add.
*/
private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads, final long maxReads) {
- if(reads.isEmpty())
+ if (reads.isEmpty())
return;
Collection newReadStates = new LinkedList();
int readCount = 0;
- for(SAMRecord read: reads) {
- if(readCount < maxReads) {
+ for (SAMRecord read : reads) {
+ if (readCount < maxReads) {
SAMRecordState state = new SAMRecordState(read, readInfo.generateExtendedEvents());
state.stepForwardOnGenome();
newReadStates.add(state);
// TODO: What if we downsample the extended events away?
- if (state.hadIndel()) hasExtendedEvents = true;
+ if (state.hadIndel())
+ hasExtendedEvents = true;
readCount++;
}
}
@@ -735,7 +782,7 @@ public int size() {
}
public void specifyNewDownsamplingExtent(int downsamplingExtent) {
- this.downsamplingExtent = Math.max(this.downsamplingExtent,downsamplingExtent);
+ this.downsamplingExtent = Math.max(this.downsamplingExtent, downsamplingExtent);
}
public int getDownsamplingExtent() {
@@ -745,7 +792,7 @@ public int getDownsamplingExtent() {
public int[] getCountsPerAlignmentStart() {
int[] counts = new int[readStateCounter.size()];
int index = 0;
- for(Counter counter: readStateCounter)
+ for (Counter counter : readStateCounter)
counts[index++] = counter.getCount();
return counts;
}
@@ -766,7 +813,7 @@ public void remove() {
wrappedIterator.remove();
Counter counter = readStateCounter.peek();
counter.decrement();
- if(counter.getCount() == 0)
+ if (counter.getCount() == 0)
readStateCounter.remove();
}
};
@@ -775,13 +822,14 @@ public void remove() {
/**
* Purge the given elements from the bitset. If an element in the bitset is true, purge
* the corresponding read state.
+ *
* @param elements bits from the set to purge.
* @return the extent of the final downsampled read.
*/
public int purge(final BitSet elements) {
int downsamplingExtent = 0;
- if(elements.isEmpty() || readStates.isEmpty()) return downsamplingExtent;
+ if (elements.isEmpty() || readStates.isEmpty()) return downsamplingExtent;
Iterator readStateIterator = readStates.iterator();
@@ -794,22 +842,22 @@ public int purge(final BitSet elements) {
int toPurge = elements.nextSetBit(0);
int removedCount = 0;
- while(readStateIterator.hasNext() && toPurge >= 0) {
+ while (readStateIterator.hasNext() && toPurge >= 0) {
SAMRecordState state = readStateIterator.next();
- downsamplingExtent = Math.max(downsamplingExtent,state.getRead().getAlignmentEnd());
+ downsamplingExtent = Math.max(downsamplingExtent, state.getRead().getAlignmentEnd());
- if(readIndex == toPurge) {
+ if (readIndex == toPurge) {
readStateIterator.remove();
currentCounter.decrement();
- if(currentCounter.getCount() == 0)
+ if (currentCounter.getCount() == 0)
counterIterator.remove();
removedCount++;
- toPurge = elements.nextSetBit(toPurge+1);
+ toPurge = elements.nextSetBit(toPurge + 1);
}
readIndex++;
alignmentStartCounter--;
- if(alignmentStartCounter == 0 && counterIterator.hasNext()) {
+ if (alignmentStartCounter == 0 && counterIterator.hasNext()) {
currentCounter = counterIterator.next();
alignmentStartCounter = currentCounter.getCount();
}
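
The purge logic above asks MathUtils.sampleIndicesWithoutReplacement(n, n - k) which of the n reads sharing an alignment start should be dropped. A standard way to implement such a helper is a partial Fisher-Yates shuffle; the sketch below assumes that approach and is not the actual MathUtils code:

import java.util.Random;

final class Sampling {
    private static final Random rng = new Random();

    // Choose k distinct indices uniformly from [0, n) via a partial
    // Fisher-Yates shuffle. Precondition: 0 <= k <= n.
    static int[] sampleIndicesWithoutReplacement(int n, int k) {
        int[] indices = new int[n];
        for (int i = 0; i < n; i++)
            indices[i] = i;
        for (int i = 0; i < k; i++) {
            int j = i + rng.nextInt(n - i); // uniform in [i, n)
            int tmp = indices[i]; indices[i] = indices[j]; indices[j] = tmp;
        }
        int[] chosen = new int[k];
        System.arraycopy(indices, 0, chosen, 0, k);
        return chosen;
    }
}
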
@@ -849,12 +897,14 @@ public void decrement() {
interface ReadSelector {
/**
* All previous selectors in the chain have allowed this read. Submit it to this selector for consideration.
+ *
* @param read the read to evaluate.
*/
public void submitRead(SAMRecord read);
/**
* A previous selector has deemed this read unfit. Notify this selector so that this selector's counts are valid.
+ *
* @param read the read previously rejected.
*/
public void notifyReadRejected(SAMRecord read);
@@ -866,12 +916,14 @@ interface ReadSelector {
/**
* Retrieve the number of reads seen by this selector so far.
+ *
* @return number of reads seen.
*/
public long getNumReadsSeen();
/**
* Return the number of reads accepted by this selector so far.
+ *
* @return number of reads selected.
*/
public long getNumReadsSelected();
@@ -880,12 +932,14 @@ interface ReadSelector {
* Gets the locus at which the last of the downsampled reads selected by this selector ends. The value returned will be the
* last aligned position from this selection to which a downsampled read aligns -- in other words, if a read is thrown out at
* position 3 whose cigar string is 76M, the value of this parameter will be 78.
+ *
* @return If any read has been downsampled, this will return the last aligned base of the longest alignment. Else, 0.
*/
public int getDownsamplingExtent();
/**
* Get the reads selected by this selector.
+ *
* @return collection of reads selected by this selector.
*/
public Collection getSelectedReads();
@@ -911,7 +965,7 @@ public void submitRead(SAMRecord read) {
public void notifyReadRejected(SAMRecord read) {
readsSeen++;
- downsamplingExtent = Math.max(downsamplingExtent,read.getAlignmentEnd());
+ downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd());
}
public void complete() {
@@ -949,18 +1003,18 @@ class NRandomReadSelector implements ReadSelector {
private final ReservoirDownsampler reservoir;
private final ReadSelector chainedSelector;
private long readsSeen = 0;
- private int downsamplingExtent = 0;
+ private int downsamplingExtent = 0;
public NRandomReadSelector(ReadSelector chainedSelector, long readLimit) {
- this.reservoir = new ReservoirDownsampler((int)readLimit);
+ this.reservoir = new ReservoirDownsampler((int) readLimit);
this.chainedSelector = chainedSelector;
}
public void submitRead(SAMRecord read) {
SAMRecord displaced = reservoir.add(read);
- if(displaced != null && chainedSelector != null) {
+ if (displaced != null && chainedSelector != null) {
chainedSelector.notifyReadRejected(read);
- downsamplingExtent = Math.max(downsamplingExtent,read.getAlignmentEnd());
+ downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd());
}
readsSeen++;
}
@@ -970,9 +1024,9 @@ public void notifyReadRejected(SAMRecord read) {
}
public void complete() {
- for(SAMRecord read: reservoir.getDownsampledContents())
+ for (SAMRecord read : reservoir.getDownsampledContents())
chainedSelector.submitRead(read);
- if(chainedSelector != null)
+ if (chainedSelector != null)
chainedSelector.complete();
}
@@ -987,7 +1041,7 @@ public long getNumReadsSelected() {
public int getDownsamplingExtent() {
return downsamplingExtent;
- }
+ }
public Collection getSelectedReads() {
return reservoir.getDownsampledContents();
@@ -996,7 +1050,7 @@ public Collection getSelectedReads() {
public void reset() {
reservoir.clear();
downsamplingExtent = 0;
- if(chainedSelector != null)
+ if (chainedSelector != null)
chainedSelector.reset();
}
}
@@ -1005,23 +1059,23 @@ public void reset() {
* Note: stores reads by sample ID string, not by sample object
*/
class SamplePartitioner implements ReadSelector {
- private final Map readsBySample;
+ private final Map readsBySample;
private long readsSeen = 0;
- public SamplePartitioner(Map readSelectors) {
+ public SamplePartitioner(Map readSelectors) {
readsBySample = readSelectors;
}
public void submitRead(SAMRecord read) {
- String sampleName = read.getReadGroup()!=null ? read.getReadGroup().getSample() : null;
- if(readsBySample.containsKey(sampleName))
+ String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
+ if (readsBySample.containsKey(sampleName))
readsBySample.get(sampleName).submitRead(read);
readsSeen++;
}
public void notifyReadRejected(SAMRecord read) {
- String sampleName = read.getReadGroup()!=null ? read.getReadGroup().getSample() : null;
- if(readsBySample.containsKey(sampleName))
+ String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
+ if (readsBySample.containsKey(sampleName))
readsBySample.get(sampleName).notifyReadRejected(read);
readsSeen++;
}
@@ -1040,23 +1094,23 @@ public long getNumReadsSelected() {
public int getDownsamplingExtent() {
int downsamplingExtent = 0;
- for(ReadSelector storage: readsBySample.values())
- downsamplingExtent = Math.max(downsamplingExtent,storage.getDownsamplingExtent());
+ for (ReadSelector storage : readsBySample.values())
+ downsamplingExtent = Math.max(downsamplingExtent, storage.getDownsamplingExtent());
return downsamplingExtent;
}
-
+
public Collection getSelectedReads() {
throw new UnsupportedOperationException("Cannot directly get selected reads from a read partitioner.");
}
public ReadSelector getSelectedReads(String sampleName) {
- if(!readsBySample.containsKey(sampleName))
+ if (!readsBySample.containsKey(sampleName))
throw new NoSuchElementException("Sample name not found");
return readsBySample.get(sampleName);
}
public void reset() {
- for(ReadSelector storage: readsBySample.values())
+ for (ReadSelector storage : readsBySample.values())
storage.reset();
readsSeen = 0;
}
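
NRandomReadSelector delegates the actual by-sample downsampling to a ReservoirDownsampler, i.e., classic reservoir sampling: the first N reads fill the reservoir, and thereafter the i-th read replaces a random slot with probability N/i. A generic sketch of the technique (not the GATK class), mirroring the displaced-element return value that submitRead() relies on:

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

// Sketch of reservoir sampling; T would be SAMRecord in the code above.
final class Reservoir<T> {
    private final List<T> slots;
    private final int capacity;
    private final Random rng = new Random();
    private long seen = 0;

    Reservoir(int capacity) {
        this.capacity = capacity;
        this.slots = new ArrayList<T>(capacity);
    }

    // Returns the element displaced by this addition, or null if the
    // reservoir still had room. A rejected incoming item displaces itself.
    T add(T item) {
        seen++;
        if (slots.size() < capacity) {
            slots.add(item);
            return null;
        }
        long j = (long) (rng.nextDouble() * seen); // uniform in [0, seen)
        if (j < capacity) {
            T displaced = slots.get((int) j);
            slots.set((int) j, item);
            return displaced;
        }
        return item;
    }

    List<T> contents() { return slots; }
}
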
diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
index f098655376..bc7d5c6ca8 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
@@ -90,19 +90,12 @@ public class GATKRunReport {
protected static Logger logger = Logger.getLogger(GATKRunReport.class);
- // the listing of the fields is somewhat important; this is the order that the simple XML will output them
- @ElementList(required = true, name = "gatk_header_Information")
- private List mGATKHeader;
-
@Element(required = false, name = "id")
private final String id;
@Element(required = false, name = "exception")
private final ExceptionToXML mException;
- @Element(required = true, name = "working_directory")
- private String currentPath;
-
@Element(required = true, name = "start_time")
private String startTime = "ND";
@@ -112,9 +105,6 @@ public class GATKRunReport {
@Element(required = true, name = "run_time")
private long runTime = 0;
- @Element(required = true, name = "command_line")
- private String cmdLine = "COULD NOT BE DETERMINED";
-
@Element(required = true, name = "walker_name")
private String walkerName;
@@ -127,9 +117,6 @@ public class GATKRunReport {
@Element(required = true, name = "max_memory")
private long maxMemory;
- @Element(required = true, name = "java_tmp_directory")
- private String tmpDir;
-
@Element(required = true, name = "user_name")
private String userName;
@@ -145,18 +132,13 @@ public class GATKRunReport {
@Element(required = true, name = "iterations")
private long nIterations;
- @Element(required = true, name = "reads")
- private long nReads;
-
public enum PhoneHomeOption {
/** Disable phone home */
NO_ET,
/** Standard option. Writes to local repository if it can be found, or S3 otherwise */
STANDARD,
/** Force output to STDOUT. For debugging only */
- STDOUT,
- /** Force output to S3. For debugging only */
- AWS_S3 // todo -- remove me -- really just for testing purposes
+ STDOUT
}
private static final DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH.mm.ss");
@@ -174,15 +156,8 @@ public GATKRunReport(Walker<?,?> walker, Exception e, GenomeAnalysisEngine engin
logger.debug("Aggregating data for run report");
- mGATKHeader = CommandLineGATK.createApplicationHeader();
- currentPath = System.getProperty("user.dir");
-
// what did we run?
id = org.apache.commons.lang.RandomStringUtils.randomAlphanumeric(32);
- try {
- cmdLine = engine.createApproximateCommandLineArgumentString(engine, walker);
- } catch (Exception ignore) { }
-
walkerName = engine.getWalkerName(walker.getClass());
svnVersion = CommandLineGATK.getVersionNumber();
@@ -193,7 +168,6 @@ public GATKRunReport(Walker<?,?> walker, Exception e, GenomeAnalysisEngine engin
startTime = dateFormat.format(engine.getStartTime());
runTime = (end.getTime() - engine.getStartTime().getTime()) / 1000L; // difference in seconds
}
- tmpDir = System.getProperty("java.io.tmpdir");
// deal with memory usage
Runtime.getRuntime().gc(); // call GC so totalMemory is ~ used memory
@@ -204,12 +178,11 @@ public GATKRunReport(Walker<?,?> walker, Exception e, GenomeAnalysisEngine engin
if ( engine.getCumulativeMetrics() != null ) {
            // it's possible we aborted so early that these data structures aren't initialized
nIterations = engine.getCumulativeMetrics().getNumIterations();
- nReads = engine.getCumulativeMetrics().getNumReadsSeen();
}
// user and hostname -- information about the runner of the GATK
userName = System.getProperty("user.name");
- hostName = "unknown"; // resolveHostname();
+ hostName = Utils.resolveHostname();
// basic java information
java = Utils.join("-", Arrays.asList(System.getProperty("java.vendor"), System.getProperty("java.version")));
@@ -239,11 +212,8 @@ public void postReport(PhoneHomeOption type) {
case STDOUT:
postReportToStream(System.out);
break;
- case AWS_S3:
- postReportToAWSS3();
- break;
default:
- exceptDuringRunReport("BUG: unexcepted PhoneHomeOption ");
+ exceptDuringRunReport("BUG: unexpected PhoneHomeOption ");
break;
}
}
@@ -264,22 +234,8 @@ private void postReportToStream(OutputStream stream) {
}
}
- /**
- * Opens the destination file and writes a gzipped version of the XML report there.
- *
- * @param destination
- * @throws IOException
- */
- private void postReportToFile(File destination) throws IOException {
- BufferedOutputStream out =
- new BufferedOutputStream(
- new GZIPOutputStream(
- new FileOutputStream(destination)));
- try {
- postReportToStream(out);
- } finally {
- out.close();
- }
+ private final String getKey() {
+ return getID() + ".report.xml.gz";
}
/**
@@ -288,16 +244,21 @@ private void postReportToFile(File destination) throws IOException {
     * That is, postReport() is guaranteed not to fail for any reason.
*/
private File postReportToLocalDisk(File rootDir) {
- String filename = getID() + ".report.xml.gz";
- File file = new File(rootDir, filename);
+ final String filename = getKey();
+ final File destination = new File(rootDir, filename);
+
try {
- postReportToFile(file);
- logger.debug("Wrote report to " + file);
- return file;
+ final BufferedOutputStream out = new BufferedOutputStream(
+ new GZIPOutputStream(
+ new FileOutputStream(destination)));
+ postReportToStream(out);
+ out.close();
+ logger.debug("Wrote report to " + destination);
+ return destination;
} catch ( Exception e ) {
// we catch everything, and no matter what eat the error
exceptDuringRunReport("Couldn't read report file", e);
- file.delete();
+ destination.delete();
return null;
}
}
@@ -305,42 +266,46 @@ private File postReportToLocalDisk(File rootDir) {
private void postReportToAWSS3() {
// modifying example code from http://jets3t.s3.amazonaws.com/toolkit/code-samples.html
this.hostName = Utils.resolveHostname(); // we want to fill in the host name
- File localFile = postReportToLocalDisk(new File("./"));
- logger.debug("Generating GATK report to AWS S3 based on local file " + localFile);
- if ( localFile != null ) { // we succeeded in creating the local file
- localFile.deleteOnExit();
- try {
- // stop us from printing the annoying, and meaningless, mime types warning
- Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class);
- mimeTypeLogger.setLevel(Level.FATAL);
-
- // Your Amazon Web Services (AWS) login credentials are required to manage S3 accounts. These credentials
- // are stored in an AWSCredentials object:
-
- // IAM GATK user credentials -- only right is to PutObject into GATK_Run_Report bucket
- String awsAccessKey = "AKIAJXU7VIHBPDW4TDSQ"; // GATK AWS user
- String awsSecretKey = "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"; // GATK AWS user
- AWSCredentials awsCredentials = new AWSCredentials(awsAccessKey, awsSecretKey);
-
- // To communicate with S3, create a class that implements an S3Service. We will use the REST/HTTP
- // implementation based on HttpClient, as this is the most robust implementation provided with JetS3t.
- S3Service s3Service = new RestS3Service(awsCredentials);
-
- // Create an S3Object based on a file, with Content-Length set automatically and
- // Content-Type set based on the file's extension (using the Mimetypes utility class)
- S3Object fileObject = new S3Object(localFile);
- //logger.info("Created S3Object" + fileObject);
- //logger.info("Uploading " + localFile + " to AWS bucket");
- S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject);
- logger.debug("Uploaded to AWS: " + s3Object);
- logger.info("Uploaded run statistics report to AWS S3");
- } catch ( S3ServiceException e ) {
- exceptDuringRunReport("S3 exception occurred", e);
- } catch ( NoSuchAlgorithmException e ) {
- exceptDuringRunReport("Couldn't calculate MD5", e);
- } catch ( IOException e ) {
- exceptDuringRunReport("Couldn't read report file", e);
- }
+ final String key = getKey();
+ logger.debug("Generating GATK report to AWS S3 with key " + key);
+ try {
+            // create a byte output stream so we can capture the output as a byte[]
+ final ByteArrayOutputStream byteStream = new ByteArrayOutputStream(8096);
+ final OutputStream outputStream = new GZIPOutputStream(byteStream);
+ postReportToStream(outputStream);
+ outputStream.close();
+ final byte[] report = byteStream.toByteArray();
+
+ // stop us from printing the annoying, and meaningless, mime types warning
+ Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class);
+ mimeTypeLogger.setLevel(Level.FATAL);
+
+ // Your Amazon Web Services (AWS) login credentials are required to manage S3 accounts. These credentials
+ // are stored in an AWSCredentials object:
+
+ // IAM GATK user credentials -- only right is to PutObject into GATK_Run_Report bucket
+ String awsAccessKey = "AKIAJXU7VIHBPDW4TDSQ"; // GATK AWS user
+ String awsSecretKey = "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"; // GATK AWS user
+ AWSCredentials awsCredentials = new AWSCredentials(awsAccessKey, awsSecretKey);
+
+ // To communicate with S3, create a class that implements an S3Service. We will use the REST/HTTP
+ // implementation based on HttpClient, as this is the most robust implementation provided with JetS3t.
+ S3Service s3Service = new RestS3Service(awsCredentials);
+
+ // Create an S3Object based on a file, with Content-Length set automatically and
+ // Content-Type set based on the file's extension (using the Mimetypes utility class)
+ S3Object fileObject = new S3Object(key, report);
+ //logger.info("Created S3Object" + fileObject);
+ //logger.info("Uploading " + localFile + " to AWS bucket");
+ S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject);
+ logger.debug("Uploaded to AWS: " + s3Object);
+ logger.info("Uploaded run statistics report to AWS S3");
+ } catch ( S3ServiceException e ) {
+ exceptDuringRunReport("S3 exception occurred", e);
+ } catch ( NoSuchAlgorithmException e ) {
+ exceptDuringRunReport("Couldn't calculate MD5", e);
+ } catch ( IOException e ) {
+ exceptDuringRunReport("Couldn't read report file", e);
}
}
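
The rewritten postReportToAWSS3() above no longer stages a temporary file: the gzipped XML report is captured in memory and handed to JetS3t via the S3Object(key, byte[]) constructor. The buffering idiom in isolation, using only java.io and java.util.zip:

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.zip.GZIPOutputStream;

final class GzipBuffer {
    // Compress arbitrary payload bytes into an in-memory gzip blob, the
    // same idiom the report code uses before uploading to S3.
    static byte[] gzip(byte[] payload) throws IOException {
        ByteArrayOutputStream byteStream = new ByteArrayOutputStream(8096);
        OutputStream out = new GZIPOutputStream(byteStream);
        try {
            out.write(payload);
        } finally {
            out.close(); // close() emits the gzip trailer
        }
        return byteStream.toByteArray();
    }
}

Note that close() must run before toByteArray(): GZIPOutputStream only writes the gzip trailer when the stream is closed, which is why the report code closes its stream before reading the buffer.
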
diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java
index b72b20e0b7..b59b550e1c 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java
@@ -296,6 +296,10 @@ public boolean containsKey(Object primaryKey) {
return primaryKeyColumn.contains(primaryKey);
}
+ public Collection