diff --git a/lib/edu/mit/broad/arachne/Alignment.java b/lib/edu/mit/broad/arachne/Alignment.java new file mode 100755 index 0000000000..52b7b59f20 --- /dev/null +++ b/lib/edu/mit/broad/arachne/Alignment.java @@ -0,0 +1,242 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ +package edu.mit.broad.arachne; + + +/** + * This class represents an arachne LookAlign alignment (or other related data structures). + */ +public class Alignment { + + private static final char TAB = '\t'; + + private int mASequenceId; + private int mASequenceLength; + private int mAStart; + private int mAEnd; + private int mBSequenceId; + private int mBSequenceLength; + private int mBStart; + private int mBEnd; + private char mOrientation; + private int[] mAlignmentBlocks; + + + public Alignment() { + } + + public int getASequenceId() { + return mASequenceId; + } + + public void setASequenceId(int value) { + mASequenceId = value; + } + + public int getASequenceLength() { + return mASequenceLength; + } + + public void setASequenceLength(int value) { + mASequenceLength = value; + } + + public int getAStart() { + return mAStart; + } + + public void setAStart(int value) { + mAStart = value; + } + + public int getAEnd() { + return mAEnd; + } + + public void setAEnd(int value) { + mAEnd = value; + } + + public int getBSequenceId() { + return mBSequenceId; + } + + public void setBSequenceId(int value) { + mBSequenceId = value; + } + + public int getBSequenceLength() { + return mBSequenceLength; + } + + public void setBSequenceLength(int value) { + mBSequenceLength = value; + } + + public int getBStart() { + return mBStart; + } + + public void 
setBStart(int value) { + mBStart = value; + } + + public int getBEnd() { + return mBEnd; + } + + public void setBEnd(int value) { + mBEnd = value; + } + + public char getOrientation() { + return mOrientation; + } + + public void setOrientation(char value) { + mOrientation = value; + } + + public int[] getAlignmentBlocks() { + return mAlignmentBlocks; + } + + public void setAlignmentBlocks(int[] value) { + mAlignmentBlocks = value; + } + + public static Alignment parse(String text) { + + if (text == null) { + return null; + } + + String[] fields = text.trim().split("\t"); + if (fields.length == 0) { + return null; + } + + if (!fields[0].equals("QUERY")) { + throw new IllegalArgumentException("Invalid alignment: " + text); + } + if (fields.length < 14) { + throw new IllegalArgumentException("Invalid alignment: " + text); + } + + int seqAId = parseIntField(fields[1]); + int seqAStart = parseIntField(fields[2]); + int seqAEnd = parseIntField(fields[3]); + int seqALength = parseIntField(fields[4]); + int orientation = parseIntField(fields[5]); + int seqBId = parseIntField(fields[6]); + int seqBStart = parseIntField(fields[7]); + int seqBEnd = parseIntField(fields[8]); + int seqBLength = parseIntField(fields[9]); + int blockCount = parseIntField(fields[10]); + + if (seqAStart < 0 || seqAEnd <= 0 || seqALength <= 0 || + seqAStart >= seqALength || seqAEnd > seqALength || seqAStart >= seqAEnd) { + throw new IllegalArgumentException("Invalid alignment: " + text); + } + if (seqBStart < 0 || seqBEnd <= 0 || seqBLength <= 0 || + seqBStart >= seqBLength || seqBEnd > seqBLength || seqBStart >= seqBEnd) { + throw new IllegalArgumentException("Invalid alignment: " + text); + } + if (orientation < 0 || orientation > 1) { + throw new IllegalArgumentException("Invalid alignment: " + text); + } + if (fields.length != (11 + 3*blockCount)) { + throw new IllegalArgumentException("Invalid alignment: " + text); + } + + int[] alignmentBlocks = new int[3*blockCount]; + for (int i = 0; i < 
3*blockCount; i++) { + alignmentBlocks[i] = parseIntField(fields[11 + i]); + } + + Alignment alignment = new Alignment(); + alignment.setASequenceId(seqAId); + alignment.setASequenceLength(seqALength); + alignment.setAStart(seqAStart+1); + alignment.setAEnd(seqAEnd); + alignment.setBSequenceId(seqBId); + alignment.setBSequenceLength(seqBLength); + alignment.setBStart(seqBStart+1); + alignment.setBEnd(seqBEnd); + alignment.setOrientation((orientation == 0) ? '+' : '-'); + alignment.setAlignmentBlocks(alignmentBlocks); + return alignment; + } + + private static int parseIntField(String text) { + try { + return Integer.parseInt(text); + } catch (NumberFormatException exc) { + throw new IllegalArgumentException("Illegal alignment field: " + text); + } + } + + public String arachneFormat() { + StringBuilder builder = new StringBuilder(); + builder.append("QUERY"); + builder.append(TAB); + builder.append(mASequenceId); + builder.append(TAB); + builder.append(mAStart-1); // zero based + builder.append(TAB); + builder.append(mAEnd); + builder.append(TAB); + builder.append(mASequenceLength); + builder.append(TAB); + builder.append(mOrientation == '+' ? 
0 : 1); + builder.append(TAB); + builder.append(mBSequenceId); + builder.append(TAB); + builder.append(mBStart-1); // zero based + builder.append(TAB); + builder.append(mBEnd); + builder.append(TAB); + builder.append(mBSequenceLength); + builder.append(TAB); + builder.append(mAlignmentBlocks.length / 3); + for (int i = 0; i < mAlignmentBlocks.length; i++) { + builder.append(TAB); + builder.append(mAlignmentBlocks[i]); + } + return builder.toString(); + } + + public String format() { + StringBuilder builder = new StringBuilder(); + builder.append("Alignment"); + builder.append(' '); + builder.append(mASequenceId); + builder.append(' '); + builder.append(mAStart); + builder.append(' '); + builder.append(mAEnd); + builder.append(' '); + builder.append(mOrientation); + builder.append(' '); + builder.append(mBSequenceId); + builder.append(' '); + builder.append(mBStart); + builder.append(' '); + builder.append(mBEnd); + builder.append(' '); + builder.append(mAlignmentBlocks.length / 3); + for (int i = 0; i < mAlignmentBlocks.length; i++) { + builder.append(' '); + builder.append(mAlignmentBlocks[i]); + } + return builder.toString(); + } +} diff --git a/lib/edu/mit/broad/arachne/Fastb2Fasta.java b/lib/edu/mit/broad/arachne/Fastb2Fasta.java new file mode 100644 index 0000000000..964e054ef5 --- /dev/null +++ b/lib/edu/mit/broad/arachne/Fastb2Fasta.java @@ -0,0 +1,132 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ +package edu.mit.broad.arachne; + +import java.io.*; + +/** + * Utility to convert fastb to fasta files. + * More importantly, can be used to extract a subset of the reads. 
+ */ +public class Fastb2Fasta { + + private boolean mVerbose = false; + private boolean mDebug = false; + private String mInputPath = null; + private String mIdListFilePath = null; + + + public static void main(String[] args) + throws Exception { + new Fastb2Fasta().run(args); + } + + private void usage() { + System.out.println("Usage: Fastb2Fasta ... "); + System.out.println(" -idlist "); + System.out.println(" -verbose"); + System.out.println(" -debug"); + } + + private boolean parseArguments(String[] args) { + + int argpos = 0; + int argsleft = 0; + + while (argpos < args.length) { + argsleft = args.length - argpos; + String arg = args[argpos]; + if (arg.equals("-idlist") && argsleft > 1) { + argpos++; + mIdListFilePath = args[argpos++]; + } else if (arg.equals("-verbose")) { + argpos++; + mVerbose = true; + } else if (arg.equals("-debug")) { + argpos++; + mDebug = true; + } else if (arg.startsWith("-")) { + usage(); + return false; + } else { + break; + } + } + + argsleft = args.length - argpos; + if (argsleft != 1) { + usage(); + return false; + } + + mInputPath = args[argpos]; + return true; + } + + private void run(String[] args) + throws Exception { + + if (!parseArguments(args)) { + System.exit(1); + } + + FastbReader fastbReader = new FastbReader(new File(mInputPath)); + try { + if (mIdListFilePath != null) { + LineNumberReader reader = new LineNumberReader(new FileReader(mIdListFilePath)); + while (true) { + String line = reader.readLine(); + if (line == null) { + reader.close(); + break; + } + Integer id = parseReadId(line); + if (id == null) { + continue; + } + if (id < 0 || id >= fastbReader.getSequenceCount()) { + System.out.println("ERROR: Illegal sequence id: " + id); + System.exit(1); + } + String sequence = fastbReader.readSequence(id); + System.out.println(">" + id); + System.out.println(sequence); + } + } else { + int id = 0; + while (fastbReader.hasNext()) { + String sequence = fastbReader.next(); + System.out.println(">" + id); + 
System.out.println(sequence); + id++; + } + } + } finally { + fastbReader.close(); + } + } + + private Integer parseReadId(String line) { + String text = line.trim(); + if (text.length() == 0 || text.charAt(0) == '#') { + return null; + } + String token = text.split("\\s+")[0]; + Integer id = null; + try { + id = new Integer(token); + } catch (NumberFormatException exc) { + System.out.println("ERROR: Invalid sequence id: " + token); + System.exit(1); + } + return id; + } +} diff --git a/lib/edu/mit/broad/arachne/FastbReader.java b/lib/edu/mit/broad/arachne/FastbReader.java new file mode 100755 index 0000000000..0d6cd3dd5a --- /dev/null +++ b/lib/edu/mit/broad/arachne/FastbReader.java @@ -0,0 +1,220 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ +package edu.mit.broad.arachne; + + +import edu.mit.broad.sam.util.CloseableIterator; + +import java.io.*; + + +/** + * Reader for arachne Fastb files. + */ +public class FastbReader + implements CloseableIterator { + + // Notes on fastb file format + // + // Fastb files contain the serialized contents of an arachne vecbasevector, + // which is a typedef for mastervec. + // The serialization of mastervec objects starts with a 24 byte mv_file_control_block, + // followed by N variable length segments (one for each element of the mastervec vector), + // followed by an offset table containing N 8-byte file offsets to the N variable length + // segments, followed by N fixed length data segments, one for each vector element. 
+ // Thus, reading a single element of the mastervec vector requires reading from three + // separate places in the file (the offset table, the variable length section and the + // fixed length section). + // + // The mastervec file header is 24 bytes arranged as follows: + // n 4-byte signed(?) integer (number of entries) + // c1 1-byte unsigned bit mask (see below) + // reserved 1-byte unused + // sizeX 1-byte unsigned, sizeof first template parameter (16 for fastb files) + // sizeA 1-byte unsigned, sizeof second template parameter (4 for fastb files) + // offsets_start 8-byte signed(?) integer, file offset of offset table + // static_start 8-byte signed(?) integer, file offset of static data (fixed size section) + // + // For fastb files, the fixed size section contains 4 bytes for each object, which is the + // unsigned(?) count of the number of bases in this entry. + // For fastb files, the variable length section contains a bit vector with two bits per base. + // The bases are encoded as follows: A = 0, C = 1, G = 2, T = 3. + // + // For fastb files, in the file header N is the number of entries in the fastb file. + // c1 is unused/unimplemented except that the two low-order bits should be 0x01, indicating + // that we are using the single-file representation. There is also apparently a three-file + // representation that looks the same except that the offset table and static (fixed length) + // table are in separate files named .offsets and .static. + // The sizeX should be 16 for fastb files and sizeA should be 4. + // + // Note that in fastb files, the sequences are not identified by name or id, only by index + // (zero based) into the mastervec object. There is no representation for bases other than + // ACGT (i.e. Ns cannot be encoded). 
+ + private static final char[] BASES = { 'A', 'C', 'G', 'T' }; + + private File mFile; + private RandomAccessFile mRandomFile; + private int mEntryCount; + private long mOffsetTableOffset; + private long mLengthTableOffset; + private int mCurrentPosition; + private byte[] mIOBuffer = new byte[8]; + + + public FastbReader(File file) + throws IOException { + mFile = file; + mRandomFile = new RandomAccessFile(mFile, "r"); + readHeader(); + } + + public int getSequenceCount() { + return mEntryCount; + } + + public boolean hasNext() { + return (mCurrentPosition < mEntryCount); + } + + public String next() { + if (!hasNext()) { + throw new IllegalStateException("Iterator exhausted"); + } + try { + return readSequence(mCurrentPosition); + } catch (IOException exc) { + throw new RuntimeException(exc.getMessage(), exc); + } + } + + public void remove() { + throw new UnsupportedOperationException("Not supported: remove"); + } + + public void close() { + if (mRandomFile != null) { + mEntryCount = 0; + mCurrentPosition = 0; + try { + mRandomFile.close(); + } catch (IOException exc) { + throw new RuntimeException(exc.getMessage(), exc); + } finally { + mRandomFile = null; + } + } + } + + public String readSequence(int n) + throws IOException { + if (mRandomFile == null) { + throw new IllegalStateException("Reader is closed"); + } + if (n < 0 || n >= mEntryCount) { + throw new IndexOutOfBoundsException("Illegal index: " + n); + } + long offset = getEntryOffset(n); + int length = getEntryBaseCount(n); + String result = readBases(offset, length); + mCurrentPosition = n+1; + return result; + } + + private void readHeader() + throws IOException { + + byte[] fileControlBlock = new byte[24]; + mRandomFile.readFully(fileControlBlock, 0, 24); + + int word2 = deserializeInt(fileControlBlock, 4); + int nFiles = word2 & 0x3; + int sizeX = (word2 >> 16) & 0xFF; + int sizeA = (word2 >> 24) & 0xFF; + if (nFiles != 1) { + throw new RuntimeException(mFile + ": Invalid file header: nFiles = " + 
nFiles); + } + if (sizeX != 16) { + throw new RuntimeException(mFile + ": Invalid file header: sizeX = " + sizeX); + } + if (sizeA != 4) { + throw new RuntimeException(mFile + ": Invalid file header: sizeX = " + sizeA); + } + mEntryCount = deserializeInt(fileControlBlock, 0); + mOffsetTableOffset = deserializeLong(fileControlBlock, 8); + mLengthTableOffset = deserializeLong(fileControlBlock, 16); + } + + private long getEntryOffset(int n) + throws IOException { + mRandomFile.seek(mOffsetTableOffset + 8 * n); + mRandomFile.readFully(mIOBuffer, 0, 8); + return deserializeLong(mIOBuffer, 0); + } + + private int getEntryBaseCount(int n) + throws IOException { + mRandomFile.seek(mLengthTableOffset + 4 * n); + mRandomFile.readFully(mIOBuffer, 0, 4); + return deserializeInt(mIOBuffer, 0); + } + + private String readBases(long fileOffset, int baseCount) + throws IOException { + + + int byteCount = (baseCount + 3) / 4; + byte[] data = new byte[byteCount]; + mRandomFile.seek(fileOffset); + mRandomFile.readFully(data, 0, byteCount); + + int baseIndex = 0; + int dataIndex = 0; + char[] baseBuffer = new char[baseCount]; + while (baseIndex < baseCount) { + int b = data[dataIndex++]; + int count = Math.min(4, baseCount - baseIndex); + for (int i = 0; i < count; i++) { + baseBuffer[baseIndex++] = BASES[b & 0x3]; + b = b >> 2; + } + } + return new String(baseBuffer); + } + + private int deserializeInt(byte[] buffer, int offset) { + int byte1 = buffer[offset] & 0xFF; + int byte2 = buffer[offset+1] & 0xFF; + int byte3 = buffer[offset+2] & 0xFF; + int byte4 = buffer[offset+3] & 0xFF; + return (byte1 | (byte2 << 8) | (byte3 << 16) | (byte4 << 24)); + } + + private long deserializeLong(byte[] buffer, int offset) { + long int1 = deserializeInt(buffer, offset) & 0xFFFFFFFFL; + long int2 = deserializeInt(buffer, offset+4) & 0xFFFFFFFFL; + return (int1 | (int2 << 32)); + } + + // Stub for interactive use (see also Fastb2Fasta) + public static void main(String[] args) + throws Exception { + 
/**
 * Utility class to read in a set of contig-based genomic intervals in zero-based,
 * end-inclusive form and store them efficiently in memory as a 1-based bit-mask.
 */
public class GenomeMask {

    // If memory usage ever becomes a problem, this could be changed to a SparseBitSet:
    // http://java.sun.com/developer/onlineTraining/collections/magercises/BitSet/index.html
    private SortedMap<Integer, BitSet> data = new TreeMap<Integer, BitSet>();

    /**
     * Loads the mask from a file of space-separated "contig start end" records,
     * where start/end are zero-based and end-inclusive.
     */
    public GenomeMask(File maskFile) throws IOException {
        BufferedReader maskReader = null;
        try {
            maskReader = new BufferedReader(new FileReader(maskFile));
            for (String line = maskReader.readLine(); line != null; line = maskReader.readLine()) {
                String[] tokens = line.split(" ");
                int contig = Integer.parseInt(tokens[0]);

                // Convert the coordinates from 0-based, end-inclusive to
                // 1-based, end-inclusive.
                int startPos = Integer.parseInt(tokens[1]) + 1;
                int endPos = Integer.parseInt(tokens[2]) + 1;

                BitSet contigBits = getOrCreate(contig, endPos);
                contigBits.set(startPos, endPos + 1); // BitSet.set is end-exclusive
            }
        } finally {
            if (maskReader != null) {
                maskReader.close();
            }
        }
    }

    /**
     * This ctor is useful if initializing a GenomeMask externally.
     */
    public GenomeMask() {
    }

    /** Returns true if the given 1-based position on the given contig is masked. */
    public boolean get(int contig, int position) {
        BitSet bits = data.get(contig);
        if (bits == null) {
            return false;
        }
        return bits.get(position);
    }

    /** Returns the BitSet for the given contig, or null if none exists. */
    public BitSet get(int contig) {
        return data.get(contig);
    }

    /**
     * Get an existing BitSet for the given contig, or create one if not already present.
     * This is useful when initializing a GenomeMask from an external source.
     * @param contig which BitSet
     * @param numBits if there was not already a BitSet for this contig, one is created
     *                and initialized to this size
     * @return the BitSet for the given contig, creating one if necessary
     */
    public BitSet getOrCreate(int contig, int numBits) {
        BitSet existing = data.get(contig);
        if (existing != null) {
            return existing;
        }
        BitSet created = new BitSet(numBits);
        data.put(contig, created);
        return created;
    }

    /** Returns the largest contig id present (throws if the mask is empty). */
    public int getMaxContig() {
        return data.lastKey();
    }
}
+ */ +public class LookAlignReader + implements CloseableIterator { + + private LineNumberReader mReader = null; + private Alignment mNextAlignment = null; + private int mBSequenceId = -1; + private int mBStart = 0; + private int mBEnd = 0; + + + public LookAlignReader(File file) + throws IOException { + this(new FileReader(file)); + } + + public LookAlignReader(Reader reader) { + if (reader instanceof LineNumberReader) { + mReader = (LineNumberReader) reader; + } else { + mReader = new LineNumberReader(reader); + } + } + + public void setBSequenceId(int value) { + mBSequenceId = value; + } + + public void setBStart(int value) { + mBStart = value; + } + + public void setBEnd(int value) { + mBEnd = value; + } + + public boolean hasNext() { + if (mNextAlignment != null) { + return true; + } + try { + mNextAlignment = nextAlignment(); + return (mNextAlignment != null); + } catch (IOException exc) { + throw new RuntimeException(exc.getMessage(), exc); + } + } + + public Alignment next() { + if (!hasNext()) { + throw new IllegalStateException("Iterator exhausted"); + } + try { + Alignment result = mNextAlignment; + mNextAlignment = nextAlignment(); + return result; + } catch (IOException exc) { + throw new RuntimeException(exc.getMessage(), exc); + } + } + + public void remove() { + throw new UnsupportedOperationException("Not supported: remove"); + } + + public void close() { + if (mReader != null) { + try { + mReader.close(); + } catch (IOException exc) { + throw new RuntimeException(exc.getMessage(), exc); + } + mReader = null; + } + } + + private Alignment nextAlignment() + throws IOException { + if (mReader == null) { + return null; + } + while (true) { + String line = mReader.readLine(); + if (line == null) { + close(); + break; + } + if (!line.startsWith("QUERY")) { + continue; + } + Alignment alignment = Alignment.parse(line); + if (matchesFilters(alignment)) { + return alignment; + } + } + return null; + } + + private boolean matchesFilters(Alignment alignment) 
{ + if (mBSequenceId < 0) { + return true; + } + if (alignment.getBSequenceId() != mBSequenceId) { + return false; + } + if (mBStart > 0 && alignment.getBEnd() < mBStart) { + return false; + } + if (mBEnd > 0 && alignment.getBStart() > mBEnd) { + return false; + } + return true; + } +} + diff --git a/lib/edu/mit/broad/cnv/AnalyzeCnvs.java b/lib/edu/mit/broad/cnv/AnalyzeCnvs.java new file mode 100755 index 0000000000..07e9b79de6 --- /dev/null +++ b/lib/edu/mit/broad/cnv/AnalyzeCnvs.java @@ -0,0 +1,437 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ +package edu.mit.broad.cnv; + +import edu.mit.broad.arachne.Alignment; +import edu.mit.broad.arachne.LookAlignReader; + +import java.io.*; +import java.util.*; + + +/** + * Utility class to do data reduction on CNV data. 
+ */ +public class AnalyzeCnvs { + + public static void main(String[] args) + throws Exception { + new AnalyzeCnvs().run(args); + } + + private void usage() { + System.out.println("Usage: AnalyzeCnvs ..."); + System.out.println(" -action "); + System.out.println(" -alignments or -"); + System.out.println(" -alignmentList "); + System.out.println(" -chromosome "); + System.out.println(" -start "); + System.out.println(" -end "); + System.out.println(" -bestAlignments"); + System.out.println(" -mismatchThreshold "); + System.out.println(" -binsize "); + System.out.println(" -output "); + System.out.println(" -verbose"); + System.out.println(" -debug"); + } + + private boolean parseArguments(String[] args) { + + int argpos = 0; + int argsleft = 0; + + while (argpos < args.length) { + argsleft = args.length - argpos; + String arg = args[argpos]; + if (arg.equals("-action") && argsleft > 1) { + argpos++; + mAction = args[argpos++]; + } else if (arg.equals("-alignments") && argsleft > 1) { + argpos++; + mAlignmentFilePath = args[argpos++]; + } else if (arg.equals("-alignmentList") && argsleft > 1) { + argpos++; + mAlignmentListFilePath = args[argpos++]; + } else if (arg.equals("-chromosome") && argsleft > 1) { + argpos++; + mChromosome = args[argpos++]; + } else if (arg.equals("-start") && argsleft > 1) { + argpos++; + mStartPosition = new Integer(args[argpos++]); + } else if (arg.equals("-end") && argsleft > 1) { + argpos++; + mEndPosition = new Integer(args[argpos++]); + } else if (arg.equals("-verbose")) { + argpos++; + mVerbose = true; + } else if (arg.equals("-mismatchThreshold") && argsleft > 1) { + argpos++; + mMismatchThreshold = new Integer(args[argpos++]); + } else if (arg.equals("-bestAlignments")) { + argpos++; + mReturnBestHits = true; + } else if (arg.equals("-binsize") && argsleft > 1) { + argpos++; + mBinSize = Integer.parseInt(args[argpos++]); + } else if (arg.equals("-output") && argsleft > 1) { + argpos++; + mOutputColumns = args[argpos++]; + } else if 
(arg.equals("-debug")) { + argpos++; + mDebug = true; + } else if (arg.startsWith("-")) { + usage(); + return false; + } else { + break; + } + } + + argsleft = args.length - argpos; + if (argsleft != 0) { + usage(); + return false; + } + + return true; + } + + private void run(String[] args) + throws Exception { + + if (!parseArguments(args)) { + System.exit(1); + } + + if (mAction == null) { + mAction = "alignmentCoverage"; + } + + if (mAction.equals("alignmentCoverage")) { + mainAlignmentCoverage(); + } else { + System.out.println("Unknown action: " + mAction); + usage(); + System.exit(1); + } + } + + private void mainAlignmentCoverage() + throws IOException { + + if (mStartPosition == null || mEndPosition == null) { + usage(); + System.exit(1); + } else if (mStartPosition <= 0 || mEndPosition <= 0 || mStartPosition > mEndPosition) { + System.out.println("Invalid start/end positions: " + mStartPosition + " " + mEndPosition); + usage(); + System.exit(1); + } + + mSequenceId = chromosomeToSequenceId(mChromosome); + if (mSequenceId < 0) { + System.out.println("Invalid chromosome: " + mChromosome); + usage(); + System.exit(1); + } + + if (mBinSize <= 0) { + System.out.println("Invalid bin size: " + mBinSize); + usage(); + System.exit(1); + } + + runAlignmentCoverage(); + } + + private void runAlignmentCoverage() + throws IOException { + + int length = (mEndPosition - mStartPosition + 1); + if (length <= 0) { + throw new RuntimeException("Invalid start/end positions"); + } + + int binSize = mBinSize; + int binCount = (length + binSize - 1) / binSize; + int[] readStarts = new int[binCount]; + int[] readDepths = new int[binCount]; + List alignmentFiles = getAlignmentFiles(); + for (String path : alignmentFiles) { + processAlignmentFile(path, readStarts, readDepths); + } + printStats(readStarts, readDepths); + } + + private List getAlignmentFiles() + throws IOException { + List fileList = new ArrayList(); + if (mAlignmentListFilePath != null) { + LineNumberReader reader 
= new LineNumberReader(new FileReader(mAlignmentListFilePath)); + while (true) { + String line = reader.readLine(); + if (line == null) { + reader.close(); + break; + } + String path = line.trim(); + if (path.length() == 0 || path.startsWith("#")) { + continue; + } + fileList.add(path); + } + } else if (mAlignmentFilePath != null) { + fileList.add(mAlignmentFilePath); + } + return fileList; + } + + private void processAlignmentFile(String path, int[] readStarts, int[] readDepths) + throws IOException { + + LookAlignReader reader = null; + if (path == null || path.equals("-")) { + reader = new LookAlignReader(new InputStreamReader(System.in)); + } else { + reader = new LookAlignReader(new File(path)); + } + + while (true) { + Alignment alignment = getNextAlignment(reader); + if (alignment == null) { + reader.close(); + break; + } + processAlignment(alignment, readStarts, readDepths); + } + } + + private void processAlignment(Alignment alignment, + int[] readStarts, + int[] readDepths) { + + if (readStarts != null) { + int baseOffset = alignment.getBStart() - mStartPosition; + int binIndex = baseOffset / mBinSize; + if (binIndex >= 0 && binIndex < readStarts.length) { + readStarts[binIndex]++; + } + } + + if (readDepths != null) { + int baseOffset = alignment.getBStart() - mStartPosition; + int[] alignmentBlocks = alignment.getAlignmentBlocks(); + for (int i = 0; i < alignmentBlocks.length; i += 3) { + int gap = alignmentBlocks[i]; + int duration = alignmentBlocks[i+1]; + if (gap > 0) { + // Gap in B sequence (genome) + // Negative gaps are gaps in A sequence (read) + baseOffset += gap; + } + for (int j = 0; j < duration; j++) { + int binIndex = baseOffset / mBinSize; + if (binIndex >= 0 && binIndex < readDepths.length) { + readDepths[binIndex]++; + } + baseOffset++; + } + } + } + } + + private Alignment getNextAlignment(LookAlignReader reader) + throws IOException { + + if (!mReturnBestHits) { + while (reader.hasNext()) { + Alignment alignment = reader.next(); + if 
(passesAlignmentFilters(alignment)) { + return alignment; + } + } + return null; + } + + while (true) { + Alignment seed = mPendingAlignment; + mPendingAlignment = null; + if (seed == null && reader.hasNext()) { + seed = reader.next(); + } + if (seed == null) { + return null; + } + List secondaryHits = null; + while (reader.hasNext()) { + Alignment alignment = reader.next(); + if (alignment.getASequenceId() != seed.getASequenceId()) { + if (alignment.getASequenceId() < seed.getASequenceId()) { + throw new RuntimeException("Alignments not sorted by A sequence: " + alignment.format()); + } + mPendingAlignment = alignment; + break; + } + if (secondaryHits == null) { + secondaryHits = new ArrayList(); + } + secondaryHits.add(alignment); + } + if (secondaryHits == null) { + if (!passesAlignmentFilters(seed)) { + continue; + } + return seed; + } + secondaryHits.add(seed); + Alignment result = getUniqueBestAlignment(secondaryHits); + if (result != null && passesAlignmentFilters(result)) { + return result; + } + } + } + + private Alignment getUniqueBestAlignment(List alignments) { + int bestMismatches = 0; + List best = new ArrayList(); + for (Alignment a : alignments) { + int mismatches = getAlignmentMismatches(a); + if (best.isEmpty()) { + best.add(a); + bestMismatches = mismatches; + } + if (mismatches == bestMismatches) { + best.add(a); + } else if (mismatches < bestMismatches) { + best.clear(); + best.add(a); + bestMismatches = mismatches; + } + } + if (best.size() != 1) { + return null; + } + return best.get(0); + } + + private boolean passesAlignmentFilters(Alignment alignment) { + + if (mMismatchThreshold != null) { + if (getAlignmentMismatches(alignment) > mMismatchThreshold) { + return false; + } + } + + if (mSequenceId != null) { + if (alignment.getBSequenceId() != mSequenceId) { + return false; + } + } + + if (mStartPosition != null) { + if (alignment.getBEnd() < mStartPosition) { + return false; + } + } + + if (mEndPosition != null) { + if 
(alignment.getBStart() > mEndPosition) { + return false; + } + } + + return true; + } + + private int getAlignmentMismatches(Alignment alignment) { + int mismatches = 0; + int[] blocks = alignment.getAlignmentBlocks(); + for (int i = 0; i < blocks.length; i += 3) { + int gap = blocks[i]; + int duration = blocks[i+1]; + int mm = blocks[i+2]; + if (mm > duration) { + throw new RuntimeException("Invalid alignment? : " + alignment.format()); + } + mismatches += Math.abs(gap); + mismatches += mm; + } + return mismatches; + } + + private void printStats(int[] readStarts, int[] readDepths) { + if (mOutputColumns != null && mOutputColumns.equals("coverage")) { + // No headers, just coverage + for (int i = 0; i < readDepths.length; i++) { + String line = ""; + if (mBinSize == 1) { + line += readDepths[i]; + } else { + line += (readDepths[i] / (double) mBinSize); + } + System.out.println(line); + } + } else { + System.out.println("Position" + "\t" + "Starts" + "\t" + "Coverage"); + for (int i = 0; i < readDepths.length; i++) { + String line = ""; + int position = mStartPosition + i*mBinSize; + line += position + "\t" + readStarts[i] + "\t"; + if (mBinSize == 1) { + line += readDepths[i]; + } else { + line += (readDepths[i] / (double) mBinSize); + } + System.out.println(line); + } + } + } + + private int chromosomeToSequenceId(String text) { + if (text == null || text.length() == 0) { + return -1; + } + if (text.matches("\\d+")) { + return Integer.parseInt(text); + } + if (text.startsWith("chr") && text.length() > 3) { + text = text.substring(3); + } + if (text.matches("\\d+") && !text.startsWith("0")) { + return Integer.parseInt(text); + } + if (text.equals("M")) { + return 0; + } else if (text.equals("X")) { + return 23; + } else if (text.equals("Y")) { + return 24; + } else { + return -1; + } + } + + private boolean mDebug = false; + private boolean mVerbose = false; + + private String mAction = null; + private String mAlignmentFilePath = null; + private String 
mAlignmentListFilePath = null; + private String mChromosome = null; + private Integer mStartPosition = null; + private Integer mEndPosition = null; + private Integer mSequenceId = null; + private boolean mReturnBestHits = false; + private Integer mMismatchThreshold = null; + private int mBinSize = 1; + private String mOutputColumns = null; + private Alignment mPendingAlignment = null; +} diff --git a/lib/edu/mit/broad/cnv/CountAlignments.java b/lib/edu/mit/broad/cnv/CountAlignments.java new file mode 100644 index 0000000000..e0d60255d9 --- /dev/null +++ b/lib/edu/mit/broad/cnv/CountAlignments.java @@ -0,0 +1,283 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ +package edu.mit.broad.cnv; + +import edu.mit.broad.arachne.Alignment; +import edu.mit.broad.arachne.LookAlignReader; + +import java.io.*; +import java.util.*; + +/** + * Utility to count alignments (rather than gathering). 
+ */ +public class CountAlignments { + + public static void main(String[] args) + throws Exception { + new CountAlignments().run(args); + } + + private void usage() { + System.out.println("Usage: CountAlignments ..."); + System.out.println(" -alignments (- for stdin)"); + System.out.println(" -chromosome "); + System.out.println(" -start "); + System.out.println(" -end "); + System.out.println(" -bestAlignments"); + System.out.println(" -mismatchThreshold "); + System.out.println(" -verbose"); + System.out.println(" -debug"); + } + + private boolean parseArguments(String[] args) { + + int argpos = 0; + int argsleft = 0; + + while (argpos < args.length) { + argsleft = args.length - argpos; + String arg = args[argpos]; + if (arg.equals("-alignments") && argsleft > 1) { + argpos++; + mAlignmentFilePath = args[argpos++]; + } else if (arg.equals("-mismatchThreshold") && argsleft > 1) { + argpos++; + mMismatchThreshold = new Integer(args[argpos++]); + } else if (arg.equals("-bestAlignments")) { + argpos++; + mReturnBestHits = true; + } else if (arg.equals("-chromosome") && argsleft > 1) { + argpos++; + String chromosome = args[argpos++]; + mSequenceId = chromosomeToSequenceId(chromosome); + if (mSequenceId < 0) { + System.out.println("Invalid chromosome: " + chromosome); + return false; + } + } else if (arg.equals("-start") && argsleft > 1) { + argpos++; + mStartPosition = new Integer(args[argpos++]); + } else if (arg.equals("-end") && argsleft > 1) { + argpos++; + mEndPosition = new Integer(args[argpos++]); + } else if (arg.equals("-verbose")) { + argpos++; + mVerbose = true; + } else if (arg.equals("-debug")) { + argpos++; + mDebug = true; + } else if (arg.startsWith("-")) { + usage(); + return false; + } else { + break; + } + } + + argsleft = args.length - argpos; + if (argsleft != 0) { + usage(); + return false; + } + + return true; + } + + private void run(String[] args) + throws Exception { + + if (!parseArguments(args)) { + System.exit(1); + } + + long[] counts = 
countAlignments(mAlignmentFilePath); + String line = counts[0] + " " + counts[1]; + if (mAlignmentFilePath != null) { + line = mAlignmentFilePath + " " + line; + } + System.out.println(line); + } + + private long[] countAlignments(String path) + throws IOException { + long alignmentCount = 0; + long baseCount = 0; + LookAlignReader reader = null; + if (path == null || path.equals("-")) { + reader = new LookAlignReader(new InputStreamReader(System.in)); + } else { + reader = new LookAlignReader(new File(path)); + } + while (true) { + Alignment alignment = getNextAlignment(reader); + if (alignment == null) { + reader.close(); + break; + } + if (mMismatchThreshold != null) { + if (getAlignmentMismatches(alignment) > mMismatchThreshold) { + continue; + } + } + if (mSequenceId != null) { + if (alignment.getBSequenceId() != mSequenceId) { + continue; + } + } + if (mStartPosition != null) { + if (alignment.getBEnd() < mStartPosition) { + continue; + } + } + if (mEndPosition != null) { + if (alignment.getBStart() > mEndPosition) { + continue; + } + } + alignmentCount++; + baseCount += getBaseCount(alignment); + } + long[] result = { alignmentCount, baseCount }; + return result; + } + + private Alignment getNextAlignment(LookAlignReader reader) + throws IOException { + if (!mReturnBestHits) { + if (!reader.hasNext()) { + return null; + } + return reader.next(); + } + while (true) { + Alignment seed = mPendingAlignment; + mPendingAlignment = null; + if (seed == null && reader.hasNext()) { + seed = reader.next(); + } + if (seed == null) { + return null; + } + List secondaryHits = null; + while (reader.hasNext()) { + Alignment alignment = reader.next(); + if (alignment.getASequenceId() != seed.getASequenceId()) { + if (alignment.getASequenceId() < seed.getASequenceId()) { + throw new RuntimeException("Alignments not sorted by A sequence: " + alignment.format()); + } + mPendingAlignment = alignment; + break; + } + if (secondaryHits == null) { + secondaryHits = new ArrayList(); 
+ } + secondaryHits.add(alignment); + } + if (secondaryHits == null) { + return seed; + } + secondaryHits.add(seed); + Alignment result = getUniqueBestAlignment(secondaryHits); + if (result != null) { + return result; + } + } + } + + private Alignment getUniqueBestAlignment(List alignments) { + int bestMismatches = 0; + List best = new ArrayList(); + for (Alignment a : alignments) { + int mismatches = getAlignmentMismatches(a); + if (best.isEmpty()) { + best.add(a); + bestMismatches = mismatches; + } + if (mismatches == bestMismatches) { + best.add(a); + } else if (mismatches < bestMismatches) { + best.clear(); + best.add(a); + bestMismatches = mismatches; + } + } + if (best.size() != 1) { + return null; + } + return best.get(0); + } + + private int getAlignmentMismatches(Alignment alignment) { + int mismatches = 0; + int[] blocks = alignment.getAlignmentBlocks(); + for (int i = 0; i < blocks.length; i += 3) { + int gap = blocks[i]; + int duration = blocks[i+1]; + int mm = blocks[i+2]; + if (mm > duration) { + throw new RuntimeException("Invalid alignment? : " + alignment.format()); + } + mismatches += Math.abs(gap); + mismatches += mm; + } + return mismatches; + } + + // Return the number of reference bases covered by this alignment. 
+ private int getBaseCount(Alignment alignment) { + int count = 0; + int[] blocks = alignment.getAlignmentBlocks(); + for (int i = 0; i < blocks.length; i += 3) { + // int gap = blocks[i]; + int duration = blocks[i+1]; + // int mm = blocks[i+2]; + count += duration; + } + return count; + } + + private int chromosomeToSequenceId(String text) { + if (text == null || text.length() == 0) { + return -1; + } + if (text.matches("\\d+")) { + return Integer.parseInt(text); + } + if (text.startsWith("chr") && text.length() > 3) { + text = text.substring(3); + } + if (text.matches("\\d+") && !text.startsWith("0")) { + return Integer.parseInt(text); + } + if (text.equals("M")) { + return 0; + } else if (text.equals("X")) { + return 23; + } else if (text.equals("Y")) { + return 24; + } else { + return -1; + } + } + + + private boolean mDebug = false; + private boolean mVerbose = false; + + private String mAlignmentFilePath = null; + private boolean mReturnBestHits = false; + private Integer mMismatchThreshold = null; + private Integer mSequenceId = null; + private Integer mStartPosition = null; + private Integer mEndPosition = null; + private Alignment mPendingAlignment = null; +} diff --git a/lib/edu/mit/broad/cnv/CountKMers.java b/lib/edu/mit/broad/cnv/CountKMers.java new file mode 100644 index 0000000000..0fa159615f --- /dev/null +++ b/lib/edu/mit/broad/cnv/CountKMers.java @@ -0,0 +1,1301 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ +package edu.mit.broad.cnv; + +import java.io.*; +import java.util.*; + + +/** + * Tool for counting unique kmers. 
+ */ +public class CountKMers +{ + private static final int NONUNIQUE_MARKER = -1; + private static boolean mUseOldFormat = false; + + private String mAction = null; + private static int mK = 0; + private int mBatchSize = 0; + private List mInputFiles = null; + private File mInputDirectory = null; + private File mOutputDirectory = null; + private boolean mVerbose = false; + private boolean mDebug = false; + + private List mSequenceList = null; + private List mSequenceOffsetList = null; + private List mSpillFileList = null; + private double mSpillFactor = 0.9; + + private long mKMerCount = 0; + private long mUniquePriorCount = 0; + private long mUniqueNewCount = 0; + private long mPriorMapUniqueCount = 0; + + private InputStream mPriorMapStream = null; + private int mPriorMapPosition = -1; + private int mPriorMapValue = 0; + private int mInputFileIndex = 0; + private LineNumberReader mCurrentReader = null; + private String mNextSequence = null; + private char[] mKMerBuffer = null; + private int mKMerBufferedCount = 0; + private String mLineBuffer = null; + private int mLineBufferIndex = 0; + private int mBaseIndex = -1; + private byte[] mIOBuffer = null; + + /* Design + Inputs: + - One or more fasta files to search (currently one). + - Output directory for the result files. + - Optionally an input k-1-mer file (output from previous pass). + Outputs: + - Unique kmer file: (sorted by kmer) + This is unique globally or unique wrt unique (K-1) mers (i.e. K unique, K-1 not). + - Per chromosome bit map: pos (implicit) new-bit cum-bit + New-bit is 1 if Kmer starting at pos is unique but (K-1)-mer is not. + Cum-bit is 1 if Kmer starting at pos is unique for some L <= K. + - Statistics + Plan: + - Reducing memory footprint is crucial. + - Sequential pass over the input sequences to generate kmers. + - BatchSize kmers are cached in memory, then sorted and uniqified. + - As batch array fills, batches are spilled to disk. 
+ - Batches are reloaded from disk and merged (N-finger algorithm) + - and streamed to a merge file. + - Merge file is read from disk and processed as final results. + */ + + public static void main(String[] args) + throws Exception { + new CountKMers().run(args); + } + + private void usage() { + System.out.println("Usage: CountKMers ..."); + System.out.println(" -action "); + System.out.println(" -genome "); + System.out.println(" -k "); + System.out.println(" -batchSize "); + System.out.println(" -inputDir "); + System.out.println(" -outputDir "); + System.out.println(" -verbose"); + System.out.println(" -debug"); + } + + private boolean parseArguments(String[] args) { + + int argpos = 0; + int argsleft = 0; + + while (argpos < args.length) { + argsleft = args.length - argpos; + String arg = args[argpos]; + if (arg.equals("-action") && argsleft > 1) { + argpos++; + mAction = args[argpos++]; + } else if (arg.equals("-genome") && argsleft > 1) { + argpos++; + if (mInputFiles == null) { + mInputFiles = new ArrayList(); + } + mInputFiles.add(new File(args[argpos++])); + } else if (arg.equals("-k") && argsleft > 1) { + argpos++; + mK = Integer.parseInt(args[argpos++]); + } else if (arg.equals("-batchSize") && argsleft > 1) { + argpos++; + mBatchSize = Integer.parseInt(args[argpos++]); + } else if (arg.equals("-inputDir") && argsleft > 1) { + argpos++; + mInputDirectory = new File(args[argpos++]); + } else if (arg.equals("-outputDir") && argsleft > 1) { + argpos++; + mOutputDirectory = new File(args[argpos++]); + } else if (arg.equals("-oldFormat")) { + argpos++; + mUseOldFormat = true; + } else if (arg.equals("-verbose")) { + argpos++; + mVerbose = true; + } else if (arg.equals("-debug")) { + argpos++; + mDebug = true; + } else if (arg.startsWith("-")) { + usage(); + return false; + } else { + break; + } + } + + argsleft = args.length - argpos; + if (argsleft != 0) { + usage(); + return false; + } + + return true; + } + + private void run(String[] args) + throws 
Exception { + if (!parseArguments(args)) { + System.exit(1); + } + if (mAction == null || mAction.equals("mapKMers")) { + mapKMers(); + } else if (mAction.equals("mapGaps")) { + mapGaps(); + } + } + + // Can be used to scan genome for sequence names/lengths. + private void scanKMers() + throws IOException { + mSequenceList = new ArrayList(); + mSequenceOffsetList = new ArrayList(); + File priorMapFile = + new File(mOutputDirectory, "unique_" + (mK-1) + "_mers_map.bin"); + openPriorMap(priorMapFile); + while (true) { + String seqName = getNextSequence(); + if (seqName == null) { + break; + } + mSequenceList.add(seqName); + mSequenceOffsetList.add(mBaseIndex+1); + log("Scanning " + seqName + " ..."); + while (true) { + char[] kmerChars = getNextKMer(); + if (kmerChars == null) { + break; + } + mKMerCount++; + if (isUniqueInPriorMap(mBaseIndex)) { + continue; + } + } + } + closePriorMap(); + } + + private void mapGaps() + throws IOException { + while (true) { + String seqName = getNextSequence(); + if (seqName == null) { + break; + } + int pos = 0; + int gapStart = 0; + while (true) { + char base = getNextBase(); + if (base == 0) { + break; + } + pos++; + if (base == 'N') { + if (gapStart == 0) { + gapStart = pos; + } + } else { + if (gapStart > 0) { + System.out.println(seqName + "\t" + gapStart + "\t" + (pos-1)); + gapStart = 0; + } + } + } + if (gapStart > 0) { + System.out.println(seqName + "\t" + gapStart + "\t" + (pos-1)); + gapStart = 0; + } + } + } + + private void mapKMers() + throws IOException { + + File textKMerFile = + new File(mOutputDirectory, "unique_" + mK + "_mers.txt"); + File binaryKMerFile = + new File(mOutputDirectory, "unique_" + mK + "_mers.bin"); + File exceptionFile = + new File(mOutputDirectory, "unique_" + mK + "_mers.extra"); + File mapFile = + new File(mOutputDirectory, "unique_" + mK + "_mers_map.bin"); + File priorMapFile = + new File(mOutputDirectory, "unique_" + (mK-1) + "_mers_map.bin"); + File statsFile = + new 
File(mOutputDirectory, "unique_" + mK + "_mers_stats.txt"); + + if (mBatchSize == 0) { + throw new RuntimeException("Batch size not specified"); + } + + int kmerCount = 0; + int batchSize = mBatchSize; + KMerPosition[] kmerArray = new KMerPosition[batchSize]; + List exceptionList = new ArrayList(); + mSequenceList = new ArrayList(); + mSequenceOffsetList = new ArrayList(); + mIOBuffer = new byte[Math.max(20,4 + 2*((mK + 7)/8))]; + + openPriorMap(priorMapFile); + + while (true) { + String seqName = getNextSequence(); + if (seqName == null) { + break; + } + mSequenceList.add(seqName); + mSequenceOffsetList.add(mBaseIndex+1); + log("Processing " + seqName + " ..."); + while (true) { + char[] kmerChars = getNextKMer(); + if (kmerChars == null) { + break; + } + mKMerCount++; + int baseIndex = mBaseIndex; + if (isUniqueInPriorMap(baseIndex)) { + mUniquePriorCount++; + continue; + } + KMerPosition kmp = encodeKMer(kmerChars, baseIndex); + if (kmp == null) { + String kmer = new String(kmerChars); + exceptionList.add(new StringKMerPosition(kmer, baseIndex)); + continue; + } + kmerArray[kmerCount++] = kmp; + if (kmerCount == batchSize) { + kmerCount = compactKMers(kmerArray, kmerCount); + if (kmerCount > mSpillFactor * batchSize) { + spillKMers(kmerArray, kmerCount); + kmerCount = 0; + } + } + } + } + if (kmerCount > 0) { + kmerCount = compactKMers(kmerArray, kmerCount); + if (mSpillFileList != null) { + spillKMers(kmerArray, kmerCount); + kmerCount = 0; + } + } + + closePriorMap(); + + // Write out the exception kmers (text file). + compactKMers(exceptionList); + writeExceptionFile(exceptionList, exceptionFile); + + // Write out the binary file of unique encoded kmers. 
+ if (mSpillFileList == null) { + kmerCount = removeNonUnique(kmerArray, kmerCount); + writeKMerBinaryFile(kmerArray, kmerCount, binaryKMerFile); + mUniqueNewCount = kmerCount; + } else { + mUniqueNewCount = mergeSpillFiles(mSpillFileList, binaryKMerFile); + } + mUniqueNewCount += countUniqueKMers(exceptionList); + + // Write out the text file of (all) unique kmers. + writeKMerTextFile(binaryKMerFile, exceptionList, textKMerFile); + + // Create map file from prior map plus the new unique kmers. + int mapSize = ((mBaseIndex >> 2) & 0x3FFFFFFF) + 1; + createMapFile(mapSize, binaryKMerFile, exceptionList, priorMapFile, mapFile); + + // Write summary statistics file. + writeSummaryStatistics(statsFile); + } + + private int compactKMers(KMerPosition[] kmerArray, int kmerCount) { + if (kmerCount == 0) { + return 0; + } + log("Compacting " + kmerCount + " kmers at index " + + Integer.toHexString(mBaseIndex) + " ..."); + Arrays.sort(kmerArray, 0, kmerCount); + int newCount = 1; + KMerPosition current = kmerArray[0]; + for (int i = 1; i < kmerCount; i++) { + KMerPosition kmp = kmerArray[i]; + if (current.compareTo(kmp) == 0) { + current.setBaseIndex(NONUNIQUE_MARKER); + } else { + kmerArray[newCount++] = kmp; + current = kmp; + } + } + log("Compaction finished, new count is " + newCount); + return newCount; + } + + private int compactKMers(StringKMerPosition[] kmerArray, int kmerCount) { + if (kmerCount == 0) { + return 0; + } + log("Compacting " + kmerCount + " string kmers ..."); + Arrays.sort(kmerArray, 0, kmerCount); + int newCount = 1; + String kmerString = kmerArray[0].getKMer(); + for (int i = 1; i < kmerCount; i++) { + StringKMerPosition kmp = kmerArray[i]; + String ks = kmp.getKMer(); + if (ks.equals(kmerString)) { + kmerArray[newCount-1].setBaseIndex(NONUNIQUE_MARKER); + } else { + kmerArray[newCount++] = kmp; + kmerString = ks; + } + } + log("Compaction finished, new count is " + newCount); + return newCount; + } + + private void compactKMers(List kmerList) { + 
int kmerCount = kmerList.size(); + if (kmerCount <= 1) { + return; + } + StringKMerPosition[] kmerArray = + kmerList.toArray(new StringKMerPosition[kmerCount]); + kmerCount = compactKMers(kmerArray, kmerCount); + kmerList.clear(); + for (int i = 0; i < kmerCount; i++) { + kmerList.add(kmerArray[i]); + } + } + + private int removeNonUnique(KMerPosition[] kmerArray, int kmerCount) { + int uniqueCount = 0; + for (int i = 0; i < kmerCount; i++) { + KMerPosition kmp = kmerArray[i]; + if (kmp.getBaseIndex() != NONUNIQUE_MARKER) { + kmerArray[uniqueCount++] = kmp; + } + } + return uniqueCount; + } + + private int countUniqueKMers(List kmerList) { + int uniqueCount = 0; + for (StringKMerPosition kmp : kmerList) { + if (kmp.getBaseIndex() != NONUNIQUE_MARKER) { + uniqueCount++; + } + } + return uniqueCount; + } + + private void spillKMers(KMerPosition[] kmerArray, int kmerCount) + throws IOException { + if (mSpillFileList == null) { + mSpillFileList = new ArrayList(); + } + int fileNumber = mSpillFileList.size() + 1; + log("Spilling " + kmerCount + " kmers to file " + fileNumber + " ..."); + File spillFile = new File(mOutputDirectory, + "spill_" + mK + "_" + fileNumber + ".tmp"); + mSpillFileList.add(spillFile); + writeKMerBinaryFile(kmerArray, kmerCount, spillFile); + log("Spill file written"); + } + + private void writeKMerBinaryFile(KMerPosition[] kmerArray, + int kmerCount, + File outputFile) + throws IOException { + OutputStream outputStream = + new BufferedOutputStream(new FileOutputStream(outputFile)); + for (int i = 0; i < kmerCount; i++) { + KMerPosition kmp = kmerArray[i]; + writeKMerPosition(outputStream, kmerArray[i]); + } + outputStream.flush(); + outputStream.close(); + } + + private void writeExceptionFile(List kmerList, + File outputFile) + throws IOException { + PrintWriter writer = + new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); + for (StringKMerPosition kmer : kmerList) { + writeUniqueKMer(kmer, writer); + } + writer.flush(); + 
writer.close(); + } + + private KMerPosition readKMerPosition(InputStream stream) + throws IOException { + if (mUseOldFormat) { + return readKMerPositionOldFormat(stream); + } + byte[] buffer = mIOBuffer; + int encodingLength = (mK + 7)/8; + int fileLength = 4 + 2*encodingLength; + int count = readFully(stream, buffer, 0, fileLength); + if (count <= 0) { + return null; + } else if (count != fileLength) { + throw new RuntimeException("Unexpected end of file"); + } + char[] encoding = new char[encodingLength]; + int baseIndex = ((buffer[0] & 0xFF) | + (buffer[1] & 0xFF) << 8 | + (buffer[2] & 0xFF) << 16 | + (buffer[3] & 0xFF) << 24); + for (int i = 0; i < encodingLength; i++) { + encoding[i] = (char) ((buffer[2*i+4] & 0xFF) | + ((buffer[2*i+5] & 0xFF) << 8)); + } + return new KMerPositionN(encoding, baseIndex); + } + + private KMerPosition readKMerPositionOldFormat(InputStream stream) + throws IOException { + byte[] buffer = mIOBuffer; + int length = (mK >= 32 ? 20 : 12); + int count = readFully(stream, buffer, 0, length); + if (count <= 0) { + return null; + } else if (count != length) { + throw new RuntimeException("Unexpected end of file"); + } + long encoding = (((long)(buffer[0] & 0xFF)) | + ((long)(buffer[1] & 0xFF)) << 8 | + ((long)(buffer[2] & 0xFF)) << 16 | + ((long)(buffer[3] & 0xFF)) << 24 | + ((long)(buffer[4] & 0xFF)) << 32 | + ((long)(buffer[5] & 0xFF)) << 40 | + ((long)(buffer[6] & 0xFF)) << 48 | + ((long)(buffer[7] & 0xFF)) << 56); + int baseIndex = ((buffer[length-4] & 0xFF) | + (buffer[length-3] & 0xFF) << 8 | + (buffer[length-2] & 0xFF) << 16 | + (buffer[length-1] & 0xFF) << 24); + if (length == 12) { + return new KMerPosition1(encoding, baseIndex); + } else { + long encoding2 = (((long)(buffer[8] & 0xFF)) | + ((long)(buffer[9] & 0xFF)) << 8 | + ((long)(buffer[10] & 0xFF)) << 16 | + ((long)(buffer[11] & 0xFF)) << 24 | + ((long)(buffer[12] & 0xFF)) << 32 | + ((long)(buffer[13] & 0xFF)) << 40 | + ((long)(buffer[14] & 0xFF)) << 48 | + 
((long)(buffer[15] & 0xFF)) << 56); + return new KMerPosition2(encoding, encoding2, baseIndex); + } + } + + private int readFully(InputStream stream, byte[] buffer, int offset, int count) + throws IOException { + int readCount = 0; + while (readCount < count) { + int read = stream.read(buffer, offset, count-readCount); + if (read <= 0) { + break; + } + offset += read; + readCount += read; + } + return readCount; + } + + private void writeKMerPosition(OutputStream stream, KMerPosition kmer) + throws IOException { + if (mUseOldFormat) { + writeKMerPositionOldFormat(stream, kmer); + return; + } + byte[] buffer = mIOBuffer; + int baseIndex = kmer.getBaseIndex(); + char[] encoding = kmer.getKMerEncoding(); + int offset = 0; + buffer[offset++] = (byte) ((baseIndex) & 0xFF); + buffer[offset++] = (byte) ((baseIndex >> 8) & 0xFF); + buffer[offset++] = (byte) ((baseIndex >> 16) & 0xFF); + buffer[offset++] = (byte) ((baseIndex >> 24) & 0xFF); + for (int i = 0; i < encoding.length; i++) { + buffer[offset++] = (byte) ((encoding[i]) & 0xFF); + buffer[offset++] = (byte) ((encoding[i] >> 8) & 0xFF); + } + stream.write(buffer, 0, offset); + } + + private void writeKMerPositionOldFormat(OutputStream stream, KMerPosition kmer) + throws IOException { + byte[] buffer = mIOBuffer; + long encoding1 = kmer.getKMerEncoding1(); + long encoding2 = kmer.getKMerEncoding2(); + int baseIndex = kmer.getBaseIndex(); + int offset = 0; + buffer[offset++] = (byte) ((encoding1) & 0xFF); + buffer[offset++] = (byte) ((encoding1 >> 8) & 0xFF); + buffer[offset++] = (byte) ((encoding1 >> 16) & 0xFF); + buffer[offset++] = (byte) ((encoding1 >> 24) & 0xFF); + buffer[offset++] = (byte) ((encoding1 >> 32) & 0xFF); + buffer[offset++] = (byte) ((encoding1 >> 40) & 0xFF); + buffer[offset++] = (byte) ((encoding1 >> 48) & 0xFF); + buffer[offset++] = (byte) ((encoding1 >> 56) & 0xFF); + if (mK >= 32) { + buffer[offset++] = (byte) ((encoding2) & 0xFF); + buffer[offset++] = (byte) ((encoding2 >> 8) & 0xFF); + 
buffer[offset++] = (byte) ((encoding2 >> 16) & 0xFF); + buffer[offset++] = (byte) ((encoding2 >> 24) & 0xFF); + buffer[offset++] = (byte) ((encoding2 >> 32) & 0xFF); + buffer[offset++] = (byte) ((encoding2 >> 40) & 0xFF); + buffer[offset++] = (byte) ((encoding2 >> 48) & 0xFF); + buffer[offset++] = (byte) ((encoding2 >> 56) & 0xFF); + } + buffer[offset++] = (byte) ((baseIndex) & 0xFF); + buffer[offset++] = (byte) ((baseIndex >> 8) & 0xFF); + buffer[offset++] = (byte) ((baseIndex >> 16) & 0xFF); + buffer[offset++] = (byte) ((baseIndex >> 24) & 0xFF); + stream.write(buffer, 0, offset); + } + + private long mergeSpillFiles(List spillFiles, File outputFile) + throws IOException { + + if (spillFiles == null) { + return 0; + } + + log("Merging spill files ..."); + OutputStream outputStream = + new BufferedOutputStream(new FileOutputStream(outputFile)); + long uniqueCount = 0; + int fileCount = spillFiles.size(); + InputStream[] inputStreams = new InputStream[fileCount]; + KMerPosition[] kmers = new KMerPosition[fileCount]; + for (int i = 0; i < fileCount; i++) { + inputStreams[i] = + new BufferedInputStream(new FileInputStream(spillFiles.get(i))); + } + while (true) { + for (int i = 0; i < fileCount; i++) { + if (kmers[i] == null && inputStreams[i] != null) { + kmers[i] = readKMerPosition(inputStreams[i]); + if (kmers[i] == null) { + inputStreams[i].close(); + inputStreams[i] = null; + } + } + } + int count = 0; + KMerPosition kmer = null; + for (int i = 0; i < fileCount; i++) { + KMerPosition kmp = kmers[i]; + if (kmp == null) { + continue; + } else if (kmer == null) { + kmer = kmp; + count = 1; + } else { + int cmp = kmp.compareTo(kmer); + if (cmp == 0) { + count++; + } else if (cmp < 0) { + kmer = kmp; + count = 1; + } + } + } + if (kmer == null) { + break; + } + for (int i = 0; i < fileCount; i++) { + if (kmers[i] == kmer) { + kmers[i] = null; + } + } + if (count == 1 && kmer.getBaseIndex() != NONUNIQUE_MARKER) { + uniqueCount++; + writeKMerPosition(outputStream, 
kmer); + } + } + outputStream.flush(); + outputStream.close(); + for (int i = 0; i < fileCount; i++) { + // spillFiles.get(i).delete(); + } + log("Spill files merged, unique count is " + uniqueCount); + return uniqueCount; + } + + private void writeKMerTextFile(File inputFile, + List exceptionList, + File outputFile) + throws IOException { + + log("Writing kmer file " + outputFile + " ..."); + int exceptionIndex = 0; + StringKMerPosition excKMer = null; + Iterator excIter = null; + if (!exceptionList.isEmpty()) { + excIter = exceptionList.iterator(); + excKMer = excIter.next(); + } + + InputStream inputStream = + new BufferedInputStream(new FileInputStream(inputFile)); + PrintWriter writer = + new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); + KMerPosition kmer = readKMerPosition(inputStream); + while (kmer != null || excKMer != null) { + if (excKMer == null) { + writeUniqueKMer(kmer, writer); + kmer = readKMerPosition(inputStream); + } else if (kmer == null) { + writeUniqueKMer(excKMer, writer); + excKMer = excIter.hasNext() ? excIter.next() : null; + } else if (kmer.getKMer().compareTo(excKMer.getKMer()) < 0) { + writeUniqueKMer(kmer, writer); + kmer = readKMerPosition(inputStream); + } else { + writeUniqueKMer(excKMer, writer); + excKMer = excIter.hasNext() ? 
excIter.next() : null; + } + } + inputStream.close(); + writer.flush(); + writer.close(); + log("Wrote kmer file: " + outputFile); + } + + private void writeUniqueKMer(KMerPosition kmer, PrintWriter writer) { + if (kmer.getBaseIndex() != NONUNIQUE_MARKER) { + writeKMer(kmer.getKMer(), kmer.getBaseIndex(), writer); + } + } + + private void writeUniqueKMer(StringKMerPosition kmer, PrintWriter writer) { + if (kmer.getBaseIndex() != NONUNIQUE_MARKER) { + writeKMer(kmer.getKMer(), kmer.getBaseIndex(), writer); + } + } + + private void writeKMer(String kmer, int baseIndex, PrintWriter writer) { + String chr = getBaseIndexSequenceName(baseIndex); + int pos = getBaseIndexCoordinate(baseIndex); + writer.println(kmer + "\t" + chr + "\t" + pos); + } + + private void createMapFile(int mapSize, + File kmerFile, + List exceptionList, + File priorMapFile, + File mapFile) + throws IOException { + byte[] map = null; + long uniquePriorCount = 0; + if (priorMapFile.exists()) { + map = readMapFile(priorMapFile); + if (map.length != mapSize) { + throw new RuntimeException("Prior map is wrong size"); + } + // Clear the new bits from prior map. + // Also count the prior unique positions while we are at it. + // Note that this is a count of positions, not kmers. 
+ for (int i = 0; i < mapSize; i++) { + int cumBits = map[i] & 0x55; + uniquePriorCount += Integer.bitCount(cumBits); + map[i] = (byte) cumBits; + } + } else { + map = new byte[mapSize]; + } + for (StringKMerPosition kmp : exceptionList) { + addToMap(kmp, map); + } + mPriorMapUniqueCount = uniquePriorCount; + + InputStream inputStream = + new BufferedInputStream(new FileInputStream(kmerFile)); + while (true) { + KMerPosition kmp = readKMerPosition(inputStream); + if (kmp == null) { + inputStream.close(); + break; + } + addToMap(kmp, map); + } + + long testCum = 0; + for (int i = 0; i < map.length; i++) { + testCum += Integer.bitCount(map[i] & 0x55); + } + + writeMapFile(map, mapFile); + } + + private void addToMap(KMerPosition kmp, byte[] map) { + int baseIndex = kmp.getBaseIndex(); + if (baseIndex != NONUNIQUE_MARKER) { + addToMap(baseIndex, map); + } + } + + private void addToMap(StringKMerPosition kmp, byte[] map) { + int baseIndex = kmp.getBaseIndex(); + if (baseIndex != NONUNIQUE_MARKER) { + addToMap(baseIndex, map); + } + } + + private void addToMap(int baseIndex, byte[] map) { + int mod = baseIndex & 0x3; + int offset = (baseIndex >> 2) & 0x3FFFFFFF; + if (((map[offset] >> (2*mod)) & 0x3) != 0) { + throw new RuntimeException("Map entry already set: " + baseIndex); + } + map[offset] |= (0x3 << (2*mod)); + } + + private void writeSummaryStatistics(File outputFile) + throws IOException { + PrintWriter writer = + new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); + long baseCount = (mBaseIndex + 1) & 0xFFFFFFFFL; + long uniqueCount = mUniquePriorCount + mUniqueNewCount; + long nonUniqueCount = mKMerCount - uniqueCount; + writer.println("K: " + mK); + writer.println("Sequences: " + mSequenceList.size()); + writer.println("Bases: " + baseCount); + writer.println("KMers: " + mKMerCount); + writer.println("Prior map count: " + mPriorMapUniqueCount); + writer.println("Unique prior: " + mUniquePriorCount + + " (" + formatPercent(mUniquePriorCount, 
mKMerCount) + ")"); + writer.println("Unique new: " + mUniqueNewCount + + " (" + formatPercent(mUniqueNewCount, mKMerCount) + ")"); + writer.println("Unique cumulative: " + uniqueCount + + " (" + formatPercent(uniqueCount, mKMerCount) + ")"); + writer.println("Nonunique: " + nonUniqueCount + + " (" + formatPercent(nonUniqueCount, mKMerCount) + ")"); + writer.flush(); + writer.close(); + } + + private String formatPercent(long numerator, long denominator) { + double fraction = 0.0; + if (denominator != 0) { + fraction = numerator / (double) denominator; + } + return String.format("%1.1f%%", fraction * 100.0); + } + + private void openPriorMap(File mapFile) + throws IOException { + if (mapFile.exists()) { + mPriorMapStream = new BufferedInputStream(new FileInputStream(mapFile)); + mPriorMapPosition = -1; + mPriorMapValue = 0; + } + } + + private void closePriorMap() + throws IOException { + if (mPriorMapStream != null) { + mPriorMapStream.close(); + } + mPriorMapStream = null; + mPriorMapPosition = -1; + mPriorMapValue = 0; + } + + private byte[] readMapFile(File file) + throws IOException { + long fileLength = file.length(); + if (fileLength > 1000000000) { + throw new RuntimeException("Prior map too large: " + file); + } + int length = (int) fileLength; + byte[] map = new byte[length]; + FileInputStream stream = new FileInputStream(file); + int count = readFully(stream, map, 0, length); + if (count != length) { + throw new RuntimeException("Failed to read map: " + file); + } + stream.close(); + return map; + } + + private void writeMapFile(byte[] map, File file) + throws IOException { + FileOutputStream stream = new FileOutputStream(file); + stream.write(map); + stream.flush(); + stream.close(); + } + + private boolean isUniqueInPriorMap(int baseIndex) + throws IOException { + if (mPriorMapStream == null) { + return false; + } + int byteOffset = (baseIndex >> 2) & 0x3FFFFFFF; + if (byteOffset != mPriorMapPosition) { + int delta = byteOffset - mPriorMapPosition; + 
if (delta < 0) { + throw new RuntimeException("Attempt to seek backwards in prior map"); + } + if (delta > 1) { + skipFully(mPriorMapStream, delta-1); + } + mPriorMapValue = mPriorMapStream.read(); + if (mPriorMapValue < 0) { + throw new RuntimeException("Unexpected end of file in prior map"); + } + mPriorMapPosition += delta; + } + int mod = baseIndex & 0x3; + return (((mPriorMapValue >> (2*mod)) & 1) != 0); + } + + private void skipFully(InputStream stream, long amount) + throws IOException { + while (amount > 0) { + long skip = stream.skip(amount); + if (skip <= 0 || skip > amount) { + throw new RuntimeException("Skip failed"); + } + amount -= skip; + } + } + + private String getBaseIndexSequenceName(int baseIndex) { + int sequenceCount = mSequenceList.size(); + for (int i = 0; i < sequenceCount-1; i++) { + int nextOffset = mSequenceOffsetList.get(i+1); + if (compareBaseIndex(nextOffset, baseIndex) > 0) { + return mSequenceList.get(i); + } + } + return mSequenceList.get(sequenceCount-1); + } + + private int getBaseIndexCoordinate(int baseIndex) { + Integer sequenceOffset = null; + for (Integer offset : mSequenceOffsetList) { + if (compareBaseIndex(offset, baseIndex) > 0) { + break; + } + sequenceOffset = offset; + } + if (sequenceOffset == null) { + return 0; + } + int coordinate = baseIndex - sequenceOffset + 1; + if (coordinate <= 0) { + dumpSequenceList(); + System.out.println("coordinate: " + coordinate); + System.out.println("sequenceOffset: " + Integer.toHexString(sequenceOffset)); + System.out.println("baseIndex: " + Integer.toHexString(baseIndex)); + throw new RuntimeException("Internal error: illegal coordinate " + + coordinate + " for base index " + baseIndex); + } + return coordinate; + } + + private void dumpSequenceList() { + System.out.println("# Sequences:"); + int count = mSequenceList.size(); + for (int i = 0; i < count; i++) { + String seqName = mSequenceList.get(i); + int offset = mSequenceOffsetList.get(i); + System.out.println("# " + seqName 
+ + "\t" + offset + + "\t" + Integer.toHexString(offset)); + } + } + + private int compareBaseIndex(int baseIndex1, int baseIndex2) { + // Implements unsigned comparison, a la compareTo + if (baseIndex1 < 0 ^ baseIndex2 < 0) { + return ((baseIndex1 < 0) ? 1 : -1); + } else { + return (baseIndex1 - baseIndex2); + } + } + + private String getNextSequence() + throws IOException { + + while (mNextSequence == null) { + if (mCurrentReader == null) { + mCurrentReader = getNextReader(); + if (mCurrentReader == null) { + return null; + } + } + String line = mCurrentReader.readLine(); + if (line == null) { + mCurrentReader.close(); + mCurrentReader = null; + continue; + } + if (line.startsWith(">")) { + String[] tokens = line.substring(1).trim().split("\\s+"); + mNextSequence = tokens[0]; + } + } + String result = mNextSequence; + mNextSequence = null; + return result; + } + + private LineNumberReader getNextReader() + throws IOException { + if (mInputFileIndex >= mInputFiles.size()) { + return null; + } + File file = mInputFiles.get(mInputFileIndex++); + return new LineNumberReader(new FileReader(file)); + } + + private char[] getNextKMer() + throws IOException { + + if (mKMerBuffer == null) { + mKMerBuffer = new char[mK]; + } + System.arraycopy(mKMerBuffer, 1, mKMerBuffer, 0, mKMerBuffer.length - 1); + if (mKMerBufferedCount > 0) { + mKMerBufferedCount--; + } + + while (mKMerBufferedCount < mK) { + char base = getNextBase(); + if (base == 0) { + incrementBaseIndex(mKMerBufferedCount); + mKMerBufferedCount = 0; + return null; + } else if (base == 'N') { + incrementBaseIndex(mKMerBufferedCount+1); + mKMerBufferedCount = 0; + } else { + mKMerBuffer[mKMerBufferedCount++] = base; + } + } + incrementBaseIndex(1); + return mKMerBuffer; + } + + private char getNextBase() + throws IOException { + + if (mLineBuffer == null || mLineBufferIndex >= mLineBuffer.length()) { + if (mCurrentReader == null) { + return 0; + } + String line = mCurrentReader.readLine(); + if (line == null) { + 
mLineBuffer = null; + mLineBufferIndex = 0; + mCurrentReader.close(); + mCurrentReader = null; + return 0; + } + if (line.startsWith(">")) { + String[] tokens = line.substring(1).trim().split("\\s+"); + mNextSequence = tokens[0]; + mLineBuffer = null; + mLineBufferIndex = 0; + return 0; + } + mLineBuffer = line.toUpperCase(); + mLineBufferIndex = 0; + } + return mLineBuffer.charAt(mLineBufferIndex++); + } + + private void incrementBaseIndex(int amount) { + if (mBaseIndex < -1 && (mBaseIndex + amount) >= -1) { + throw new RuntimeException("Base index: 32-bit overflow"); + } + mBaseIndex += amount; + } + + private void log(String text) { + if (mVerbose) { + System.out.println("# " + new Date() + " " + text); + } + } + + private static KMerPosition encodeKMer(char[] kmerChars, int baseIndex) { + if (mUseOldFormat) { + return encodeKMerOldFormat(kmerChars, baseIndex); + } + if (kmerChars == null) { + return null; + } + int kmerLength = kmerChars.length; + int encodingLength = (kmerLength + 7) / 8; + char[] encoding = new char[encodingLength]; + int offset = kmerLength % 8; + offset = (offset == 0) ? 
8 : offset; + int bits = encodeKMerBits(kmerChars, 0, offset); + if (bits < 0) { + return null; + } + encoding[0] = (char) bits; + for (int i = 1; i < encodingLength; i++) { + bits = encodeKMerBits(kmerChars, offset, 8); + if (bits < 0) { + return null; + } + encoding[i] = (char) bits; + offset += 8; + } + return new KMerPositionN(encoding, baseIndex); + } + + private static KMerPosition encodeKMerOldFormat(char[] kmerChars, int baseIndex) { + if (kmerChars == null) { + return null; + } + int length = kmerChars.length; + if (length <= 31) { + long bits = encodeKMerBitsLong(kmerChars, 0, length); + if (bits == -1) { + return null; + } + return new KMerPosition1(bits, baseIndex); + } else if (length <= 62) { + long bits1 = encodeKMerBitsLong(kmerChars, 0, 31); + long bits2 = encodeKMerBitsLong(kmerChars, 31, length - 31); + if (bits1 == -1 || bits2 == -1) { + return null; + } + return new KMerPosition2(bits1, bits2, baseIndex); + } else { + return null; + } + } + + private static int encodeKMerBits(char[] kmerChars, int offset, int length) { + int bits = 0; + for (int i = 0; i < length; i++) { + char base = kmerChars[offset + i]; + int baseBits = "ACGT".indexOf(base); + if (baseBits < 0) { + return -1; + } + bits |= baseBits << (2*(length-i-1)); + } + return bits; + } + + private static long encodeKMerBitsLong(char[] kmerChars, int offset, int length) { + long bits = 0; + for (int i = 0; i < length; i++) { + char base = kmerChars[offset + i]; + int baseBits = "ACGT".indexOf(base); + if (baseBits < 0) { + return -1; + } + bits |= ((long)baseBits) << (2*(length-i-1)); + } + return bits; + } + + private static String decodeKMer1(long bits) { + int length = mK; + char[] buffer = new char[length]; + decodeKMerBits(bits, buffer, 0, length); + return new String(buffer); + } + + private static String decodeKMer2(long bits1, long bits2) { + int length = mK; + char[] buffer = new char[length]; + decodeKMerBits(bits1, buffer, 0, 31); + decodeKMerBits(bits2, buffer, 31, 
length-31); + return new String(buffer); + } + + private static String decodeKMerN(char[] encoding) { + int length = mK; + char[] buffer = new char[length]; + int offset = length % 8; + offset = (offset == 0) ? 8 : offset; + decodeKMerBits(encoding[0], buffer, 0, offset); + for (int i = 1; i < encoding.length; i++) { + decodeKMerBits(encoding[i], buffer, offset, 8); + offset += 8; + } + return new String(buffer); + } + + private static void decodeKMerBits(char bits, char[] buffer, int offset, int length) { + for (int i = 0; i < length; i++) { + int baseBits = (int) ((bits >> (2*(length-i-1))) & 0x3); + buffer[offset + i] = "ACGT".charAt(baseBits); + } + } + + private static void decodeKMerBits(long bits, char[] buffer, int offset, int length) { + for (int i = 0; i < length; i++) { + int baseBits = (int) ((bits >> (2*(length-i-1))) & 0x3); + buffer[offset + i] = "ACGT".charAt(baseBits); + } + } + + static class KMerPosition + implements Comparable { + + private int mBaseIndex; + + KMerPosition(int baseIndex) { + mBaseIndex = baseIndex; + } + + public String getKMer() { + return null; + } + + public long getKMerEncoding1() { + return -1; + } + + public long getKMerEncoding2() { + return -1; + } + + public final int getBaseIndex() { + return mBaseIndex; + } + + public final void setBaseIndex(int baseIndex) { + mBaseIndex = baseIndex; + } + + public char[] getKMerEncoding() { + return null; + } + + public int compareTo(KMerPosition kmp) { + char[] encoding1 = getKMerEncoding(); + char[] encoding2 = kmp.getKMerEncoding(); + int length = Math.max(encoding1.length, encoding2.length); + for (int i = 0; i < length; i++) { + int result = encoding1[i] - encoding2[i]; + if (result != 0) { + return result; + } + } + return 0; + } + } + + static class KMerPosition1 + extends KMerPosition { + + private long mKMerEncoding1; + + KMerPosition1(long kmer, int baseIndex) { + super(baseIndex); + mKMerEncoding1 = kmer; + } + + public String getKMer() { + return 
decodeKMer1(getKMerEncoding1()); + } + + public final long getKMerEncoding1() { + return mKMerEncoding1; + } + + public int compareTo(KMerPosition kmp) { + int result = Long.signum(getKMerEncoding1() - kmp.getKMerEncoding1()); + if (result == 0) { + result = Long.signum(getKMerEncoding2() - kmp.getKMerEncoding2()); + } + return result; + } + } + + static class KMerPosition2 + extends KMerPosition1 { + + private long mKMerEncoding2; + + KMerPosition2(long encoding1, long encoding2, int baseIndex) { + super(encoding1, baseIndex); + mKMerEncoding2 = encoding2; + } + + public String getKMer() { + return decodeKMer2(getKMerEncoding1(), getKMerEncoding2()); + } + + public final long getKMerEncoding2() { + return mKMerEncoding2; + } + } + + static class KMerPositionN + extends KMerPosition { + + private char[] mKMerEncoding; + + KMerPositionN(char[] encoding, int baseIndex) { + super(baseIndex); + mKMerEncoding = encoding; + } + + public String getKMer() { + return decodeKMerN(mKMerEncoding); + } + + public final char[] getKMerEncoding() { + return mKMerEncoding; + } + } + + static class StringKMerPosition + implements Comparable { + + private String mKMerString = null; + private int mBaseIndex; + + StringKMerPosition(String kmer, int baseIndex) { + mKMerString = kmer; + mBaseIndex = baseIndex; + } + + public final String getKMer() { + return mKMerString; + } + + public final int getBaseIndex() { + return mBaseIndex; + } + + public final void setBaseIndex(int baseIndex) { + mBaseIndex = baseIndex; + } + + public int compareTo(StringKMerPosition kmp) { + return mKMerString.compareTo(kmp.mKMerString); + } + } +} diff --git a/lib/edu/mit/broad/cnv/CountKMers3.java b/lib/edu/mit/broad/cnv/CountKMers3.java new file mode 100644 index 0000000000..81ddb17452 --- /dev/null +++ b/lib/edu/mit/broad/cnv/CountKMers3.java @@ -0,0 +1,1426 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad 
Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ +package edu.mit.broad.cnv; + +import java.io.*; +import java.util.*; + + +/** + * Tool for counting unique kmers. + */ +public class CountKMers3 +{ + private static final int NONUNIQUE_MARKER = -1; + private static boolean mUseOldFormat = false; + + private String mAction = null; + private static int mK = 0; + private int mBatchSize = 0; + private List mInputFiles = null; + private File mInputDirectory = null; + private File mOutputDirectory = null; + private boolean mVerbose = false; + private boolean mDebug = false; + + private List mSequenceList = null; + private List mSequenceOffsetList = null; + private List mSpillFileList = null; + private double mSpillFactor = 0.9; + + private long mKMerCount = 0; + private long mUniquePriorCount = 0; + private long mUniqueNewCount = 0; + private long mPriorMapUniqueCount = 0; + + private InputStream mPriorMapStream = null; + private int mPriorMapPosition = -1; + private int mPriorMapValue = 0; + private int mInputFileIndex = 0; + private LineNumberReader mCurrentReader = null; + private String mNextSequence = null; + private char[] mKMerBuffer = null; + private int mKMerBufferedCount = 0; + private String mLineBuffer = null; + private int mLineBufferIndex = 0; + private int mBaseIndex = -1; + private byte[] mIOBuffer = null; + + /* Design + Inputs: + - One or more fasta files to search (currently one). + - Output directory for the result files. + - Optionally an input k-1-mer file (output from previous pass). + Outputs: + - Unique kmer file: (sorted by kmer) + This is unique globally or unique wrt unique (K-1) mers (i.e. K unique, K-1 not). 
+ - Per chromosome bit map: pos (implicit) new-bit cum-bit + New-bit is 1 if Kmer starting at pos is unique but (K-1)-mer is not. + Cum-bit is 1 if Kmer starting at pos is unique for some L <= K. + - Statistics + Plan: + - Reducing memory footprint is crucial. + - Sequential pass over the input sequences to generate kmers. + - BatchSize kmers are cached in memory, then sorted and uniqified. + - As batch array fills, batches are spilled to disk. + - Batches are reloaded from disk and merged (N-finger algorithm) + - and streamed to a merge file. + - Merge file is read from disk and processed as final results. + */ + + public static void main(String[] args) + throws Exception { + new CountKMers3().run(args); + } + + private void usage() { + System.out.println("Usage: CountKMers ..."); + System.out.println(" -action "); + System.out.println(" -genome "); + System.out.println(" -k "); + System.out.println(" -batchSize "); + System.out.println(" -inputDir "); + System.out.println(" -outputDir "); + System.out.println(" -verbose"); + System.out.println(" -debug"); + } + + private boolean parseArguments(String[] args) { + + int argpos = 0; + int argsleft = 0; + + while (argpos < args.length) { + argsleft = args.length - argpos; + String arg = args[argpos]; + if (arg.equals("-action") && argsleft > 1) { + argpos++; + mAction = args[argpos++]; + } else if (arg.equals("-genome") && argsleft > 1) { + argpos++; + if (mInputFiles == null) { + mInputFiles = new ArrayList(); + } + mInputFiles.add(new File(args[argpos++])); + } else if (arg.equals("-k") && argsleft > 1) { + argpos++; + mK = Integer.parseInt(args[argpos++]); + } else if (arg.equals("-batchSize") && argsleft > 1) { + argpos++; + mBatchSize = Integer.parseInt(args[argpos++]); + } else if (arg.equals("-inputDir") && argsleft > 1) { + argpos++; + mInputDirectory = new File(args[argpos++]); + } else if (arg.equals("-outputDir") && argsleft > 1) { + argpos++; + mOutputDirectory = new File(args[argpos++]); + } else if 
(arg.equals("-oldFormat")) { + argpos++; + mUseOldFormat = true; + } else if (arg.equals("-verbose")) { + argpos++; + mVerbose = true; + } else if (arg.equals("-debug")) { + argpos++; + mDebug = true; + } else if (arg.startsWith("-")) { + usage(); + return false; + } else { + break; + } + } + + argsleft = args.length - argpos; + if (argsleft != 0) { + usage(); + return false; + } + + return true; + } + + private void run(String[] args) + throws Exception { + if (!parseArguments(args)) { + System.exit(1); + } + if (mAction == null || mAction.equals("mapKMers")) { + mapKMers(); + } else if (mAction.equals("mapGaps")) { + mapGaps(); + } + } + + // Can be used to scan genome for sequence names/lengths. + private void scanKMers() + throws IOException { + mSequenceList = new ArrayList(); + mSequenceOffsetList = new ArrayList(); + File priorMapFile = + new File(mOutputDirectory, "unique_" + (mK-1) + "_mers_map.bin"); + openPriorMap(priorMapFile); + while (true) { + String seqName = getNextSequence(); + if (seqName == null) { + break; + } + mSequenceList.add(seqName); + mSequenceOffsetList.add(mBaseIndex+1); + log("Scanning " + seqName + " ..."); + while (true) { + char[] kmerChars = getNextKMer(); + if (kmerChars == null) { + break; + } + mKMerCount++; + if (isUniqueInPriorMap(mBaseIndex)) { + continue; + } + } + } + closePriorMap(); + } + + private void mapGaps() + throws IOException { + while (true) { + String seqName = getNextSequence(); + if (seqName == null) { + break; + } + int pos = 0; + int gapStart = 0; + while (true) { + char base = getNextBase(); + if (base == 0) { + break; + } + pos++; + if (base == 'N') { + if (gapStart == 0) { + gapStart = pos; + } + } else { + if (gapStart > 0) { + System.out.println(seqName + "\t" + gapStart + "\t" + (pos-1)); + gapStart = 0; + } + } + } + if (gapStart > 0) { + System.out.println(seqName + "\t" + gapStart + "\t" + (pos-1)); + gapStart = 0; + } + } + } + + private void mapKMers() + throws IOException { + + File textKMerFile 
= + new File(mOutputDirectory, "unique_" + mK + "_mers.txt"); + File binaryKMerFile = + new File(mOutputDirectory, "unique_" + mK + "_mers.bin"); + File exceptionFile = + new File(mOutputDirectory, "unique_" + mK + "_mers.extra"); + File mapFile = + new File(mOutputDirectory, "unique_" + mK + "_mers_map.bin"); + File priorMapFile = + new File(mOutputDirectory, "unique_" + (mK-1) + "_mers_map.bin"); + File statsFile = + new File(mOutputDirectory, "unique_" + mK + "_mers_stats.txt"); + + if (mBatchSize == 0) { + throw new RuntimeException("Batch size not specified"); + } + + int kmerCount = 0; + int batchSize = mBatchSize; + KMerPosition[] kmerArray = new KMerPosition[batchSize]; + List exceptionList = new ArrayList(); + mSequenceList = new ArrayList(); + mSequenceOffsetList = new ArrayList(); + mIOBuffer = new byte[Math.max(20,4 + 2*((mK + 7)/8))]; + + openPriorMap(priorMapFile); + + while (true) { + String seqName = getNextSequence(); + if (seqName == null) { + break; + } + mSequenceList.add(seqName); + mSequenceOffsetList.add(mBaseIndex+1); + log("Processing " + seqName + " ..."); + while (true) { + char[] kmerChars = getNextKMer(); + if (kmerChars == null) { + break; + } + mKMerCount++; + int baseIndex = mBaseIndex; + if (isUniqueInPriorMap(baseIndex)) { + mUniquePriorCount++; + continue; + } + + KMerPosition kmp = encodeKMer(kmerChars, baseIndex); + if (kmp == null) { + // Note: We currently do not handle the reverse + // complement of exception characters correctly. + // For hg18, however, this doesn't matter as + // none of the kmers containing non-ACGT characters + // are present on the reverse strand. 
+ String kmer = new String(kmerChars); + exceptionList.add(new StringKMerPosition(kmer, baseIndex)); + continue; + } + kmerArray[kmerCount++] = kmp; + if (kmerCount == batchSize) { + kmerCount = compactKMers(kmerArray, kmerCount); + if (kmerCount > mSpillFactor * batchSize) { + spillKMers(kmerArray, kmerCount); + kmerCount = 0; + } + } + } + } + if (kmerCount > 0) { + kmerCount = compactKMers(kmerArray, kmerCount); + if (mSpillFileList != null) { + spillKMers(kmerArray, kmerCount); + kmerCount = 0; + } + } + + closePriorMap(); + + // Write out the exception kmers (text file). + compactKMers(exceptionList); + writeExceptionFile(exceptionList, exceptionFile); + + // Write out the binary file of unique encoded kmers. + if (mSpillFileList == null) { + kmerCount = removeNonUnique(kmerArray, kmerCount); + writeKMerBinaryFile(kmerArray, kmerCount, binaryKMerFile); + mUniqueNewCount = kmerCount; + } else { + mUniqueNewCount = mergeSpillFiles(mSpillFileList, binaryKMerFile); + } + mUniqueNewCount += countUniqueKMers(exceptionList); + + // Write out the text file of (all) unique kmers. + writeKMerTextFile(binaryKMerFile, exceptionList, textKMerFile); + + // Create map file from prior map plus the new unique kmers. + int mapSize = ((mBaseIndex >> 2) & 0x3FFFFFFF) + 1; + createMapFile(mapSize, binaryKMerFile, exceptionList, priorMapFile, mapFile); + + // Write summary statistics file. 
+ writeSummaryStatistics(statsFile); + } + + private int compactKMers(KMerPosition[] kmerArray, int kmerCount) { + if (kmerCount == 0) { + return 0; + } + log("Compacting " + kmerCount + " kmers at index " + + Integer.toHexString(mBaseIndex) + " ..."); + Arrays.sort(kmerArray, 0, kmerCount); + int newCount = 1; + KMerPosition current = kmerArray[0]; + for (int i = 1; i < kmerCount; i++) { + KMerPosition kmp = kmerArray[i]; + if (current.compareTo(kmp) == 0) { + current.setBaseIndex(NONUNIQUE_MARKER); + } else { + kmerArray[newCount++] = kmp; + current = kmp; + } + } + log("Compaction finished, new count is " + newCount); + return newCount; + } + + private int compactKMers(StringKMerPosition[] kmerArray, int kmerCount) { + if (kmerCount == 0) { + return 0; + } + log("Compacting " + kmerCount + " string kmers ..."); + Arrays.sort(kmerArray, 0, kmerCount); + int newCount = 1; + String kmerString = kmerArray[0].getKMer(); + for (int i = 1; i < kmerCount; i++) { + StringKMerPosition kmp = kmerArray[i]; + String ks = kmp.getKMer(); + if (ks.equals(kmerString)) { + kmerArray[newCount-1].setBaseIndex(NONUNIQUE_MARKER); + } else { + kmerArray[newCount++] = kmp; + kmerString = ks; + } + } + log("Compaction finished, new count is " + newCount); + return newCount; + } + + private void compactKMers(List kmerList) { + int kmerCount = kmerList.size(); + if (kmerCount <= 1) { + return; + } + StringKMerPosition[] kmerArray = + kmerList.toArray(new StringKMerPosition[kmerCount]); + kmerCount = compactKMers(kmerArray, kmerCount); + kmerList.clear(); + for (int i = 0; i < kmerCount; i++) { + kmerList.add(kmerArray[i]); + } + } + + private int removeNonUnique(KMerPosition[] kmerArray, int kmerCount) { + int uniqueCount = 0; + for (int i = 0; i < kmerCount; i++) { + KMerPosition kmp = kmerArray[i]; + if (kmp.getBaseIndex() != NONUNIQUE_MARKER) { + kmerArray[uniqueCount++] = kmp; + } + } + return uniqueCount; + } + + private int countUniqueKMers(List kmerList) { + int uniqueCount = 0; + 
for (StringKMerPosition kmp : kmerList) { + if (kmp.getBaseIndex() != NONUNIQUE_MARKER) { + uniqueCount++; + } + } + return uniqueCount; + } + + private void spillKMers(KMerPosition[] kmerArray, int kmerCount) + throws IOException { + if (mSpillFileList == null) { + mSpillFileList = new ArrayList(); + } + int fileNumber = mSpillFileList.size() + 1; + log("Spilling " + kmerCount + " kmers to file " + fileNumber + " ..."); + File spillFile = new File(mOutputDirectory, + "spill_" + mK + "_" + fileNumber + ".tmp"); + mSpillFileList.add(spillFile); + writeKMerBinaryFile(kmerArray, kmerCount, spillFile); + log("Spill file written"); + } + + private void writeKMerBinaryFile(KMerPosition[] kmerArray, + int kmerCount, + File outputFile) + throws IOException { + OutputStream outputStream = + new BufferedOutputStream(new FileOutputStream(outputFile)); + for (int i = 0; i < kmerCount; i++) { + KMerPosition kmp = kmerArray[i]; + writeKMerPosition(outputStream, kmerArray[i]); + } + outputStream.flush(); + outputStream.close(); + } + + private void writeExceptionFile(List kmerList, + File outputFile) + throws IOException { + PrintWriter writer = + new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); + for (StringKMerPosition kmer : kmerList) { + writeUniqueKMer(kmer, writer); + } + writer.flush(); + writer.close(); + } + + private KMerPosition readKMerPosition(InputStream stream) + throws IOException { + if (mUseOldFormat) { + return readKMerPositionOldFormat(stream); + } + byte[] buffer = mIOBuffer; + int encodingLength = (mK + 7)/8; + int fileLength = 4 + 2*encodingLength; + int count = readFully(stream, buffer, 0, fileLength); + if (count <= 0) { + return null; + } else if (count != fileLength) { + throw new RuntimeException("Unexpected end of file"); + } + char[] encoding = new char[encodingLength]; + int baseIndex = ((buffer[0] & 0xFF) | + (buffer[1] & 0xFF) << 8 | + (buffer[2] & 0xFF) << 16 | + (buffer[3] & 0xFF) << 24); + for (int i = 0; i < encodingLength; 
i++) { + encoding[i] = (char) ((buffer[2*i+4] & 0xFF) | + ((buffer[2*i+5] & 0xFF) << 8)); + } + return new KMerPositionN(encoding, baseIndex); + } + + private KMerPosition readKMerPositionOldFormat(InputStream stream) + throws IOException { + byte[] buffer = mIOBuffer; + int length = (mK >= 32 ? 20 : 12); + int count = readFully(stream, buffer, 0, length); + if (count <= 0) { + return null; + } else if (count != length) { + throw new RuntimeException("Unexpected end of file"); + } + long encoding = (((long)(buffer[0] & 0xFF)) | + ((long)(buffer[1] & 0xFF)) << 8 | + ((long)(buffer[2] & 0xFF)) << 16 | + ((long)(buffer[3] & 0xFF)) << 24 | + ((long)(buffer[4] & 0xFF)) << 32 | + ((long)(buffer[5] & 0xFF)) << 40 | + ((long)(buffer[6] & 0xFF)) << 48 | + ((long)(buffer[7] & 0xFF)) << 56); + int baseIndex = ((buffer[length-4] & 0xFF) | + (buffer[length-3] & 0xFF) << 8 | + (buffer[length-2] & 0xFF) << 16 | + (buffer[length-1] & 0xFF) << 24); + if (length == 12) { + return new KMerPosition1(encoding, baseIndex); + } else { + long encoding2 = (((long)(buffer[8] & 0xFF)) | + ((long)(buffer[9] & 0xFF)) << 8 | + ((long)(buffer[10] & 0xFF)) << 16 | + ((long)(buffer[11] & 0xFF)) << 24 | + ((long)(buffer[12] & 0xFF)) << 32 | + ((long)(buffer[13] & 0xFF)) << 40 | + ((long)(buffer[14] & 0xFF)) << 48 | + ((long)(buffer[15] & 0xFF)) << 56); + return new KMerPosition2(encoding, encoding2, baseIndex); + } + } + + private int readFully(InputStream stream, byte[] buffer, int offset, int count) + throws IOException { + int readCount = 0; + while (readCount < count) { + int read = stream.read(buffer, offset, count-readCount); + if (read <= 0) { + break; + } + offset += read; + readCount += read; + } + return readCount; + } + + private void writeKMerPosition(OutputStream stream, KMerPosition kmer) + throws IOException { + if (mUseOldFormat) { + writeKMerPositionOldFormat(stream, kmer); + return; + } + byte[] buffer = mIOBuffer; + int baseIndex = kmer.getBaseIndex(); + char[] encoding = 
kmer.getKMerEncoding(); + int offset = 0; + buffer[offset++] = (byte) ((baseIndex) & 0xFF); + buffer[offset++] = (byte) ((baseIndex >> 8) & 0xFF); + buffer[offset++] = (byte) ((baseIndex >> 16) & 0xFF); + buffer[offset++] = (byte) ((baseIndex >> 24) & 0xFF); + for (int i = 0; i < encoding.length; i++) { + buffer[offset++] = (byte) ((encoding[i]) & 0xFF); + buffer[offset++] = (byte) ((encoding[i] >> 8) & 0xFF); + } + stream.write(buffer, 0, offset); + } + + private void writeKMerPositionOldFormat(OutputStream stream, KMerPosition kmer) + throws IOException { + byte[] buffer = mIOBuffer; + long encoding1 = kmer.getKMerEncoding1(); + long encoding2 = kmer.getKMerEncoding2(); + int baseIndex = kmer.getBaseIndex(); + int offset = 0; + buffer[offset++] = (byte) ((encoding1) & 0xFF); + buffer[offset++] = (byte) ((encoding1 >> 8) & 0xFF); + buffer[offset++] = (byte) ((encoding1 >> 16) & 0xFF); + buffer[offset++] = (byte) ((encoding1 >> 24) & 0xFF); + buffer[offset++] = (byte) ((encoding1 >> 32) & 0xFF); + buffer[offset++] = (byte) ((encoding1 >> 40) & 0xFF); + buffer[offset++] = (byte) ((encoding1 >> 48) & 0xFF); + buffer[offset++] = (byte) ((encoding1 >> 56) & 0xFF); + if (mK >= 32) { + buffer[offset++] = (byte) ((encoding2) & 0xFF); + buffer[offset++] = (byte) ((encoding2 >> 8) & 0xFF); + buffer[offset++] = (byte) ((encoding2 >> 16) & 0xFF); + buffer[offset++] = (byte) ((encoding2 >> 24) & 0xFF); + buffer[offset++] = (byte) ((encoding2 >> 32) & 0xFF); + buffer[offset++] = (byte) ((encoding2 >> 40) & 0xFF); + buffer[offset++] = (byte) ((encoding2 >> 48) & 0xFF); + buffer[offset++] = (byte) ((encoding2 >> 56) & 0xFF); + } + buffer[offset++] = (byte) ((baseIndex) & 0xFF); + buffer[offset++] = (byte) ((baseIndex >> 8) & 0xFF); + buffer[offset++] = (byte) ((baseIndex >> 16) & 0xFF); + buffer[offset++] = (byte) ((baseIndex >> 24) & 0xFF); + stream.write(buffer, 0, offset); + } + + private long mergeSpillFiles(List spillFiles, File outputFile) + throws IOException { + + if 
(spillFiles == null) { + return 0; + } + + log("Merging spill files ..."); + OutputStream outputStream = + new BufferedOutputStream(new FileOutputStream(outputFile)); + long uniqueCount = 0; + int fileCount = spillFiles.size(); + InputStream[] inputStreams = new InputStream[fileCount]; + KMerPosition[] kmers = new KMerPosition[fileCount]; + for (int i = 0; i < fileCount; i++) { + inputStreams[i] = + new BufferedInputStream(new FileInputStream(spillFiles.get(i))); + } + while (true) { + for (int i = 0; i < fileCount; i++) { + if (kmers[i] == null && inputStreams[i] != null) { + kmers[i] = readKMerPosition(inputStreams[i]); + if (kmers[i] == null) { + inputStreams[i].close(); + inputStreams[i] = null; + } + } + } + int count = 0; + KMerPosition kmer = null; + for (int i = 0; i < fileCount; i++) { + KMerPosition kmp = kmers[i]; + if (kmp == null) { + continue; + } else if (kmer == null) { + kmer = kmp; + count = 1; + } else { + int cmp = kmp.compareTo(kmer); + if (cmp == 0) { + count++; + } else if (cmp < 0) { + kmer = kmp; + count = 1; + } + } + } + if (kmer == null) { + break; + } + for (int i = 0; i < fileCount; i++) { + if (kmers[i] != null && kmer.compareTo(kmers[i]) == 0) { + kmers[i] = null; + } + } + if (count == 1 && kmer.getBaseIndex() != NONUNIQUE_MARKER) { + uniqueCount++; + writeKMerPosition(outputStream, kmer); + } + + } + outputStream.flush(); + outputStream.close(); + for (int i = 0; i < fileCount; i++) { + // spillFiles.get(i).delete(); + } + log("Spill files merged, unique count is " + uniqueCount); + return uniqueCount; + } + + private void writeKMerTextFile(File inputFile, + List exceptionList, + File outputFile) + throws IOException { + + log("Writing kmer file " + outputFile + " ..."); + int exceptionIndex = 0; + StringKMerPosition excKMer = null; + Iterator excIter = null; + if (!exceptionList.isEmpty()) { + excIter = exceptionList.iterator(); + excKMer = excIter.next(); + } + + InputStream inputStream = + new BufferedInputStream(new 
FileInputStream(inputFile)); + PrintWriter writer = + new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); + KMerPosition kmer = readKMerPosition(inputStream); + while (kmer != null || excKMer != null) { + if (excKMer == null) { + writeUniqueKMer(kmer, writer); + kmer = readKMerPosition(inputStream); + } else if (kmer == null) { + writeUniqueKMer(excKMer, writer); + excKMer = excIter.hasNext() ? excIter.next() : null; + } else if (kmer.getKMer().compareTo(excKMer.getKMer()) < 0) { + writeUniqueKMer(kmer, writer); + kmer = readKMerPosition(inputStream); + } else { + writeUniqueKMer(excKMer, writer); + excKMer = excIter.hasNext() ? excIter.next() : null; + } + } + inputStream.close(); + writer.flush(); + writer.close(); + log("Wrote kmer file: " + outputFile); + } + + private void writeUniqueKMer(KMerPosition kmer, PrintWriter writer) { + if (kmer.getBaseIndex() != NONUNIQUE_MARKER) { + writeKMer(kmer.getKMer(), kmer.getBaseIndex(), writer); + } + } + + private void writeUniqueKMer(StringKMerPosition kmer, PrintWriter writer) { + if (kmer.getBaseIndex() != NONUNIQUE_MARKER) { + writeKMer(kmer.getKMer(), kmer.getBaseIndex(), writer); + } + } + + private void writeKMer(String kmer, int baseIndex, PrintWriter writer) { + String chr = getBaseIndexSequenceName(baseIndex); + int pos = getBaseIndexCoordinate(baseIndex); + writer.println(kmer + "\t" + chr + "\t" + pos); + } + + private void createMapFile(int mapSize, + File kmerFile, + List exceptionList, + File priorMapFile, + File mapFile) + throws IOException { + byte[] map = null; + long uniquePriorCount = 0; + if (priorMapFile.exists()) { + map = readMapFile(priorMapFile); + if (map.length != mapSize) { + throw new RuntimeException("Prior map is wrong size"); + } + // Clear the new bits from prior map. + // Also count the prior unique positions while we are at it. + // Note that this is a count of positions, not kmers. 
+ for (int i = 0; i < mapSize; i++) { + int cumBits = map[i] & 0x55; + uniquePriorCount += Integer.bitCount(cumBits); + map[i] = (byte) cumBits; + } + } else { + map = new byte[mapSize]; + } + for (StringKMerPosition kmp : exceptionList) { + addToMap(kmp, map); + } + mPriorMapUniqueCount = uniquePriorCount; + + InputStream inputStream = + new BufferedInputStream(new FileInputStream(kmerFile)); + while (true) { + KMerPosition kmp = readKMerPosition(inputStream); + if (kmp == null) { + inputStream.close(); + break; + } + addToMap(kmp, map); + } + + long testCum = 0; + for (int i = 0; i < map.length; i++) { + testCum += Integer.bitCount(map[i] & 0x55); + } + + writeMapFile(map, mapFile); + } + + private void addToMap(KMerPosition kmp, byte[] map) { + int baseIndex = kmp.getBaseIndex(); + if (baseIndex != NONUNIQUE_MARKER) { + addToMap(baseIndex, map); + } + } + + private void addToMap(StringKMerPosition kmp, byte[] map) { + int baseIndex = kmp.getBaseIndex(); + if (baseIndex != NONUNIQUE_MARKER) { + addToMap(baseIndex, map); + } + } + + private void addToMap(int baseIndex, byte[] map) { + int mod = baseIndex & 0x3; + int offset = (baseIndex >> 2) & 0x3FFFFFFF; + if (((map[offset] >> (2*mod)) & 0x3) != 0) { + throw new RuntimeException("Map entry already set: " + baseIndex); + } + map[offset] |= (0x3 << (2*mod)); + } + + private void writeSummaryStatistics(File outputFile) + throws IOException { + PrintWriter writer = + new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); + long baseCount = (mBaseIndex + 1) & 0xFFFFFFFFL; + long uniqueCount = mUniquePriorCount + mUniqueNewCount; + long nonUniqueCount = mKMerCount - uniqueCount; + writer.println("K: " + mK); + writer.println("Sequences: " + mSequenceList.size()); + writer.println("Bases: " + baseCount); + writer.println("KMers: " + mKMerCount); + writer.println("Prior map count: " + mPriorMapUniqueCount); + writer.println("Unique prior: " + mUniquePriorCount + + " (" + formatPercent(mUniquePriorCount, 
mKMerCount) + ")"); + writer.println("Unique new: " + mUniqueNewCount + + " (" + formatPercent(mUniqueNewCount, mKMerCount) + ")"); + writer.println("Unique cumulative: " + uniqueCount + + " (" + formatPercent(uniqueCount, mKMerCount) + ")"); + writer.println("Nonunique: " + nonUniqueCount + + " (" + formatPercent(nonUniqueCount, mKMerCount) + ")"); + writer.flush(); + writer.close(); + } + + private String formatPercent(long numerator, long denominator) { + double fraction = 0.0; + if (denominator != 0) { + fraction = numerator / (double) denominator; + } + return String.format("%1.1f%%", fraction * 100.0); + } + + private void openPriorMap(File mapFile) + throws IOException { + if (mapFile.exists()) { + mPriorMapStream = new BufferedInputStream(new FileInputStream(mapFile)); + mPriorMapPosition = -1; + mPriorMapValue = 0; + } + } + + private void closePriorMap() + throws IOException { + if (mPriorMapStream != null) { + mPriorMapStream.close(); + } + mPriorMapStream = null; + mPriorMapPosition = -1; + mPriorMapValue = 0; + } + + private byte[] readMapFile(File file) + throws IOException { + long fileLength = file.length(); + if (fileLength > 1000000000) { + throw new RuntimeException("Prior map too large: " + file); + } + int length = (int) fileLength; + byte[] map = new byte[length]; + FileInputStream stream = new FileInputStream(file); + int count = readFully(stream, map, 0, length); + if (count != length) { + throw new RuntimeException("Failed to read map: " + file); + } + stream.close(); + return map; + } + + private void writeMapFile(byte[] map, File file) + throws IOException { + FileOutputStream stream = new FileOutputStream(file); + stream.write(map); + stream.flush(); + stream.close(); + } + + private boolean isUniqueInPriorMap(int baseIndex) + throws IOException { + if (mPriorMapStream == null) { + return false; + } + int byteOffset = (baseIndex >> 2) & 0x3FFFFFFF; + if (byteOffset != mPriorMapPosition) { + int delta = byteOffset - mPriorMapPosition; + 
if (delta < 0) { + throw new RuntimeException("Attempt to seek backwards in prior map"); + } + if (delta > 1) { + skipFully(mPriorMapStream, delta-1); + } + mPriorMapValue = mPriorMapStream.read(); + if (mPriorMapValue < 0) { + throw new RuntimeException("Unexpected end of file in prior map"); + } + mPriorMapPosition += delta; + } + int mod = baseIndex & 0x3; + return (((mPriorMapValue >> (2*mod)) & 1) != 0); + } + + private void skipFully(InputStream stream, long amount) + throws IOException { + while (amount > 0) { + long skip = stream.skip(amount); + if (skip <= 0 || skip > amount) { + throw new RuntimeException("Skip failed"); + } + amount -= skip; + } + } + + private String getBaseIndexSequenceName(int baseIndex) { + int sequenceCount = mSequenceList.size(); + for (int i = 0; i < sequenceCount-1; i++) { + int nextOffset = mSequenceOffsetList.get(i+1); + if (compareBaseIndex(nextOffset, baseIndex) > 0) { + return mSequenceList.get(i); + } + } + return mSequenceList.get(sequenceCount-1); + } + + private int getBaseIndexCoordinate(int baseIndex) { + Integer sequenceOffset = null; + for (Integer offset : mSequenceOffsetList) { + if (compareBaseIndex(offset, baseIndex) > 0) { + break; + } + sequenceOffset = offset; + } + if (sequenceOffset == null) { + return 0; + } + int coordinate = baseIndex - sequenceOffset + 1; + if (coordinate <= 0) { + dumpSequenceList(); + System.out.println("coordinate: " + coordinate); + System.out.println("sequenceOffset: " + Integer.toHexString(sequenceOffset)); + System.out.println("baseIndex: " + Integer.toHexString(baseIndex)); + throw new RuntimeException("Internal error: illegal coordinate " + + coordinate + " for base index " + baseIndex); + } + return coordinate; + } + + private void dumpSequenceList() { + System.out.println("# Sequences:"); + int count = mSequenceList.size(); + for (int i = 0; i < count; i++) { + String seqName = mSequenceList.get(i); + int offset = mSequenceOffsetList.get(i); + System.out.println("# " + seqName 
+ + "\t" + offset + + "\t" + Integer.toHexString(offset)); + } + } + + private int compareBaseIndex(int baseIndex1, int baseIndex2) { + // Implements unsigned comparison, a la compareTo + if (baseIndex1 < 0 ^ baseIndex2 < 0) { + return ((baseIndex1 < 0) ? 1 : -1); + } else { + return (baseIndex1 - baseIndex2); + } + } + + private String getNextSequence() + throws IOException { + + while (mNextSequence == null) { + if (mCurrentReader == null) { + mCurrentReader = getNextReader(); + if (mCurrentReader == null) { + return null; + } + } + String line = mCurrentReader.readLine(); + if (line == null) { + mCurrentReader.close(); + mCurrentReader = null; + continue; + } + if (line.startsWith(">")) { + String[] tokens = line.substring(1).trim().split("\\s+"); + mNextSequence = tokens[0]; + } + } + String result = mNextSequence; + mNextSequence = null; + return result; + } + + private LineNumberReader getNextReader() + throws IOException { + if (mInputFileIndex >= mInputFiles.size()) { + return null; + } + File file = mInputFiles.get(mInputFileIndex++); + return new LineNumberReader(new FileReader(file)); + } + + private char[] getNextKMer() + throws IOException { + + if (mKMerBuffer == null) { + mKMerBuffer = new char[mK]; + } + System.arraycopy(mKMerBuffer, 1, mKMerBuffer, 0, mKMerBuffer.length - 1); + if (mKMerBufferedCount > 0) { + mKMerBufferedCount--; + } + + while (mKMerBufferedCount < mK) { + char base = getNextBase(); + if (base == 0) { + incrementBaseIndex(mKMerBufferedCount); + mKMerBufferedCount = 0; + return null; + } else if (base == 'N') { + incrementBaseIndex(mKMerBufferedCount+1); + mKMerBufferedCount = 0; + } else { + mKMerBuffer[mKMerBufferedCount++] = base; + } + } + incrementBaseIndex(1); + return mKMerBuffer; + } + + private char getNextBase() + throws IOException { + + if (mLineBuffer == null || mLineBufferIndex >= mLineBuffer.length()) { + if (mCurrentReader == null) { + return 0; + } + String line = mCurrentReader.readLine(); + if (line == null) { + 
mLineBuffer = null;
                mLineBufferIndex = 0;
                mCurrentReader.close();
                mCurrentReader = null;
                return 0;
            }
            if (line.startsWith(">")) {
                // Start of a new fasta record: remember its name for
                // getNextSequence() and signal end-of-sequence (0) to the caller.
                String[] tokens = line.substring(1).trim().split("\\s+");
                mNextSequence = tokens[0];
                mLineBuffer = null;
                mLineBufferIndex = 0;
                return 0;
            }
            // Sequence data line: buffer it upper-cased for char-at-a-time reads.
            mLineBuffer = line.toUpperCase();
            mLineBufferIndex = 0;
        }
        return mLineBuffer.charAt(mLineBufferIndex++);
    }

    // Advances the running genome-wide base index by the given amount.
    // mBaseIndex starts at -1 and appears to be used as an unsigned 32-bit
    // counter that wraps through negative values; the check below rejects a
    // wrap past 2^32-1.  NOTE(review): this relies on the unsigned
    // interpretation used by compareBaseIndex() -- confirm before changing.
    private void incrementBaseIndex(int amount) {
        if (mBaseIndex < -1 && (mBaseIndex + amount) >= -1) {
            throw new RuntimeException("Base index: 32-bit overflow");
        }
        mBaseIndex += amount;
    }

    // Timestamped progress logging, enabled by -verbose.
    private void log(String text) {
        if (mVerbose) {
            System.out.println("# " + new Date() + " " + text);
        }
    }

    // Unconditional debug print helper.
    private static void dbg(String text) {
        System.out.println("#DBG: " + text);
    }

    // Packs a kmer into a KMerPosition, canonicalizing strand: whichever of
    // the kmer and its reverse complement packs smaller is stored, with a
    // reversed flag when the reverse strand was chosen.
    // Returns null if the kmer contains a non-ACGT base.
    private static KMerPosition encodeKMer(char[] kmerChars, int baseIndex) {
        if (mUseOldFormat) {
            return encodeKMerOldFormat(kmerChars, baseIndex);
        }
        char[] encoding = encodeKMerChars(kmerChars);
        if (encoding == null) {
            return null;
        }
        char[] reverseEncoding = encodeKMerChars(reverseComplement(kmerChars));
        if (compareEncodings(encoding, reverseEncoding) <= 0) {
            return new KMerPositionN(encoding, baseIndex);
        } else {
            KMerPositionN kmp = new KMerPositionN(reverseEncoding, baseIndex);
            kmp.setIsReversed(true);
            return kmp;
        }
    }

    // Packs a kmer 2 bits per base into a char[] (8 bases per 16-bit char).
    // The first char holds the "odd" leading bases (kmerLength % 8) so that
    // every subsequent char is full.  Returns null on any non-ACGT base.
    private static char[] encodeKMerChars(char[] kmerChars) {
        if (kmerChars == null) {
            return null;
        }

        int kmerLength = kmerChars.length;
        int encodingLength = (kmerLength + 7) / 8;
        char[] encoding = new char[encodingLength];
        int offset = kmerLength % 8;
        offset = (offset == 0) ?
8 : offset; + int bits = encodeKMerBits(kmerChars, 0, offset); + if (bits < 0) { + return null; + } + encoding[0] = (char) bits; + for (int i = 1; i < encodingLength; i++) { + bits = encodeKMerBits(kmerChars, offset, 8); + if (bits < 0) { + return null; + } + encoding[i] = (char) bits; + offset += 8; + } + return encoding; + } + + private static int compareEncodings(char[] encoding1, char[] encoding2) { + int length = Math.max(encoding1.length, encoding2.length); + for (int i = 0; i < length; i++) { + int result = encoding1[i] - encoding2[i]; + if (result != 0) { + return result; + } + } + return 0; + } + + private static KMerPosition encodeKMerOldFormat(char[] kmerChars, int baseIndex) { + if (kmerChars == null) { + return null; + } + int length = kmerChars.length; + if (length <= 31) { + long bits = encodeKMerBitsLong(kmerChars, 0, length); + if (bits == -1) { + return null; + } + return new KMerPosition1(bits, baseIndex); + } else if (length <= 62) { + long bits1 = encodeKMerBitsLong(kmerChars, 0, 31); + long bits2 = encodeKMerBitsLong(kmerChars, 31, length - 31); + if (bits1 == -1 || bits2 == -1) { + return null; + } + return new KMerPosition2(bits1, bits2, baseIndex); + } else { + return null; + } + } + + private static int encodeKMerBits(char[] kmerChars, int offset, int length) { + int bits = 0; + for (int i = 0; i < length; i++) { + char base = kmerChars[offset + i]; + int baseBits = "ACGT".indexOf(base); + if (baseBits < 0) { + return -1; + } + bits |= baseBits << (2*(length-i-1)); + } + return bits; + } + + private static long encodeKMerBitsLong(char[] kmerChars, int offset, int length) { + long bits = 0; + for (int i = 0; i < length; i++) { + char base = kmerChars[offset + i]; + int baseBits = "ACGT".indexOf(base); + if (baseBits < 0) { + return -1; + } + bits |= ((long)baseBits) << (2*(length-i-1)); + } + return bits; + } + + private static String decodeKMer1(long bits) { + int length = mK; + char[] buffer = new char[length]; + decodeKMerBits(bits, 
buffer, 0, length); + return new String(buffer); + } + + private static String decodeKMer2(long bits1, long bits2) { + int length = mK; + char[] buffer = new char[length]; + decodeKMerBits(bits1, buffer, 0, 31); + decodeKMerBits(bits2, buffer, 31, length-31); + return new String(buffer); + } + + private static String decodeKMerN(char[] encoding, boolean reverse) { + int length = mK; + char[] buffer = new char[length]; + int offset = length % 8; + offset = (offset == 0) ? 8 : offset; + decodeKMerBits(encoding[0], buffer, 0, offset); + for (int i = 1; i < encoding.length; i++) { + decodeKMerBits(encoding[i], buffer, offset, 8); + offset += 8; + } + if (reverse) { + reverseComplementInPlace(buffer); + } + return new String(buffer); + } + + private static void decodeKMerBits(char bits, char[] buffer, int offset, int length) { + for (int i = 0; i < length; i++) { + int baseBits = (int) ((bits >> (2*(length-i-1))) & 0x3); + buffer[offset + i] = "ACGT".charAt(baseBits); + } + } + + private static void decodeKMerBits(long bits, char[] buffer, int offset, int length) { + for (int i = 0; i < length; i++) { + int baseBits = (int) ((bits >> (2*(length-i-1))) & 0x3); + buffer[offset + i] = "ACGT".charAt(baseBits); + } + } + + private static char[] reverseComplement(char[] buffer) { + int length = buffer.length; + char[] result = new char[length]; + System.arraycopy(buffer, 0, result, 0, length); + reverseComplementInPlace(result); + return result; + } + + private static void reverseComplementInPlace(char[] buffer) { + int length = buffer.length; + int limit = (length + 1)/2; + for (int i = 0; i < limit; i++) { + char ch1 = reverseComplement(buffer[i]); + char ch2 = reverseComplement(buffer[length-i-1]); + buffer[i] = ch2; + buffer[length-i-1] = ch1; + } + } + + private static char reverseComplement(char base) { + switch (base) { + case 'A': + return 'T'; + case 'C': + return 'G'; + case 'G': + return 'C'; + case 'T': + return 'A'; + } + return base; + } + + private static 
String formatEncoding(char[] encoding) { + if (encoding == null) { + return null; + } + StringBuilder builder = new StringBuilder(); + builder.append('['); + for (int i = 0; i < encoding.length; i++) { + String hex = Integer.toHexString(encoding[i]); + int length = hex.length(); + while (length < 4) { + builder.append('0'); + length++; + } + builder.append(hex); + } + builder.append(']'); + return builder.toString(); + } + + static class KMerPosition + implements Comparable { + + private int mBaseIndex; + + KMerPosition(int baseIndex) { + mBaseIndex = baseIndex; + } + + public String getKMer() { + return null; + } + + public long getKMerEncoding1() { + return -1; + } + + public long getKMerEncoding2() { + return -1; + } + + public final int getBaseIndex() { + return mBaseIndex; + } + + public final void setBaseIndex(int baseIndex) { + mBaseIndex = baseIndex; + } + + public char[] getKMerEncoding() { + return null; + } + + public int compareTo(KMerPosition kmp) { + return compareEncodings(getKMerEncoding(), kmp.getKMerEncoding()); + } + + public boolean equals(Object object) { + if (!(object instanceof KMerPosition)) { + return false; + } + KMerPosition kmp = (KMerPosition) object; + return (getBaseIndex() == kmp.getBaseIndex() && + this.compareTo(kmp) == 0); + } + + public String format() { + return(getKMer() + + " " + formatEncoding(getKMerEncoding()) + + " " + Integer.toHexString(mBaseIndex)); + } + } + + static class KMerPosition1 + extends KMerPosition { + + private long mKMerEncoding1; + + KMerPosition1(long kmer, int baseIndex) { + super(baseIndex); + mKMerEncoding1 = kmer; + } + + public String getKMer() { + return decodeKMer1(getKMerEncoding1()); + } + + public final long getKMerEncoding1() { + return mKMerEncoding1; + } + + public int compareTo(KMerPosition kmp) { + int result = Long.signum(getKMerEncoding1() - kmp.getKMerEncoding1()); + if (result == 0) { + result = Long.signum(getKMerEncoding2() - kmp.getKMerEncoding2()); + } + return result; + } + } + + 
static class KMerPosition2 + extends KMerPosition1 { + + private long mKMerEncoding2; + + KMerPosition2(long encoding1, long encoding2, int baseIndex) { + super(encoding1, baseIndex); + mKMerEncoding2 = encoding2; + } + + public String getKMer() { + return decodeKMer2(getKMerEncoding1(), getKMerEncoding2()); + } + + public final long getKMerEncoding2() { + return mKMerEncoding2; + } + } + + static class KMerPositionN + extends KMerPosition { + + private boolean mReversed; + private char[] mKMerEncoding; + + KMerPositionN(char[] encoding, int baseIndex) { + super(baseIndex); + mReversed = false; + mKMerEncoding = encoding; + } + + public boolean getIsReversed() { + return mReversed; + } + + public void setIsReversed(boolean value) { + mReversed = value; + } + + public String getKMer() { + return decodeKMerN(mKMerEncoding, mReversed); + } + + public final char[] getKMerEncoding() { + return mKMerEncoding; + } + + public String format() { + return(getKMer() + + " " + formatEncoding(getKMerEncoding()) + + " " + (mReversed ? 
'R' : 'F') + + " " + Integer.toHexString(getBaseIndex())); + } + } + + static class StringKMerPosition + implements Comparable { + + private String mKMerString = null; + private int mBaseIndex; + + StringKMerPosition(String kmer, int baseIndex) { + mKMerString = kmer; + mBaseIndex = baseIndex; + } + + public final String getKMer() { + return mKMerString; + } + + public final int getBaseIndex() { + return mBaseIndex; + } + + public final void setBaseIndex(int baseIndex) { + mBaseIndex = baseIndex; + } + + public int compareTo(StringKMerPosition kmp) { + return mKMerString.compareTo(kmp.mKMerString); + } + + public boolean equals(Object object) { + if (!(object instanceof StringKMerPosition)) { + return false; + } + StringKMerPosition kmp = (StringKMerPosition) object; + return (mBaseIndex == kmp.mBaseIndex && + mKMerString.equals(kmp.mKMerString)); + } + } +} diff --git a/lib/edu/mit/broad/cnv/GatherAlignments.java b/lib/edu/mit/broad/cnv/GatherAlignments.java new file mode 100644 index 0000000000..b0dc2d5afd --- /dev/null +++ b/lib/edu/mit/broad/cnv/GatherAlignments.java @@ -0,0 +1,399 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ +package edu.mit.broad.cnv; + +import edu.mit.broad.arachne.Alignment; +import edu.mit.broad.arachne.LookAlignReader; + +import java.io.*; +import java.util.*; + +/** + * Utility program to gather CNV alignments from LookAlign files in an I/O efficient manner. 
+ */ +public class GatherAlignments { + + public static void main(String[] args) + throws Exception { + new GatherAlignments().run(args); + } + + private void usage() { + System.out.println("Usage: GatherAlignments ..."); + System.out.println(" -cnpList "); + System.out.println(" -sampleId "); + System.out.println(" -inputFileList "); + System.out.println(" -outputDirectory "); + System.out.println(" -padding "); + System.out.println(" -bestAlignments"); + System.out.println(" -verbose"); + System.out.println(" -debug"); + } + + private boolean parseArguments(String[] args) { + + int argpos = 0; + int argsleft = 0; + + while (argpos < args.length) { + argsleft = args.length - argpos; + String arg = args[argpos]; + if (arg.equals("-cnpList") && argsleft > 1) { + argpos++; + mCnpListPath = args[argpos++]; + } else if (arg.equals("-sampleId") && argsleft > 1) { + argpos++; + mSampleId = args[argpos++]; + } else if (arg.equals("-inputFileList") && argsleft > 1) { + argpos++; + mInputFileListPath = args[argpos++]; + } else if (arg.equals("-outputDirectory") && argsleft > 1) { + argpos++; + mOutputDirectory = args[argpos++]; + } else if (arg.equals("-padding") && argsleft > 1) { + argpos++; + mCnpRegionPadding = Integer.parseInt(args[argpos++]); + } else if (arg.equals("-bestAlignments")) { + argpos++; + mReturnBestHits = true; + } else if (arg.equals("-verbose")) { + argpos++; + mVerbose = true; + } else if (arg.equals("-debug")) { + argpos++; + mDebug = true; + } else if (arg.startsWith("-")) { + usage(); + return false; + } else { + break; + } + } + + argsleft = args.length - argpos; + if (argsleft != 0) { + usage(); + return false; + } + + return true; + } + + private void run(String[] args) + throws Exception { + + if (!parseArguments(args)) { + System.exit(1); + } + + List mInputFileList = parseInputFiles(mInputFileListPath); + Map> mCnpMap = parseCnpFile(mCnpListPath); + for (File inputFile : mInputFileList) { + scanInputFile(inputFile, mCnpMap); + } + } + + 
private List parseInputFiles(String path) + throws IOException { + List fileList = new ArrayList(); + LineNumberReader reader = new LineNumberReader(new FileReader(path)); + while (true) { + String line = reader.readLine(); + if (line == null) { + reader.close(); + break; + } + line = line.trim(); + if (line.length() == 0 || line.startsWith("#")) { + continue; + } + String[] fields = line.split("\\s+"); + fileList.add(new File(fields[0])); + } + return fileList; + } + + private Map> parseCnpFile(String path) + throws IOException { + Map> cnpMap = new HashMap>(); + LineNumberReader reader = new LineNumberReader(new FileReader(path)); + while (true) { + String line = reader.readLine(); + if (line == null) { + reader.close(); + break; + } + line = line.trim(); + if (line.length() == 0 || line.startsWith("#")) { + continue; + } + String[] fields = line.split("\\s+"); + if (fields.length != 4) { + throw new RuntimeException("Invalid CNP line: " + line); + } + if (fields[0].equalsIgnoreCase("CNPID")) { + continue; + } + String cnpId = fields[0]; + String chromosome = fields[1]; + int start = Integer.parseInt(fields[2].replaceAll(",", "")); + int end = Integer.parseInt(fields[3].replaceAll(",", "")); + int sequenceId = chromosomeToSequenceId(chromosome); + if (sequenceId < 0) { + throw new RuntimeException("Unrecognized chromosome: " + chromosome); + } + if (mCnpRegionPadding > 0) { + start = Math.max(1, start - mCnpRegionPadding); + end = end + mCnpRegionPadding; + } + CnpRegion cnp = new CnpRegion(cnpId, sequenceId, start, end); + List cnpList = cnpMap.get(sequenceId); + if (cnpList == null) { + cnpList = new ArrayList(); + cnpMap.put(sequenceId, cnpList); + } + cnpList.add(cnp); + } + return cnpMap; + } + + private int chromosomeToSequenceId(String text) { + if (text == null || text.length() == 0) { + return -1; + } + if (text.matches("\\d+")) { + return Integer.parseInt(text); + } + if (text.startsWith("chr") && text.length() > 3) { + text = text.substring(3); + } + 
if (text.matches("\\d+") && !text.startsWith("0")) { + return Integer.parseInt(text); + } + if (text.equals("M")) { + return 0; + } else if (text.equals("X")) { + return 23; + } else if (text.equals("Y")) { + return 24; + } else { + return -1; + } + } + + private void scanInputFile(File inputFile, + Map> cnpMap) + throws IOException { + LookAlignReader reader = new LookAlignReader(inputFile); + while (true) { + Alignment alignment = getNextAlignment(reader); + if (alignment == null) { + reader.close(); + break; + } + List cnpList = cnpMap.get(alignment.getBSequenceId()); + if (cnpList == null) { + continue; + } + for (CnpRegion cnp : cnpList) { + if (overlaps(cnp, alignment)) { + saveCnpAlignment(cnp, alignment, inputFile); + } + } + } + flushCnpAlignments(inputFile); + } + + private Alignment getNextAlignment(LookAlignReader reader) + throws IOException { + if (!mReturnBestHits) { + if (reader.hasNext()) { + return reader.next(); + } else { + return null; + } + } + while (true) { + Alignment seed = mPendingAlignment; + mPendingAlignment = null; + if (seed == null && reader.hasNext()) { + seed = reader.next(); + } + if (seed == null) { + return null; + } + List secondaryHits = null; + while (reader.hasNext()) { + Alignment alignment = reader.next(); + if (alignment.getASequenceId() != seed.getASequenceId()) { + if (alignment.getASequenceId() < seed.getASequenceId()) { + throw new RuntimeException("Alignments not sorted by A sequence: " + alignment.format()); + } + mPendingAlignment = alignment; + break; + } + if (secondaryHits == null) { + secondaryHits = new ArrayList(); + } + secondaryHits.add(alignment); + } + if (secondaryHits == null) { + return seed; + } + secondaryHits.add(seed); + Alignment result = getUniqueBestAlignment(secondaryHits); + if (result != null) { + return result; + } + } + } + + private Alignment getUniqueBestAlignment(List alignments) { + int bestMismatches = 0; + List best = new ArrayList(); + for (Alignment a : alignments) { + int 
mismatches = getAlignmentMismatches(a); + if (best.isEmpty()) { + best.add(a); + bestMismatches = mismatches; + } + if (mismatches == bestMismatches) { + best.add(a); + } else if (mismatches < bestMismatches) { + best.clear(); + best.add(a); + bestMismatches = mismatches; + } + } + if (best.size() != 1) { + return null; + } + return best.get(0); + } + + private int getAlignmentMismatches(Alignment alignment) { + int mismatches = 0; + int[] blocks = alignment.getAlignmentBlocks(); + for (int i = 0; i < blocks.length; i += 3) { + int gap = blocks[i]; + int duration = blocks[i+1]; + int mm = blocks[i+2]; + if (mm > duration) { + throw new RuntimeException("Invalid alignment? : " + alignment.format()); + } + mismatches += Math.abs(gap); + mismatches += mm; + } + return mismatches; + } + + private boolean overlaps(CnpRegion cnp, Alignment alignment) { + return (cnp.getSequenceId() == alignment.getBSequenceId() && + cnp.getStart() <= alignment.getBEnd() && + cnp.getEnd() >= alignment.getBStart()); + } + + private void saveCnpAlignment(CnpRegion cnp, Alignment alignment, File inputFile) + throws IOException { + if (mCnpAlignmentCount > mCnpAlignmentLimit) { + flushCnpAlignments(inputFile); + } + String cnpId = cnp.getCnpId(); + List alignmentList = mCnpAlignmentMap.get(cnpId); + if (alignmentList == null) { + alignmentList = new ArrayList(); + mCnpAlignmentMap.put(cnpId, alignmentList); + } + alignmentList.add(alignment); + mCnpAlignmentCount++; + } + + private void flushCnpAlignments(File inputFile) + throws IOException { + while (!mCnpAlignmentMap.isEmpty()) { + String cnpId = mCnpAlignmentMap.keySet().iterator().next(); + List alignmentList = mCnpAlignmentMap.get(cnpId); + writeAlignments(cnpId, mSampleId, alignmentList, inputFile); + mCnpAlignmentMap.remove(cnpId); + mCnpAlignmentCount -= alignmentList.size(); + } + if (mCnpAlignmentCount != 0) { + throw new RuntimeException("Unsynchronized alignment count"); + } + } + + private void writeAlignments(String cnpId, 
String sampleId, List alignmentList, File inputFile) + throws IOException { + File outputDir = new File("."); + if (mOutputDirectory != null) { + outputDir = new File(mOutputDirectory); + } + String cnpSample = cnpId; + if (sampleId != null) { + cnpSample = cnpSample + "_" + sampleId; + } + File cnpSampleDir = new File(outputDir, cnpSample); + if (!cnpSampleDir.exists()) { + if (!cnpSampleDir.mkdir()) { + throw new RuntimeException("Failed to create directory " + cnpSampleDir); + } + } + String fileName = inputFile.getName(); + File alignmentFile = new File(cnpSampleDir, fileName); + PrintWriter writer = new PrintWriter(new FileWriter(alignmentFile, true)); + for (Alignment alignment : alignmentList) { + writer.println(alignment.arachneFormat()); + } + writer.flush(); + writer.close(); + } + + private GatherAlignments() { + } + + private static class CnpRegion { + + private CnpRegion(String cnpId, int sequenceId, int start, int end) { + mCnpId = cnpId; + mSequenceId = sequenceId; + mStart = start; + mEnd = end; + } + + public String getCnpId() { return mCnpId; }; + public int getSequenceId() { return mSequenceId; }; + public int getStart() { return mStart; }; + public int getEnd() { return mEnd; }; + + private String mCnpId; + private int mSequenceId; + private int mStart; + private int mEnd; + } + + private boolean mDebug = false; + private boolean mVerbose = false; + + private boolean mReturnBestHits = false; + private String mCnpListPath = null; + private String mSampleId = null; + private String mInputFileListPath = null; + private String mOutputDirectory = null; + private int mCnpRegionPadding = 0; + + private Alignment mPendingAlignment = null; + private int mCnpAlignmentCount = 0; + private int mCnpAlignmentLimit = 1000000; + private Map> mCnpAlignmentMap = new LinkedHashMap>(); +} + + + diff --git a/lib/edu/mit/broad/cnv/kmer/CountKMers.java b/lib/edu/mit/broad/cnv/kmer/CountKMers.java new file mode 100644 index 0000000000..23b9d6af4b --- /dev/null +++ 
b/lib/edu/mit/broad/cnv/kmer/CountKMers.java @@ -0,0 +1,1494 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ +package edu.mit.broad.cnv.kmer; + + +import edu.mit.broad.cnv.util.SequenceIterator; + +import java.io.*; +import java.util.*; + + +/** + * Tool for counting unique kmers. + */ +public class CountKMers +{ + private static final int NONUNIQUE_MARKER = -1; + + private String mAction = null; + private static int mK = 0; + private int mMinimumK = 0; + private int mMaximumK = 0; + private int mBatchSize = 0; + private List mInputFiles = null; + private File mSearchFile = null; + private String mSequenceName = null; + private File mInputDirectory = null; + private File mOutputDirectory = null; + private boolean mRunDistributed = false; + private int mDistributedWorkerCount = 0; + private boolean mVerbose = false; + private boolean mDebug = false; + + private List mSequenceList = null; + private List mSequenceOffsetList = null; + private List mSpillFileList = null; + private double mSpillFactor = 0.9; + + private long mKMerCount = 0; + private long mUniquePriorCount = 0; + private long mUniqueNewCount = 0; + private long mPriorMapUniqueCount = 0; + + private InputStream mPriorMapStream = null; + private int mPriorMapPosition = -1; + private int mPriorMapValue = 0; + private int mInputFileIndex = 0; + private LineNumberReader mCurrentReader = null; + private String mNextSequence = null; + private char[] mKMerBuffer = null; + private int mKMerBufferedCount = 0; + private String mLineBuffer = null; + private int mLineBufferIndex = 0; + private int mBaseIndex = -1; + private byte[] mIOBuffer = null; + + /* 
Design + Inputs: + - One or more fasta files to search (currently one). + - Output directory for the result files. + - Optionally an input k-1-mer file (output from previous pass). + Outputs: + - Unique kmer file: (sorted by kmer) + This is unique globally or unique wrt unique (K-1) mers (i.e. K unique, K-1 not). + - Per chromosome bit map: pos (implicit) new-bit cum-bit + New-bit is 1 if Kmer starting at pos is unique but (K-1)-mer is not. + Cum-bit is 1 if Kmer starting at pos is unique for some L <= K. + - Statistics + Plan: + - Reducing memory footprint is crucial. + - Sequential pass over the input sequences to generate kmers. + - BatchSize kmers are cached in memory, then sorted and uniqified. + - As batch array fills, batches are spilled to disk. + - Batches are reloaded from disk and merged (N-finger algorithm) + - and streamed to a merge file. + - Merge file is read from disk and processed as final results. + */ + + public static void main(String[] args) + throws Exception { + new CountKMers().run(args); + } + + private void usage() { + System.out.println("Usage: CountKMers ..."); + System.out.println(" -action "); + System.out.println(" -genome "); + System.out.println(" -chromosome "); + System.out.println(" -k "); + System.out.println(" -minK "); + System.out.println(" -maxK "); + System.out.println(" -batchSize "); + System.out.println(" -inputDir "); + System.out.println(" -outputDir "); + System.out.println(" -distributed"); + System.out.println(" -workers "); + System.out.println(" -verbose"); + System.out.println(" -debug"); + } + + private boolean parseArguments(String[] args) { + + int argpos = 0; + int argsleft = 0; + + while (argpos < args.length) { + argsleft = args.length - argpos; + String arg = args[argpos]; + if (arg.equals("-action") && argsleft > 1) { + argpos++; + mAction = args[argpos++]; + } else if (arg.equals("-genome") && argsleft > 1) { + argpos++; + if (mInputFiles == null) { + mInputFiles = new ArrayList(); + } + 
mInputFiles.add(new File(args[argpos++])); + } else if (arg.equals("-chromosome") && argsleft > 1) { + argpos++; + mSequenceName = args[argpos++]; + } else if (arg.equals("-k") && argsleft > 1) { + argpos++; + mK = Integer.parseInt(args[argpos++]); + } else if (arg.equals("-minK") && argsleft > 1) { + argpos++; + mMinimumK = Integer.parseInt(args[argpos++]); + } else if (arg.equals("-maxK") && argsleft > 1) { + argpos++; + mMaximumK = Integer.parseInt(args[argpos++]); + } else if (arg.equals("-batchSize") && argsleft > 1) { + argpos++; + mBatchSize = Integer.parseInt(args[argpos++]); + } else if (arg.equals("-inputDir") && argsleft > 1) { + argpos++; + mInputDirectory = new File(args[argpos++]); + } else if (arg.equals("-outputDir") && argsleft > 1) { + argpos++; + mOutputDirectory = new File(args[argpos++]); + } else if (arg.equals("-searchFile") && argsleft > 1) { + argpos++; + mSearchFile = new File(args[argpos++]); + } else if (arg.equals("-distributed")) { + argpos++; + mRunDistributed = true; + } else if (arg.equals("-workers") && argsleft > 1) { + argpos++; + mDistributedWorkerCount = Integer.parseInt(args[argpos++]); + } else if (arg.equals("-verbose")) { + argpos++; + mVerbose = true; + } else if (arg.equals("-debug")) { + argpos++; + mDebug = true; + } else if (arg.startsWith("-")) { + usage(); + return false; + } else { + break; + } + } + + argsleft = args.length - argpos; + if (argsleft != 0) { + usage(); + return false; + } + + return true; + } + + private void run(String[] args) + throws Exception { + if (!parseArguments(args)) { + System.exit(1); + } + if (mAction == null || mAction.equals("mapKMers")) { + if (mRunDistributed) { + mapKMersDistributed(); + } else { + mapKMers(); + } + } else if (mAction.equals("mapGaps")) { + mapGaps(); + } else if (mAction.equals("rollUp")) { + rollUp(); + } else if (mAction.equals("search")) { + search(); + } + } + + private void search() + throws IOException { + char[][] searchStrings = loadSearchFile(mSearchFile); 
+ while (true) { + String seqName = getNextSequence(); + if (seqName == null) { + break; + } + int position = 0; + log("Scanning " + seqName + " ..."); + while (true) { + char[] kmerChars = getNextKMer(); + if (kmerChars == null) { + break; + } + position++; + for (int i = 0; i < searchStrings.length; i++) { + if (Arrays.equals(searchStrings[i], kmerChars)) { + String kmer = new String(searchStrings[i]); + String strand = ((i % 2) == 0) ? "F" : "R"; + System.out.println(kmer + "\t" + seqName + "\t" + position + "\t" + strand); + } + } + } + } + } + + private char[][] loadSearchFile(File file) + throws IOException { + List list = new ArrayList(); + LineNumberReader reader = new LineNumberReader(new FileReader(file)); + while (true) { + String line = reader.readLine(); + if (line == null) { + reader.close(); + break; + } + String text = line.trim(); + if (text.length() == 0 || text.startsWith("#")) { + continue; + } + String[] fields = text.split("\\s+"); + char[] kmer = fields[0].toUpperCase().toCharArray(); + list.add(kmer); + list.add(reverseComplement(kmer)); + } + return list.toArray(new char[0][0]); + } + + // Can be used to scan genome for sequence names/lengths. 
+ private void scanKMers() + throws IOException { + mSequenceList = new ArrayList(); + mSequenceOffsetList = new ArrayList(); + File priorMapFile = + new File(mOutputDirectory, "unique_" + (mK-1) + "_mers_map.bin"); + openPriorMap(priorMapFile); + while (true) { + String seqName = getNextSequence(); + if (seqName == null) { + break; + } + mSequenceList.add(seqName); + mSequenceOffsetList.add(mBaseIndex+1); + log("Scanning " + seqName + " ..."); + while (true) { + char[] kmerChars = getNextKMer(); + if (kmerChars == null) { + break; + } + mKMerCount++; + if (isUniqueInPriorMap(mBaseIndex)) { + continue; + } + } + } + closePriorMap(); + } + + private void mapGaps() + throws IOException { + while (true) { + String seqName = getNextSequence(); + if (seqName == null) { + break; + } + int pos = 0; + int gapStart = 0; + while (true) { + char base = getNextBase(); + if (base == 0) { + break; + } + pos++; + if (base == 'N') { + if (gapStart == 0) { + gapStart = pos; + } + } else { + if (gapStart > 0) { + System.out.println(seqName + "\t" + gapStart + "\t" + (pos-1)); + gapStart = 0; + } + } + } + if (gapStart > 0) { + System.out.println(seqName + "\t" + gapStart + "\t" + (pos-1)); + gapStart = 0; + } + } + } + + private void rollUp() + throws IOException { + // Roll up based on the middle of the reads. 
+ File[] mapFiles = getAllMapFiles(); + if (mapFiles.length > 127) { + throw new RuntimeException("K to large for byte sized counts"); + } + SequenceIterator seqIterator = new SequenceIterator(mInputFiles); + while (true) { + String seqName = seqIterator.getNextSequence(); + if (seqName == null) { + break; + } + if (mSequenceName != null && !mSequenceName.equals(seqName)) { + continue; + } + log("Rolling up sequence " + seqName + " ..."); + int seqBaseIndex = seqIterator.getBaseIndex() + 1; + char[] seqChars = loadSequence(seqIterator); + int seqLength = seqChars.length; + int seqMapOffset = (seqBaseIndex >> 3) & 0x1FFFFFFF; + int seqMapModulus = (seqBaseIndex & 0x7); + int seqMapLength = (seqMapModulus + seqLength + 7)/8; + // log(" seqLength = " + seqLength); + // log(" baseIndex = " + Integer.toHexString(seqBaseIndex) + // + " (" + (((long)seqBaseIndex) & 0xFFFFFFFFL) + ")"); + // log(" seqMapOffset = " + seqMapOffset); + // log(" seqMapLength = " + seqMapLength); + byte[] counts = new byte[seqLength]; + for (int pos = 1; pos <= seqLength; pos++) { + if (seqChars[pos-1] == 'N') { + counts[pos-1] = -1; + } + } + for (int k = 1; k <= mapFiles.length; k++) { + if (mapFiles[k-1] == null) { + continue; + } + log("Processing map file " + mapFiles[k-1] + " ..."); + byte[] kmerMap = readMapFileRegion(mapFiles[k-1], seqMapOffset, seqMapLength); + for (int pos = 1; pos <= seqLength; pos++) { + if (counts[pos-1] != 0) { + continue; + } else if (isNearContigBoundary(pos, seqChars, k)) { + counts[pos-1] = -1; + } else { + int baseOffset = pos - (k+1)/2; + int mapIndex = seqMapModulus + baseOffset; + if (isUniqueInMap(kmerMap, mapIndex)) { + counts[pos-1] = (byte) k; + } + } + } + } + File outputFile = + new File(mOutputDirectory, "rollup_" + seqName + ".bin"); + writeRollUpFile(outputFile, counts); + } + } + + private boolean isNearContigBoundary(int pos, char[] seqChars, int k) { + int windowStart = pos - (k-1)/2; + int windowEnd = pos + k/2; + if (windowStart < 1 || 
windowEnd > seqChars.length) { + return true; + } + for (int i = windowStart-1; i < windowEnd; i++) { + if (seqChars[i] == 'N') { + return true; + } + } + return false; + } + + private void writeRollUpFile(File file, byte[] counts) + throws IOException { + FileOutputStream stream = new FileOutputStream(file); + stream.write(counts); + stream.flush(); + stream.close(); + if (mDebug) { + PrintWriter writer = new PrintWriter(file + ".dbg"); + for (int i = 0; i < counts.length; i++) { + writer.println(counts[i]); + } + writer.flush(); + writer.close(); + } + } + + /** + * Returns an array of files, indexed by K, + * where the array index = K-1 (i.e. K=1 is the first file). + * If there is no file for index K, then the array element is null. + */ + private File[] getAllMapFiles() { + int maxK = mMaximumK; + if (maxK == 0) { + // Safe upper bound + maxK = 1000; + } + List fileList = new ArrayList(); + for (int k = 1; k <= maxK; k++) { + if (mMinimumK > 0 && k < mMinimumK) { + continue; + } + File mapFile = + new File(mInputDirectory, "unique_" + k + "_mers_map.bin"); + if (mapFile.exists()) { + while (fileList.size() < k-1) { + fileList.add(null); + } + fileList.add(mapFile); + } else { + if (mMaximumK == 0 && !fileList.isEmpty()) { + break; + } + } + } + File[] result = new File[fileList.size()]; + result = fileList.toArray(result); + if (mDebug) { + for (int i = 0; i < result.length; i++) { + debug("mapFiles[k=" + (i+1) + "] = " + result[i]); + } + } + return result; + } + + private char[] loadSequence(SequenceIterator seqIterator) + throws IOException { + StringBuilder builder = new StringBuilder(); + while (true) { + char ch = seqIterator.getNextBase(); + if (ch == 0) { + break; + } + builder.append(ch); + } + char[] result = new char[builder.length()]; + builder.getChars(0, builder.length(), result, 0); + return result; + } + + private void mapKMersDistributed() + throws Exception { + DistributedKMerCounter algorithm = new DistributedKMerCounter(); + 
algorithm.setDebug(mDebug); + algorithm.setVerbose(mVerbose); + algorithm.setInputFiles(mInputFiles); + algorithm.setK(mK); + algorithm.setMaximumWorkerCount(mDistributedWorkerCount); + // algorithm.setLsfQueue(mLsfQueue); + // algorithm.setLsfLogDirectory(mLsfLogDirectory); + // algorithm.setEnableGcLogging(mEnableGcLogging); + algorithm.run(); + } + + private void mapKMers() + throws IOException { + + File textKMerFile = + new File(mOutputDirectory, "unique_" + mK + "_mers.txt"); + File binaryKMerFile = + new File(mOutputDirectory, "unique_" + mK + "_mers.bin"); + File exceptionFile = + new File(mOutputDirectory, "unique_" + mK + "_mers.extra"); + File mapFile = + new File(mOutputDirectory, "unique_" + mK + "_mers_map.bin"); + File priorMapFile = + new File(mOutputDirectory, "unique_" + (mK-1) + "_mers_map.bin"); + File statsFile = + new File(mOutputDirectory, "unique_" + mK + "_mers_stats.txt"); + + if (mBatchSize == 0) { + throw new RuntimeException("Batch size not specified"); + } + + int kmerCount = 0; + int batchSize = mBatchSize; + KMerPosition[] kmerArray = new KMerPosition[batchSize]; + List exceptionList = new ArrayList(); + mSequenceList = new ArrayList(); + mSequenceOffsetList = new ArrayList(); + mIOBuffer = new byte[Math.max(20,4 + 2*((mK + 7)/8))]; + + openPriorMap(priorMapFile); + + while (true) { + String seqName = getNextSequence(); + if (seqName == null) { + break; + } + mSequenceList.add(seqName); + mSequenceOffsetList.add(mBaseIndex+1); + log("Processing " + seqName + " ..."); + while (true) { + char[] kmerChars = getNextKMer(); + if (kmerChars == null) { + break; + } + mKMerCount++; + int baseIndex = mBaseIndex; + if (isUniqueInPriorMap(baseIndex)) { + mUniquePriorCount++; + continue; + } + KMerPosition kmp = encodeKMer(kmerChars, baseIndex); + if (kmp == null) { + // Note: We currently do not handle the reverse + // complement of exception characters correctly. 
+ // For hg18, however, this doesn't matter as + // none of the kmers containing non-ACGT characters + // are present on the reverse strand. + String kmer = new String(kmerChars); + exceptionList.add(new StringKMerPosition(kmer, baseIndex)); + continue; + } + kmerArray[kmerCount++] = kmp; + if (kmerCount == batchSize) { + kmerCount = compactKMers(kmerArray, kmerCount); + if (kmerCount > mSpillFactor * batchSize) { + spillKMers(kmerArray, kmerCount); + kmerCount = 0; + } + } + } + } + if (kmerCount > 0) { + kmerCount = compactKMers(kmerArray, kmerCount); + if (mSpillFileList != null) { + spillKMers(kmerArray, kmerCount); + kmerCount = 0; + } + } + + closePriorMap(); + + // Write out the exception kmers (text file). + compactKMers(exceptionList); + writeExceptionFile(exceptionList, exceptionFile); + + // Write out the binary file of unique encoded kmers. + if (mSpillFileList == null) { + kmerCount = removeNonUnique(kmerArray, kmerCount); + writeKMerBinaryFile(kmerArray, kmerCount, binaryKMerFile); + mUniqueNewCount = kmerCount; + } else { + mUniqueNewCount = mergeSpillFiles(mSpillFileList, binaryKMerFile); + } + mUniqueNewCount += countUniqueKMers(exceptionList); + + // Write out the text file of (all) unique kmers. + writeKMerTextFile(binaryKMerFile, exceptionList, textKMerFile); + + // Create map file from prior map plus the new unique kmers. + long mapSize = (mBaseIndex + 1) & 0xFFFFFFFFL; + createMapFile(mapSize, binaryKMerFile, exceptionList, priorMapFile, mapFile); + + // Write summary statistics file. 
+ writeSummaryStatistics(statsFile); + } + + private int compactKMers(KMerPosition[] kmerArray, int kmerCount) { + if (kmerCount == 0) { + return 0; + } + log("Compacting " + kmerCount + " kmers at index " + + Integer.toHexString(mBaseIndex) + " ..."); + Arrays.sort(kmerArray, 0, kmerCount); + int newCount = 1; + KMerPosition current = kmerArray[0]; + for (int i = 1; i < kmerCount; i++) { + KMerPosition kmp = kmerArray[i]; + if (current.compareTo(kmp) == 0) { + current.setBaseIndex(NONUNIQUE_MARKER); + } else { + kmerArray[newCount++] = kmp; + current = kmp; + } + } + log("Compaction finished, new count is " + newCount); + return newCount; + } + + private int compactKMers(StringKMerPosition[] kmerArray, int kmerCount) { + if (kmerCount == 0) { + return 0; + } + log("Compacting " + kmerCount + " string kmers ..."); + Arrays.sort(kmerArray, 0, kmerCount); + int newCount = 1; + String kmerString = kmerArray[0].getKMer(); + for (int i = 1; i < kmerCount; i++) { + StringKMerPosition kmp = kmerArray[i]; + String ks = kmp.getKMer(); + if (ks.equals(kmerString)) { + kmerArray[newCount-1].setBaseIndex(NONUNIQUE_MARKER); + } else { + kmerArray[newCount++] = kmp; + kmerString = ks; + } + } + log("Compaction finished, new count is " + newCount); + return newCount; + } + + private void compactKMers(List kmerList) { + int kmerCount = kmerList.size(); + if (kmerCount <= 1) { + return; + } + StringKMerPosition[] kmerArray = + kmerList.toArray(new StringKMerPosition[kmerCount]); + kmerCount = compactKMers(kmerArray, kmerCount); + kmerList.clear(); + for (int i = 0; i < kmerCount; i++) { + kmerList.add(kmerArray[i]); + } + } + + private int removeNonUnique(KMerPosition[] kmerArray, int kmerCount) { + int uniqueCount = 0; + for (int i = 0; i < kmerCount; i++) { + KMerPosition kmp = kmerArray[i]; + if (kmp.getBaseIndex() != NONUNIQUE_MARKER) { + kmerArray[uniqueCount++] = kmp; + } + } + return uniqueCount; + } + + private int countUniqueKMers(List kmerList) { + int uniqueCount = 0; + 
for (StringKMerPosition kmp : kmerList) { + if (kmp.getBaseIndex() != NONUNIQUE_MARKER) { + uniqueCount++; + } + } + return uniqueCount; + } + + private void spillKMers(KMerPosition[] kmerArray, int kmerCount) + throws IOException { + if (mSpillFileList == null) { + mSpillFileList = new ArrayList(); + } + int fileNumber = mSpillFileList.size() + 1; + log("Spilling " + kmerCount + " kmers to file " + fileNumber + " ..."); + File spillFile = new File(mOutputDirectory, + "spill_" + mK + "_" + fileNumber + ".tmp"); + mSpillFileList.add(spillFile); + writeKMerBinaryFile(kmerArray, kmerCount, spillFile); + log("Spill file written"); + } + + private void writeKMerBinaryFile(KMerPosition[] kmerArray, + int kmerCount, + File outputFile) + throws IOException { + OutputStream outputStream = + new BufferedOutputStream(new FileOutputStream(outputFile)); + for (int i = 0; i < kmerCount; i++) { + KMerPosition kmp = kmerArray[i]; + writeKMerPosition(outputStream, kmerArray[i]); + } + outputStream.flush(); + outputStream.close(); + } + + private void writeExceptionFile(List kmerList, + File outputFile) + throws IOException { + PrintWriter writer = + new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); + for (StringKMerPosition kmer : kmerList) { + writeUniqueKMer(kmer, writer); + } + writer.flush(); + writer.close(); + } + + private KMerPosition readKMerPosition(InputStream stream) + throws IOException { + byte[] buffer = mIOBuffer; + int encodingLength = (mK + 7)/8; + int fileLength = 4 + 2*encodingLength; + int count = readFully(stream, buffer, 0, fileLength); + if (count <= 0) { + return null; + } else if (count != fileLength) { + throw new RuntimeException("Unexpected end of file"); + } + char[] encoding = new char[encodingLength]; + int baseIndex = ((buffer[0] & 0xFF) | + (buffer[1] & 0xFF) << 8 | + (buffer[2] & 0xFF) << 16 | + (buffer[3] & 0xFF) << 24); + for (int i = 0; i < encodingLength; i++) { + encoding[i] = (char) ((buffer[2*i+4] & 0xFF) | + ((buffer[2*i+5] 
& 0xFF) << 8)); + } + return new KMerPosition(encoding, baseIndex); + } + + private int readFully(InputStream stream, byte[] buffer, int offset, int count) + throws IOException { + int readCount = 0; + while (readCount < count) { + int read = stream.read(buffer, offset, count-readCount); + if (read <= 0) { + break; + } + offset += read; + readCount += read; + } + return readCount; + } + + private void skipBytes(InputStream stream, int count) + throws IOException { + + long longCount = count; + long skipCount = 0; + while (skipCount < longCount) { + long skipped = stream.skip(longCount - skipCount); + if (skipped <= 0) { + throw new RuntimeException("Skip failed"); + } + skipCount += skipped; + } + } + + private void writeKMerPosition(OutputStream stream, KMerPosition kmer) + throws IOException { + byte[] buffer = mIOBuffer; + int baseIndex = kmer.getBaseIndex(); + char[] encoding = kmer.getKMerEncoding(); + int offset = 0; + buffer[offset++] = (byte) ((baseIndex) & 0xFF); + buffer[offset++] = (byte) ((baseIndex >> 8) & 0xFF); + buffer[offset++] = (byte) ((baseIndex >> 16) & 0xFF); + buffer[offset++] = (byte) ((baseIndex >> 24) & 0xFF); + for (int i = 0; i < encoding.length; i++) { + buffer[offset++] = (byte) ((encoding[i]) & 0xFF); + buffer[offset++] = (byte) ((encoding[i] >> 8) & 0xFF); + } + stream.write(buffer, 0, offset); + } + + private long mergeSpillFiles(List spillFiles, File outputFile) + throws IOException { + + if (spillFiles == null) { + return 0; + } + + log("Merging spill files ..."); + OutputStream outputStream = + new BufferedOutputStream(new FileOutputStream(outputFile)); + long uniqueCount = 0; + int fileCount = spillFiles.size(); + InputStream[] inputStreams = new InputStream[fileCount]; + KMerPosition[] kmers = new KMerPosition[fileCount]; + for (int i = 0; i < fileCount; i++) { + inputStreams[i] = + new BufferedInputStream(new FileInputStream(spillFiles.get(i))); + } + while (true) { + for (int i = 0; i < fileCount; i++) { + if (kmers[i] == 
null && inputStreams[i] != null) { + kmers[i] = readKMerPosition(inputStreams[i]); + if (kmers[i] == null) { + inputStreams[i].close(); + inputStreams[i] = null; + } + } + } + int count = 0; + KMerPosition kmer = null; + for (int i = 0; i < fileCount; i++) { + KMerPosition kmp = kmers[i]; + if (kmp == null) { + continue; + } else if (kmer == null) { + kmer = kmp; + count = 1; + } else { + int cmp = kmp.compareTo(kmer); + if (cmp == 0) { + count++; + } else if (cmp < 0) { + kmer = kmp; + count = 1; + } + } + } + if (kmer == null) { + break; + } + for (int i = 0; i < fileCount; i++) { + if (kmers[i] != null && kmer.compareTo(kmers[i]) == 0) { + kmers[i] = null; + } + } + if (count == 1 && kmer.getBaseIndex() != NONUNIQUE_MARKER) { + uniqueCount++; + writeKMerPosition(outputStream, kmer); + } + } + outputStream.flush(); + outputStream.close(); + for (int i = 0; i < fileCount; i++) { + // spillFiles.get(i).delete(); + } + log("Spill files merged, unique count is " + uniqueCount); + return uniqueCount; + } + + private void writeKMerTextFile(File inputFile, + List exceptionList, + File outputFile) + throws IOException { + + log("Writing kmer file " + outputFile + " ..."); + int exceptionIndex = 0; + StringKMerPosition excKMer = null; + Iterator excIter = null; + if (!exceptionList.isEmpty()) { + excIter = exceptionList.iterator(); + excKMer = excIter.next(); + } + + InputStream inputStream = + new BufferedInputStream(new FileInputStream(inputFile)); + PrintWriter writer = + new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); + KMerPosition kmer = readKMerPosition(inputStream); + while (kmer != null || excKMer != null) { + if (excKMer == null) { + writeUniqueKMer(kmer, writer); + kmer = readKMerPosition(inputStream); + } else if (kmer == null) { + writeUniqueKMer(excKMer, writer); + excKMer = excIter.hasNext() ? 
excIter.next() : null; + } else if (kmer.getKMer().compareTo(excKMer.getKMer()) < 0) { + writeUniqueKMer(kmer, writer); + kmer = readKMerPosition(inputStream); + } else { + writeUniqueKMer(excKMer, writer); + excKMer = excIter.hasNext() ? excIter.next() : null; + } + } + inputStream.close(); + writer.flush(); + writer.close(); + log("Wrote kmer file: " + outputFile); + } + + private void writeUniqueKMer(KMerPosition kmer, PrintWriter writer) { + if (kmer.getBaseIndex() != NONUNIQUE_MARKER) { + writeKMer(kmer.getKMer(), kmer.getBaseIndex(), writer); + } + } + + private void writeUniqueKMer(StringKMerPosition kmer, PrintWriter writer) { + if (kmer.getBaseIndex() != NONUNIQUE_MARKER) { + writeKMer(kmer.getKMer(), kmer.getBaseIndex(), writer); + } + } + + private void writeKMer(String kmer, int baseIndex, PrintWriter writer) { + String chr = getBaseIndexSequenceName(baseIndex); + int pos = getBaseIndexCoordinate(baseIndex); + writer.println(kmer + "\t" + chr + "\t" + pos); + } + + private void createMapFile(long mapSize, + File kmerFile, + List exceptionList, + File priorMapFile, + File mapFile) + throws IOException { + byte[] map = null; + long uniquePriorCount = 0; + long byteSize = (mapSize + 7)/8; + int mapByteSize = (int) byteSize; + if (mapByteSize != byteSize) { + throw new RuntimeException("Map too large: " + mapSize); + } + if (priorMapFile.exists()) { + map = readMapFile(priorMapFile); + if (map.length != mapByteSize) { + throw new RuntimeException("Prior map is wrong size"); + } + // Count the prior unique positions + for (int i = 0; i < mapByteSize; i++) { + uniquePriorCount += Integer.bitCount(map[i] & 0xFF); + } + } else { + map = new byte[mapByteSize]; + } + for (StringKMerPosition kmp : exceptionList) { + addToMap(kmp, map); + } + mPriorMapUniqueCount = uniquePriorCount; + + InputStream inputStream = + new BufferedInputStream(new FileInputStream(kmerFile)); + while (true) { + KMerPosition kmp = readKMerPosition(inputStream); + if (kmp == null) { + 
inputStream.close(); + break; + } + addToMap(kmp, map); + } + + writeMapFile(map, mapFile); + } + + private void addToMap(KMerPosition kmp, byte[] map) { + int baseIndex = kmp.getBaseIndex(); + if (baseIndex != NONUNIQUE_MARKER) { + addToMap(baseIndex, map); + } + } + + private void addToMap(StringKMerPosition kmp, byte[] map) { + int baseIndex = kmp.getBaseIndex(); + if (baseIndex != NONUNIQUE_MARKER) { + addToMap(baseIndex, map); + } + } + + private void addToMap(int baseIndex, byte[] map) { + int mod = baseIndex & 0x7; + int offset = (baseIndex >> 3) & 0x1FFFFFFF; + if ((map[offset] & (1 << mod)) != 0) { + throw new RuntimeException("Map entry already set: " + baseIndex); + } + map[offset] |= (1 << mod); + } + + private boolean isUniqueInMap(byte[] map, int baseIndex) { + int mod = baseIndex & 0x7; + int offset = (baseIndex >> 3) & 0x1FFFFFFF; + return ((map[offset] & (1 << mod)) != 0); + } + + private void writeSummaryStatistics(File outputFile) + throws IOException { + PrintWriter writer = + new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); + long baseCount = (mBaseIndex + 1) & 0xFFFFFFFFL; + long uniqueCount = mUniquePriorCount + mUniqueNewCount; + long nonUniqueCount = mKMerCount - uniqueCount; + writer.println("K: " + mK); + writer.println("Sequences: " + mSequenceList.size()); + writer.println("Bases: " + baseCount); + writer.println("KMers: " + mKMerCount); + writer.println("Prior map count: " + mPriorMapUniqueCount); + writer.println("Unique prior: " + mUniquePriorCount + + " (" + formatPercent(mUniquePriorCount, mKMerCount) + ")"); + writer.println("Unique new: " + mUniqueNewCount + + " (" + formatPercent(mUniqueNewCount, mKMerCount) + ")"); + writer.println("Unique cumulative: " + uniqueCount + + " (" + formatPercent(uniqueCount, mKMerCount) + ")"); + writer.println("Nonunique: " + nonUniqueCount + + " (" + formatPercent(nonUniqueCount, mKMerCount) + ")"); + writer.flush(); + writer.close(); + } + + private String formatPercent(long 
numerator, long denominator) { + double fraction = 0.0; + if (denominator != 0) { + fraction = numerator / (double) denominator; + } + return String.format("%1.1f%%", fraction * 100.0); + } + + private void openPriorMap(File mapFile) + throws IOException { + if (mapFile.exists()) { + mPriorMapStream = new BufferedInputStream(new FileInputStream(mapFile)); + mPriorMapPosition = -1; + mPriorMapValue = 0; + } + } + + private void closePriorMap() + throws IOException { + if (mPriorMapStream != null) { + mPriorMapStream.close(); + } + mPriorMapStream = null; + mPriorMapPosition = -1; + mPriorMapValue = 0; + } + + private byte[] readMapFile(File file) + throws IOException { + long fileLength = file.length(); + if (fileLength > 1000000000) { + throw new RuntimeException("Prior map too large: " + file); + } + int length = (int) fileLength; + byte[] map = new byte[length]; + FileInputStream stream = new FileInputStream(file); + int count = readFully(stream, map, 0, length); + if (count != length) { + throw new RuntimeException("Failed to read map: " + file); + } + stream.close(); + return map; + } + + /** + * Read just a subset of a map file. 
+ */ + private byte[] readMapFileRegion(File file, int offset, int length) + throws IOException { + byte[] map = new byte[length]; + FileInputStream stream = new FileInputStream(file); + skipBytes(stream, offset); + int count = readFully(stream, map, 0, length); + if (count != length) { + throw new RuntimeException("Failed to read map: " + file); + } + stream.close(); + return map; + } + + private void writeMapFile(byte[] map, File file) + throws IOException { + FileOutputStream stream = new FileOutputStream(file); + stream.write(map); + stream.flush(); + stream.close(); + } + + private boolean isUniqueInPriorMap(int baseIndex) + throws IOException { + if (mPriorMapStream == null) { + return false; + } + int byteOffset = (baseIndex >> 3) & 0x1FFFFFFF; + if (byteOffset != mPriorMapPosition) { + int delta = byteOffset - mPriorMapPosition; + if (delta < 0) { + throw new RuntimeException("Attempt to seek backwards in prior map"); + } + if (delta > 1) { + skipFully(mPriorMapStream, delta-1); + } + mPriorMapValue = mPriorMapStream.read(); + if (mPriorMapValue < 0) { + throw new RuntimeException("Unexpected end of file in prior map"); + } + mPriorMapPosition += delta; + } + int mod = baseIndex & 0x7; + return (((1 << mod) & mPriorMapValue) != 0); + } + + private void skipFully(InputStream stream, long amount) + throws IOException { + while (amount > 0) { + long skip = stream.skip(amount); + if (skip <= 0 || skip > amount) { + throw new RuntimeException("Skip failed"); + } + amount -= skip; + } + } + + private String getBaseIndexSequenceName(int baseIndex) { + int sequenceCount = mSequenceList.size(); + for (int i = 0; i < sequenceCount-1; i++) { + int nextOffset = mSequenceOffsetList.get(i+1); + if (compareBaseIndex(nextOffset, baseIndex) > 0) { + return mSequenceList.get(i); + } + } + return mSequenceList.get(sequenceCount-1); + } + + private int getBaseIndexCoordinate(int baseIndex) { + Integer sequenceOffset = null; + for (Integer offset : mSequenceOffsetList) { + if 
(compareBaseIndex(offset, baseIndex) > 0) { + break; + } + sequenceOffset = offset; + } + if (sequenceOffset == null) { + return 0; + } + int coordinate = baseIndex - sequenceOffset + 1; + if (coordinate <= 0) { + dumpSequenceList(); + System.out.println("coordinate: " + coordinate); + System.out.println("sequenceOffset: " + Integer.toHexString(sequenceOffset)); + System.out.println("baseIndex: " + Integer.toHexString(baseIndex)); + throw new RuntimeException("Internal error: illegal coordinate " + + coordinate + " for base index " + baseIndex); + } + return coordinate; + } + + private void dumpSequenceList() { + System.out.println("# Sequences:"); + int count = mSequenceList.size(); + for (int i = 0; i < count; i++) { + String seqName = mSequenceList.get(i); + int offset = mSequenceOffsetList.get(i); + System.out.println("# " + seqName + + "\t" + offset + + "\t" + Integer.toHexString(offset)); + } + } + + private int compareBaseIndex(int baseIndex1, int baseIndex2) { + // Implements unsigned comparison, a la compareTo + if (baseIndex1 < 0 ^ baseIndex2 < 0) { + return ((baseIndex1 < 0) ? 
1 : -1); + } else { + return (baseIndex1 - baseIndex2); + } + } + + private String getNextSequence() + throws IOException { + + while (mNextSequence == null) { + if (mCurrentReader == null) { + mCurrentReader = getNextReader(); + if (mCurrentReader == null) { + return null; + } + } + String line = mCurrentReader.readLine(); + if (line == null) { + mCurrentReader.close(); + mCurrentReader = null; + continue; + } + if (line.startsWith(">")) { + String[] tokens = line.substring(1).trim().split("\\s+"); + mNextSequence = tokens[0]; + } + } + String result = mNextSequence; + mNextSequence = null; + return result; + } + + private LineNumberReader getNextReader() + throws IOException { + if (mInputFileIndex >= mInputFiles.size()) { + return null; + } + File file = mInputFiles.get(mInputFileIndex++); + return new LineNumberReader(new FileReader(file)); + } + + private char[] getNextKMer() + throws IOException { + + if (mKMerBuffer == null) { + mKMerBuffer = new char[mK]; + } + System.arraycopy(mKMerBuffer, 1, mKMerBuffer, 0, mKMerBuffer.length - 1); + if (mKMerBufferedCount > 0) { + mKMerBufferedCount--; + } + + while (mKMerBufferedCount < mK) { + char base = getNextBase(); + if (base == 0) { + incrementBaseIndex(mKMerBufferedCount); + mKMerBufferedCount = 0; + return null; + } else if (base == 'N') { + incrementBaseIndex(mKMerBufferedCount+1); + mKMerBufferedCount = 0; + } else { + mKMerBuffer[mKMerBufferedCount++] = base; + } + } + incrementBaseIndex(1); + return mKMerBuffer; + } + + private char getNextBase() + throws IOException { + + if (mLineBuffer == null || mLineBufferIndex >= mLineBuffer.length()) { + if (mCurrentReader == null) { + return 0; + } + String line = mCurrentReader.readLine(); + if (line == null) { + mLineBuffer = null; + mLineBufferIndex = 0; + mCurrentReader.close(); + mCurrentReader = null; + return 0; + } + if (line.startsWith(">")) { + String[] tokens = line.substring(1).trim().split("\\s+"); + mNextSequence = tokens[0]; + mLineBuffer = null; + 
mLineBufferIndex = 0; + return 0; + } + mLineBuffer = line.toUpperCase(); + mLineBufferIndex = 0; + } + return mLineBuffer.charAt(mLineBufferIndex++); + } + + private void incrementBaseIndex(int amount) { + if (mBaseIndex < -1 && (mBaseIndex + amount) >= -1) { + throw new RuntimeException("Base index: 32-bit overflow"); + } + mBaseIndex += amount; + } + + private void log(String text) { + if (mVerbose) { + System.out.println("# " + new Date() + " " + text); + } + } + + private void debug(String text) { + if (mDebug) { + System.out.println("# " + new Date() + " " + text); + } + } + + private static KMerPosition encodeKMer(char[] kmerChars, int baseIndex) { + char[] encoding = encodeKMerChars(kmerChars); + if (encoding == null) { + return null; + } + char[] reverseEncoding = encodeKMerChars(reverseComplement(kmerChars)); + if (compareEncodings(encoding, reverseEncoding) <= 0) { + return new KMerPosition(encoding, baseIndex); + } else { + KMerPosition kmp = new KMerPosition(reverseEncoding, baseIndex); + kmp.setIsReversed(true); + return kmp; + } + } + + private static char[] encodeKMerChars(char[] kmerChars) { + if (kmerChars == null) { + return null; + } + + int kmerLength = kmerChars.length; + int encodingLength = (kmerLength + 7) / 8; + char[] encoding = new char[encodingLength]; + int offset = kmerLength % 8; + offset = (offset == 0) ? 
8 : offset; + int bits = encodeKMerBits(kmerChars, 0, offset); + if (bits < 0) { + return null; + } + encoding[0] = (char) bits; + for (int i = 1; i < encodingLength; i++) { + bits = encodeKMerBits(kmerChars, offset, 8); + if (bits < 0) { + return null; + } + encoding[i] = (char) bits; + offset += 8; + } + return encoding; + } + + private static int compareEncodings(char[] encoding1, char[] encoding2) { + int length = Math.max(encoding1.length, encoding2.length); + for (int i = 0; i < length; i++) { + int result = encoding1[i] - encoding2[i]; + if (result != 0) { + return result; + } + } + return 0; + } + + private static int encodeKMerBits(char[] kmerChars, int offset, int length) { + int bits = 0; + for (int i = 0; i < length; i++) { + char base = kmerChars[offset + i]; + int baseBits = "ACGT".indexOf(base); + if (baseBits < 0) { + return -1; + } + bits |= baseBits << (2*(length-i-1)); + } + return bits; + } + + private static String decodeKMer(char[] encoding, boolean reverse) { + int length = mK; + char[] buffer = new char[length]; + int offset = length % 8; + offset = (offset == 0) ? 
8 : offset; + decodeKMerBits(encoding[0], buffer, 0, offset); + for (int i = 1; i < encoding.length; i++) { + decodeKMerBits(encoding[i], buffer, offset, 8); + offset += 8; + } + if (reverse) { + reverseComplementInPlace(buffer); + } + return new String(buffer); + } + + private static void decodeKMerBits(char bits, char[] buffer, int offset, int length) { + for (int i = 0; i < length; i++) { + int baseBits = (int) ((bits >> (2*(length-i-1))) & 0x3); + buffer[offset + i] = "ACGT".charAt(baseBits); + } + } + + private static void decodeKMerBits(long bits, char[] buffer, int offset, int length) { + for (int i = 0; i < length; i++) { + int baseBits = (int) ((bits >> (2*(length-i-1))) & 0x3); + buffer[offset + i] = "ACGT".charAt(baseBits); + } + } + + private static char[] reverseComplement(char[] buffer) { + int length = buffer.length; + char[] result = new char[length]; + System.arraycopy(buffer, 0, result, 0, length); + reverseComplementInPlace(result); + return result; + } + + private static void reverseComplementInPlace(char[] buffer) { + int length = buffer.length; + int limit = (length + 1)/2; + for (int i = 0; i < limit; i++) { + char ch1 = reverseComplement(buffer[i]); + char ch2 = reverseComplement(buffer[length-i-1]); + buffer[i] = ch2; + buffer[length-i-1] = ch1; + } + } + + private static char reverseComplement(char base) { + switch (base) { + case 'A': + return 'T'; + case 'C': + return 'G'; + case 'G': + return 'C'; + case 'T': + return 'A'; + } + return base; + } + + private static String formatEncoding(char[] encoding) { + if (encoding == null) { + return null; + } + StringBuilder builder = new StringBuilder(); + builder.append('['); + for (int i = 0; i < encoding.length; i++) { + String hex = Integer.toHexString(encoding[i]); + int length = hex.length(); + while (length < 4) { + builder.append('0'); + length++; + } + builder.append(hex); + } + builder.append(']'); + return builder.toString(); + } + + static class KMerPosition + implements Comparable { 
+ + private int mBaseIndex; + private boolean mReversed; + private char[] mKMerEncoding; + + KMerPosition(char[] encoding, int baseIndex) { + mBaseIndex = baseIndex; + mReversed = false; + mKMerEncoding = encoding; + } + + public final String getKMer() { + return decodeKMer(mKMerEncoding, mReversed); + } + + public final boolean getIsReversed() { + return mReversed; + } + + public final void setIsReversed(boolean value) { + mReversed = value; + } + + public final int getBaseIndex() { + return mBaseIndex; + } + + public final void setBaseIndex(int baseIndex) { + mBaseIndex = baseIndex; + } + + public final char[] getKMerEncoding() { + return mKMerEncoding; + } + + public int compareTo(KMerPosition kmp) { + return compareEncodings(getKMerEncoding(), kmp.getKMerEncoding()); + } + + public boolean equals(Object object) { + if (!(object instanceof KMerPosition)) { + return false; + } + KMerPosition kmp = (KMerPosition) object; + return (getBaseIndex() == kmp.getBaseIndex() && + this.compareTo(kmp) == 0); + } + + public String format() { + return(getKMer() + + " " + formatEncoding(getKMerEncoding()) + + " " + (mReversed ? 
'R' : 'F') + + " " + Integer.toHexString(mBaseIndex)); + } + } + + static class StringKMerPosition + implements Comparable { + + private String mKMerString = null; + private int mBaseIndex; + + StringKMerPosition(String kmer, int baseIndex) { + mKMerString = kmer; + mBaseIndex = baseIndex; + } + + public final String getKMer() { + return mKMerString; + } + + public final int getBaseIndex() { + return mBaseIndex; + } + + public final void setBaseIndex(int baseIndex) { + mBaseIndex = baseIndex; + } + + public int compareTo(StringKMerPosition kmp) { + return mKMerString.compareTo(kmp.mKMerString); + } + + public boolean equals(Object object) { + if (!(object instanceof StringKMerPosition)) { + return false; + } + StringKMerPosition kmp = (StringKMerPosition) object; + return (mBaseIndex == kmp.mBaseIndex && + mKMerString.equals(kmp.mKMerString)); + } + } +} diff --git a/lib/edu/mit/broad/cnv/kmer/DistributedKMerCounter.java b/lib/edu/mit/broad/cnv/kmer/DistributedKMerCounter.java new file mode 100644 index 0000000000..90b26d0b1f --- /dev/null +++ b/lib/edu/mit/broad/cnv/kmer/DistributedKMerCounter.java @@ -0,0 +1,151 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ +package edu.mit.broad.cnv.kmer; + + +import edu.mit.broad.dcp.DistributedAlgorithm; +import edu.mit.broad.cnv.util.SequenceIterator; + +import java.io.*; +import java.util.*; + + +/** + * Distributed algorithm for counting unique kmers. 
+ */ +public class DistributedKMerCounter + extends DistributedAlgorithm +{ + private boolean mDebug = false; + private boolean mVerbose = false; + private int mK = 0; + private List mInputFiles = null; + private List mSequenceList = null; + private List mSequenceOffsetList = null; + + + public DistributedKMerCounter() { + } + + public boolean getDebug() { + return mDebug; + } + + public void setDebug(boolean value) { + mDebug = value; + } + + public boolean getVerbose() { + return mVerbose; + } + + public void setVerbose(boolean value) { + mVerbose = value; + } + + public int getK() { + return mK; + } + + public void setK(int value) { + mK = value; + } + + public List getInputFiles() { + return mInputFiles; + } + + public void setInputFiles(List value) { + mInputFiles = value; + } + + public void run() + throws Exception { + super.run(); + finish(); + } + + protected void init() + throws Exception { + if (getWorkerId() == MASTER) { + initMaster(); + } else { + initWorker(); + } + } + + private void initMaster() + throws IOException { + // Tasks to be amortized + report("Scanning sequences ..."); + scanSequences(); + report("Scan complete."); + } + + private void initWorker() { + // Tasks to be amortized + } + + protected void start() { + // scan genome, divide into chromosomes and optionally segments, distribute calls + } + + private void finish() { + // merge individual files, write out final results + } + + private void scanSequences() + throws IOException { + List sequenceList = new ArrayList(); + List sequenceOffsetList = new ArrayList(); + SequenceIterator seqIterator = new SequenceIterator(getInputFiles()); + while (true) { + String seqName = seqIterator.getNextSequence(); + if (seqName == null) { + break; + } + int baseIndex = seqIterator.getBaseIndex() + 1; + sequenceList.add(seqName); + sequenceOffsetList.add(baseIndex); + } + mSequenceList = sequenceList; + mSequenceOffsetList = sequenceOffsetList; + } + + // Currently not used + private void 
loadGenomeOffsets(File file) + throws IOException { + List sequenceList = new ArrayList(); + List sequenceOffsetList = new ArrayList(); + int baseIndex = 0; + LineNumberReader reader = new LineNumberReader(new FileReader(file)); + while (true) { + String line = reader.readLine(); + if (line == null) { + break; + } + String text = line.trim(); + if (text.length() == 0 || text.startsWith("#")) { + continue; + } + String[] fields = text.split("\\s+"); + if (fields.length != 2) { + throw new RuntimeException("Invalid input line: " + line); + } + int length = Integer.parseInt(fields[1]); + sequenceList.add(fields[0]); + sequenceOffsetList.add(baseIndex); + baseIndex += length; + } + mSequenceList = sequenceList; + mSequenceOffsetList = sequenceOffsetList; + } +} diff --git a/lib/edu/mit/broad/cnv/util/GenomeBaseIndex.java b/lib/edu/mit/broad/cnv/util/GenomeBaseIndex.java new file mode 100644 index 0000000000..7ed22faf3d --- /dev/null +++ b/lib/edu/mit/broad/cnv/util/GenomeBaseIndex.java @@ -0,0 +1,184 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ +package edu.mit.broad.cnv.util; + + +import java.io.*; +import java.util.*; + + +/** + * Utility class for transforming between a linear base index + * and a chromsome + position coordinate system. 
+ */ +public class GenomeBaseIndex { + + private List mSequenceNames = null; + private int[] mLengths = null; + private long[] mOffsets = null; + + private GenomeBaseIndex() { + } + + public static GenomeBaseIndex read(File file) + throws IOException { + Reader reader = new BufferedReader(new FileReader(file)); + try { + return read(reader); + } finally { + reader.close(); + } + } + + // The input is just a list of space-delimited sequence name and length. + public static GenomeBaseIndex read(Reader reader) + throws IOException { + List sequenceNames = new ArrayList(); + List sequenceLengths = new ArrayList(); + BufferedReader bufferedReader = new BufferedReader(reader); + while (true) { + String line = bufferedReader.readLine(); + if (line == null) { + break; + } + String text = line.trim(); + if (text.length() == 0 || text.startsWith("#")) { + continue; + } + String[] fields = text.split("\\s+"); + if (fields.length < 2) { + throw new RuntimeException("Invalid input line: " + line); + } + int length = Integer.parseInt(fields[1]); + if (length <= 0) { + throw new RuntimeException("Invalid sequence length: " + length); + } + sequenceNames.add(fields[0]); + sequenceLengths.add(length); + } + int count = sequenceLengths.size(); + int[] lengths = new int[count]; + long[] offsets = new long[count]; + long offset = 0; + for (int i = 0; i < count; i++) { + lengths[i] = sequenceLengths.get(i); + offsets[i] = offset; + offset += lengths[i]; + } + GenomeBaseIndex result = new GenomeBaseIndex(); + result.mSequenceNames = sequenceNames; + result.mLengths = lengths; + result.mOffsets = offsets; + return result; + } + + public List getSequenceNames() { + return mSequenceNames; + } + + public boolean contains(String seqName) { + return (getSequenceIndex(seqName) >= 0); + } + + public long getFirstIndex(String seqName) { + int index = getSequenceIndex(seqName); + if (index < 0) { + return -1; + } + return mOffsets[index]; + } + + public long getLastIndex(String seqName) { + int 
index = getSequenceIndex(seqName); + if (index < 0) { + return -1; + } + return (mOffsets[index] + mLengths[index] - 1); + } + + public int getSequenceLength(String seqName) { + int index = getSequenceIndex(seqName); + if (index < 0) { + return 0; + } + return mLengths[index]; + } + + public long getBaseIndex(String seqName, int position) { + int index = getSequenceIndex(seqName); + if (index < 0) { + return -1; + } + if (position > mLengths[index]) { + return -1; + } + if (position < 1) { + // Zero or negative position means last base index + position = mLengths[index]; + } + return (mOffsets[index] + position - 1); + } + + public String getSequenceName(long baseIndex) { + int index = getSequenceIndex(baseIndex); + if (index < 0) { + return null; + } + return mSequenceNames.get(index); + } + + public int getPosition(long baseIndex) { + if (baseIndex < 0) { + // Catch common sign-extension error when packing indexes as ints. + throw new IllegalArgumentException("Invalid base index: " + baseIndex); + } + int index = getSequenceIndex(baseIndex); + if (index < 0) { + return 0; + } + long offset = mOffsets[index]; + long result = baseIndex - offset + 1; + return (int) result; + } + + // Same as getSequenceName, but treat the argument as an unsigned int. + // This is useful for manipulating/storing indexes for the human + // genome as 4-byte unsigned ints. + public String getSequenceNameUnsigned(int baseIndex) { + return getSequenceName(baseIndex & 0xFFFFFFFFL); + } + + // Same as getPosition, but treat the argument as an unsigned int. + // This is useful for manipulating/storing indexes for the human + // genome as 4-byte unsigned ints. 
+ public int getPositionUnsigned(int baseIndex) { + return getPosition(baseIndex & 0xFFFFFFFFL); + } + + private int getSequenceIndex(String seqName) { + return mSequenceNames.indexOf(seqName); + } + + private int getSequenceIndex(long baseIndex) { + long offset = 0; + if (baseIndex < 0) { + return -1; + } + for (int i = 0; i < mLengths.length; i++) { + int length = mLengths[i]; + if (offset + length > baseIndex) { + return i; + } + offset += length; + } + return -1; + } +} diff --git a/lib/edu/mit/broad/cnv/util/GenomeBinIndex.java b/lib/edu/mit/broad/cnv/util/GenomeBinIndex.java new file mode 100644 index 0000000000..2d1a96f616 --- /dev/null +++ b/lib/edu/mit/broad/cnv/util/GenomeBinIndex.java @@ -0,0 +1,167 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ +package edu.mit.broad.cnv.util; + + +import java.io.*; +import java.util.*; + + +/** + * Utility class for transforming between a chromsome + position + * coordinate system and a binned coordinate system where each + * chromosome (separately) is divided into fixed sized bins, + * ragged on the right/upper end. 
+ */ +public class GenomeBinIndex { + + private int mBinSize; + private List mSequenceNames; + private int[] mSequenceLengths; + private int[] mBinOffsets; + + public GenomeBinIndex(GenomeBaseIndex gbi, int binSize) { + if (binSize <= 0) { + throw new IllegalArgumentException("Illegal bin size: " + binSize); + } + mBinSize = binSize; + mSequenceNames = new ArrayList(gbi.getSequenceNames()); + int count = mSequenceNames.size(); + mSequenceLengths = new int[count]; + mBinOffsets = new int[count]; + long binOffset = 0; // long to detect overflow + for (int i = 0; i < count; i++) { + int length = gbi.getSequenceLength(mSequenceNames.get(i)); + int binCount = (length + binSize - 1) / binSize; + mSequenceLengths[i] = length; + mBinOffsets[i] = (int) binOffset; + binOffset += binCount; + } + if (binOffset > Integer.MAX_VALUE) { + // Check for integer overflow. + // This will happen, e.g., with the human genome and a bin size of 1. + throw new RuntimeException("Binsize too small: " + binSize); + } + } + + public int getBinSize() { + return mBinSize; + } + + public int getBinIndex(String seqName, int position) { + int index = getSequenceIndex(seqName); + if (index < 0) { + return -1; + } + if (position > mSequenceLengths[index]) { + return -1; + } + if (position < 1) { + position = mSequenceLengths[index]; + } + int bin = (position - 1) / mBinSize; + return (mBinOffsets[index] + bin); + } + + public String getSequenceName(int binIndex) { + int index = getSequenceIndex(binIndex); + if (index < 0) { + return null; + } + return mSequenceNames.get(index); + } + + public int getStartPosition(int binIndex) { + int index = getSequenceIndex(binIndex); + if (index < 0) { + return -1; + } + int bin = binIndex - mBinOffsets[index]; + return (bin * mBinSize + 1); + } + + public int getEndPosition(int binIndex) { + int index = getSequenceIndex(binIndex); + if (index < 0) { + return -1; + } + int bin = binIndex - mBinOffsets[index]; + int position = (bin+1) * mBinSize; + position = 
Math.min(position, mSequenceLengths[index]); + return position; + } + + public List getSequenceNames() { + return mSequenceNames; + } + + public int getFirstBin(String seqName) { + return getBinIndex(seqName, 1); + } + + public int getLastBin(String seqName) { + return getBinIndex(seqName, 0); + } + + public int getBinCount() { + if (mBinOffsets.length == 0) { + return 0; + } + int lastIndex = mBinOffsets.length - 1; + int count = mBinOffsets[lastIndex]; + count += (mSequenceLengths[lastIndex] + mBinSize - 1) / mBinSize; + return count; + } + + public int getBinCount(String seqName) { + int index = getSequenceIndex(seqName); + if (index < 0) { + return -1; + } + return ((mSequenceLengths[index] + mBinSize - 1) / mBinSize); + } + + public int getSequenceLength(String seqName) { + int index = getSequenceIndex(seqName); + if (index < 0) { + return 0; + } + return mSequenceLengths[index]; + } + + private int getSequenceIndex(String seqName) { + for (int i = 0; i < mSequenceNames.size(); i++) { + if (mSequenceNames.get(i).equals(seqName)) { + return i; + } + } + return -1; + } + + private int getSequenceIndex(int binIndex) { + if (binIndex < 0) { + return -1; + } + for (int i = 1; i < mBinOffsets.length; i++) { + if (mBinOffsets[i] > binIndex) { + return i-1; + } + } + int lastIndex = mBinOffsets.length-1; + int lastBinIndex = mBinOffsets[lastIndex]; + lastBinIndex += (mSequenceLengths[lastIndex] + mBinSize - 1) / mBinSize; + if (binIndex <= lastBinIndex) { + return lastIndex; + } + return -1; + } +} + diff --git a/lib/edu/mit/broad/cnv/util/SequenceIterator.java b/lib/edu/mit/broad/cnv/util/SequenceIterator.java new file mode 100644 index 0000000000..57bbae7a54 --- /dev/null +++ b/lib/edu/mit/broad/cnv/util/SequenceIterator.java @@ -0,0 +1,145 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. 
+ * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ +package edu.mit.broad.cnv.util; + + +import java.io.*; +import java.util.*; + + +/** + * Utility class for iterating over fasta files. + * Also maintains an unsigned base index over the file set. + */ +public class SequenceIterator +{ + private List mInputFiles = null; + private int mInputFileIndex = 0; + private int mBaseIndex = -1; + private LineNumberReader mCurrentReader = null; + private String mNextSequence = null; + private String mLineBuffer = null; + private int mLineBufferIndex = 0; + + public SequenceIterator(File inputFile) { + mInputFiles = new ArrayList(); + mInputFiles.add(inputFile); + } + + public SequenceIterator(List inputFiles) { + mInputFiles = inputFiles; + } + + public void close() { + if (mCurrentReader != null) { + try { + mCurrentReader.close(); + } catch (IOException exc) { + throw new RuntimeException("Error closing reader: " + exc.getMessage(), + exc); + } + } + mCurrentReader = null; + mInputFiles = null; + mInputFileIndex = 0; + mBaseIndex = -1; + mNextSequence = null; + mLineBuffer = null; + mLineBufferIndex = 0; + } + + public String getNextSequence() + throws IOException { + + while (mNextSequence == null) { + if (mLineBuffer != null) { + incrementBaseIndex(mLineBuffer.length() - mLineBufferIndex); + mLineBuffer = null; + mLineBufferIndex = 0; + } + if (mCurrentReader == null) { + mCurrentReader = getNextReader(); + if (mCurrentReader == null) { + return null; + } + } + String line = mCurrentReader.readLine(); + if (line == null) { + mCurrentReader.close(); + mCurrentReader = null; + continue; + } + if (line.startsWith(">")) { + String[] tokens = line.substring(1).trim().split("\\s+"); + mNextSequence = tokens[0]; + } else { + incrementBaseIndex(line.length()); + } + } + String result = mNextSequence; + mNextSequence = null; + return 
result; + } + + public char getNextBase() + throws IOException { + + if (mLineBuffer == null || mLineBufferIndex >= mLineBuffer.length()) { + if (mCurrentReader == null) { + return 0; + } + if (mNextSequence != null) { + return 0; + } + String line = mCurrentReader.readLine(); + if (line == null) { + mLineBuffer = null; + mLineBufferIndex = 0; + mCurrentReader.close(); + mCurrentReader = null; + return 0; + } + if (line.startsWith(">")) { + String[] tokens = line.substring(1).trim().split("\\s+"); + mNextSequence = tokens[0]; + mLineBuffer = null; + mLineBufferIndex = 0; + return 0; + } + mLineBuffer = line.toUpperCase(); + mLineBufferIndex = 0; + } + char result = mLineBuffer.charAt(mLineBufferIndex++); + incrementBaseIndex(1); + return result; + } + + public int getBaseIndex() { + return mBaseIndex; + } + + private LineNumberReader getNextReader() + throws IOException { + if (mInputFileIndex >= mInputFiles.size()) { + return null; + } + File file = mInputFiles.get(mInputFileIndex++); + return new LineNumberReader(new FileReader(file)); + } + + private void incrementBaseIndex(int amount) { + if (mBaseIndex < -1 && (mBaseIndex + amount) >= -1) { + throw new RuntimeException("Base index: 32-bit overflow"); + } + mBaseIndex += amount; + } +} + diff --git a/lib/edu/mit/broad/dcp/CallStatus.java b/lib/edu/mit/broad/dcp/CallStatus.java new file mode 100644 index 0000000000..e431b27dfe --- /dev/null +++ b/lib/edu/mit/broad/dcp/CallStatus.java @@ -0,0 +1,18 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2007 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+ */ +package edu.mit.broad.dcp; + +public enum CallStatus +{ + PENDING, + PROCESSING +} + + diff --git a/lib/edu/mit/broad/dcp/CommandRunner.java b/lib/edu/mit/broad/dcp/CommandRunner.java new file mode 100644 index 0000000000..b93b310dd6 --- /dev/null +++ b/lib/edu/mit/broad/dcp/CommandRunner.java @@ -0,0 +1,309 @@ +/** + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2006 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + */ +package edu.mit.broad.dcp; + +import java.io.*; + + +/** + * Utility class to run system commands synchronously and return the output. + * + * The interface supports the typical case where you want to return a modest + * amount of information from the command's standard output or standard error + * as a string. The caller can override this behavior, however, and provide + * alternative output destinations if necessary. + * + * If setMergeOutput() is true, then this class will attempt to interleave + * the standard output and standard error streams of the command into one + * stream (standard output). This may not produce exactly the same results + * as having the operating system interleave the output, but works well for + * simple executables that do not heavily intermix stdout and stderr. + * + * A typical invocation is: + *
+ *  CommandRunner runner = new CommandRunner();
+ *  int status = runner.runCommand("ls");
+ *  if (status == 0) {
+ *      System.out.print(runner.getStandardOutputString());
+ *  }
+ * 
+ * + * @author Bob Handsaker + */ +public class CommandRunner { + + private boolean mMergeOutput = false; + private Writer mStandardOutputDestination = null; + private Writer mStandardErrorDestination = null; + private String mStandardOutputString = null; + private String mStandardErrorString = null; + + + /** + * Default constructor. + */ + public CommandRunner() { + } + + /** + * Get the standard output from the last command as a string. + * + * If no command has been run or an explicit output destination + * was set, then this method returns null. + */ + public String getStandardOutputString() { + return mStandardOutputString; + } + + /** + * Get the standard error from the last command as a string. + * + * If no command has been run or an explicit output destination + * was set, then this method returns null. + */ + public String getStandardErrorString() { + return mStandardErrorString; + } + + /** + * If true, the command's standard error stream will be interleaved + * with the command's standard output stream. The standard error + * stream destination will not be used. + */ + public boolean getMergeOutput() { + return mMergeOutput; + } + + /** + * If true, the command's standard error stream will be interleaved + * with the command's standard output stream. + */ + public void setMergeOutput(boolean value) { + mMergeOutput = value; + } + + /** + * The destination for the command's standard output stream. + * If null, the standard output will be captured in a string. + */ + public Writer getStandardOutputDestination() { + return mStandardOutputDestination; + } + + /** + * The destination for the command's standard output stream. + * If set to null, the standard output will be captured in a string. + */ + public void setStandardOutputDestination(Writer writer) { + mStandardOutputDestination = writer; + } + + /** + * The destination for the command's standard error stream. + * If null, the standard error will be captured in a string. 
+ */ + public Writer getStandardErrorDestination() { + return mStandardErrorDestination; + } + + /** + * The destination for the command's standard error stream. + * If set to null, the standard error will be captured in a string. + */ + public void setStandardErrorDestination(Writer writer) { + mStandardErrorDestination = writer; + } + + /** + * Run a command string as a system command. + * + * Returns the exit status of the command. + * + * When this method is called, the standard output string + * and standard error string are updated if no alternative output + * destinations have been set. + * + * This method throws a RuntimeException if running the command fails + * (for example, if there are not enough system resources to spawn + * the process). + * + * @param commmand The command string to run. + * @return Command exit status. + * @throws RuntimeException If command execution fails. + */ + public int runCommand(String command) + throws RuntimeException { + return runCommand(command.split(" "), null, null); + } + + /** + * Run a command string as a system command. + * + * Returns the exit status of the command. + * + * When this method is called, the standard output string + * and standard error string are updated if no alternative output + * destinations have been set. + * + * This method throws a RuntimeException if running the command fails + * (for example, if there are not enough system resources to spawn + * the process). + * + * @param commmand The command string to run. + * @param environment The command environment (or null to inherit). + * @param workingDirectory The working directory (or null to inherit). + * @return Command exit status. + * @throws RuntimeException If command execution fails. + */ + public int runCommand(String command, String[] environment, File workingDirectory) + throws RuntimeException { + return runCommand(command.split(" "), environment, workingDirectory); + } + + /** + * Run a command string as a system command. 
+ * + * Returns the exit status of the command. + * + * When this method is called, the standard output string + * and standard error string are updated if no alternative output + * destinations have been set. + * + * This method throws a RuntimeException if running the command fails + * (for example, if there are not enough system resources to spawn + * the process). + * + * @param commmand The command to run (as a array of arguments). + * @param environment The command environment (or null to inherit). + * @param workingDirectory The working directory (or null to inherit). + * @return Command exit status. + * @throws RuntimeException If command execution fails. + */ + public int runCommand(String[] command, String[] environment, File workingDirectory) + throws RuntimeException { + + Writer stdout = mStandardOutputDestination; + Writer stderr = mStandardErrorDestination; + if (stdout == null) { + stdout = new StringWriter(); + } + if (mMergeOutput) { + stderr = stdout; + } else if (stderr == null) { + stderr = new StringWriter(); + } + + mStandardOutputString = null; + mStandardErrorString = null; + + int commandStatus = 0; + try { + Process process = + Runtime.getRuntime().exec(command, environment, workingDirectory); + StreamHandler stdoutHandler = + new StreamHandler(process.getInputStream(), stdout); + StreamHandler stderrHandler = + new StreamHandler(process.getErrorStream(), stderr); + + commandStatus = process.waitFor(); + + // Wait for the streams to drain. 
+ stdoutHandler.join(); + stderrHandler.join(); + } catch (Exception exc) { + throw new RuntimeException("Command execution failed: " + + exc.getMessage(), + exc); + } + + if (mStandardOutputDestination == null) { + mStandardOutputString = stdout.toString(); + } + if (mStandardErrorDestination == null && !mMergeOutput) { + mStandardErrorString = stderr.toString(); + } + + return commandStatus; + } + + + /** + * Internal class to asynchronously read from the standard output + * and standard error streams of the command being executed. + * + * If you do not handle command output asynchronously, then execution + * of a command may block in some environments if the program produces + * too much output. In this case, the call to run the process will + * never complete. + */ + private static class StreamHandler extends Thread { + + /** + * Constructor. + * Create an instance of this class, which is an asynchronous + * thread that will consume input from the given input stream + * and send the output to the given output destination. + * + * @param input The input stream to read. + * @param output The output destination. + */ + StreamHandler(InputStream input, Writer output) { + m_input = input; + m_output = output; + start(); + } + + + /** + * Standard thread run method. + * Pipe input from the input source to the output destination + * until there is no more input left. + * + * If an IOException occurs, the thread will make sure all + * available output has been flushed to the destination and + * then terminate. The IOException is not propagated. 
+ */ + public void run() { + + char[] buffer = new char[4096]; + Reader reader = + new InputStreamReader(new BufferedInputStream(m_input)); + Writer writer = m_output; + + try { + while (true) { + int count = reader.read(buffer); + if (count <= 0) { + break; + } + if (writer != null) { + synchronized (writer) { + writer.write(buffer, 0, count); + } + } + } + } catch (IOException ignore) { + // Ignore IO exceptions + } finally { + try { + reader.close(); + } catch (Exception ignore) { + } + try { + m_output.flush(); + } catch (Exception ignore) { + } + } + } + + private InputStream m_input; + private Writer m_output; + } +} diff --git a/lib/edu/mit/broad/dcp/DistributedAlgorithm.java b/lib/edu/mit/broad/dcp/DistributedAlgorithm.java new file mode 100644 index 0000000000..a223c03264 --- /dev/null +++ b/lib/edu/mit/broad/dcp/DistributedAlgorithm.java @@ -0,0 +1,618 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2007 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + */ +package edu.mit.broad.dcp; + +import edu.mit.broad.dcp.message.*; + +import java.io.*; +import java.util.*; +import java.lang.reflect.Method; +import java.net.InetAddress; +import java.net.ServerSocket; +import java.rmi.registry.*; + +/** + * Experimental. 
+ */ +public abstract class DistributedAlgorithm + implements Serializable +{ + public static final Integer ANY = 0; + public static final Integer MASTER = 1; + + public DistributedAlgorithm() { + } + + public String getServerHost() { + return mServerHost; + } + + public void setServerHost(String value) { + mServerHost = value; + } + + public int getServerPort() { + return mServerPort; + } + + public void setServerPort(int value) { + mServerPort = value; + } + + public String getAlgorithmName() { + if (mAlgorithmName != null) { + return mAlgorithmName; + } else { + return getClassName(); + } + } + + public void setAlgorithmName(String value) { + mAlgorithmName = value; + } + + public int getMaximumWorkerCount() { + return mMaximumWorkerCount; + } + + public void setMaximumWorkerCount(int value) { + mMaximumWorkerCount = value; + } + + /** + * Name of LSF queue to use for workers. + */ + public String getLsfQueue() { + return mLsfQueue; + } + + public void setLsfQueue(String value) { + mLsfQueue = value; + } + + /** + * Directory to hold lsf log files. + */ + public String getLsfLogDirectory() { + return mLsfLogDirectory; + } + + public void setLsfLogDirectory(String value) { + mLsfLogDirectory = value; + } + + public boolean getEnableGcLogging() { + return mEnableGcLogging; + } + + public void setEnableGcLogging(boolean value) { + mEnableGcLogging = value; + } + + public Integer getWorkerId() { + return mWorkerId; + } + + public Integer getProcessId() { + return mProcessId; + } + + protected void init() + throws Exception { + } + + protected abstract void start() + throws Exception; + + public void run() + throws Exception { + + if (mIsRunning) { + throw new IllegalStateException("Algorithm is already running"); + } + + mIsRunning = true; + mWorkerId = MASTER; + mProcessId = MASTER; + + try { + startDistributedServer(); + init(); + startWorkerThread(); + startWorkers(); + start(); + waitForCompletion(); + } finally { + // TBD: More cleanup (shutdown threads, etc.) 
+ stopDistributedServer(); + mIsRunning = false; + } + } + + void runWorker(int workerId, int processId) + throws Exception { + + if (mIsRunning) { + throw new IllegalStateException("Algorithm is already running"); + } + + mIsRunning = true; + mWorkerId = workerId; + mProcessId = processId; + + try { + if (openDistributedServer() == null) { + report("Server " + mServerHost + ":" + mServerPort + " not responding"); + return; + } + init(); + startWorkerThread(); + mWorkerThread.join(); + } finally { + closeDistributedServer(); + mIsRunning = false; + } + } + + private void startWorkers() { + int workerCount = getMaximumWorkerCount(); + if (workerCount <= 0) { + // Use single process execution for testing/debugging. + new InProcessWorker().start(); + return; + } + if (workerCount > 1000) { + throw new RuntimeException("Excessive worker count: " + workerCount); + } + for (int i = 0; i < workerCount; i++) { + Integer workerId = (MASTER + i + 1); + Integer processId = workerId; // for now + startWorker(workerId, processId); + } + } + + private void startDistributedServer() { + try { + // Create a server socket to allocate a unique port. + // There is a window of vulnerability where the port + // can get reused, but in practice this works ok. 
+ String serverHost = getCurrentHost(); + ServerSocket socket = new ServerSocket(0); + int serverPort = socket.getLocalPort(); + socket.close(); + Registry registry = LocateRegistry.createRegistry(serverPort); + DistributedCallServer server = new DistributedCallServer(); + server.setAlgorithm(this); + registry.bind("DistributedCallService", server); + mServerHost = serverHost; + mServerPort = serverPort; + mDistributedCallServer = server; + mDistributedCallService = server; + } catch (Exception exc) { + throw wrapException(exc); + } + } + + private void stopDistributedServer() { + if (mDistributedCallServer != null) { + try { + Registry registry = LocateRegistry.getRegistry(mServerPort); + registry.unbind("DistributedCallService"); + mDistributedCallServer.stop(); + } catch (Exception exc) { + throw wrapException(exc); + } + } + mDistributedCallService = null; + mDistributedCallServer = null; + } + + private DistributedCallService openDistributedServer() { + mDistributedCallService = null; + try { + String url = "rmi://" + getServerHost() + ":" + getServerPort() + "/DistributedCallService"; + DistributedCallService server = + (DistributedCallService) java.rmi.Naming.lookup(url); + mDistributedCallService = server; + } catch (java.rmi.NotBoundException exc) { + // Server has exited + } catch (Exception exc) { + throw wrapException(exc); + } + return mDistributedCallService; + } + + private void closeDistributedServer() { + mDistributedCallService = null; + } + + private void startWorker(Integer workerId, Integer processId) { + + String logFile = "worker_" + processId + "_%J.bsub"; + if (mLsfLogDirectory != null) { + logFile = mLsfLogDirectory + "/" + logFile; + } + + List command = new ArrayList(); + command.add("bsub"); + command.add("-o"); + command.add(logFile); + if (mLsfQueue != null) { + command.add("-q"); + command.add(mLsfQueue); + } + command.add("runDistributedWorker"); + command.add("-serverHost"); + command.add(getServerHost()); + 
command.add("-serverPort"); + command.add(Integer.toString(getServerPort())); + command.add("-workerId"); + command.add(Integer.toString(workerId)); + command.add("-processId"); + command.add(Integer.toString(processId)); + + // Pass our -Xmx setting along to all workers. + Map environment = + new LinkedHashMap(System.getenv()); + long maxMemory = Runtime.getRuntime().maxMemory(); + long maxKbytes = maxMemory / 1024; + String memJavaOpt = "-Xmx" + maxKbytes + "K"; + + // Enable GC logging if requested + String gcJavaOpt = null; + if (mEnableGcLogging) { + String gcLogFile = "worker_" + processId + ".gc.log"; + if (mLsfLogDirectory != null) { + gcLogFile = mLsfLogDirectory + "/" + gcLogFile; + } + gcJavaOpt = "-Xloggc:" + gcLogFile; + } + + String javaOpts = environment.get("JAVAOPTS"); + if (javaOpts == null) { + javaOpts = memJavaOpt; + if (gcJavaOpt != null) { + javaOpts = javaOpts + " " + gcJavaOpt; + } + environment.put("JAVAOPTS", javaOpts); + } + + // Log output ourselves (rather than waiting for bsub). 
+ String workerLogFile = "worker_" + processId + ".log"; + if (mLsfLogDirectory != null) { + workerLogFile = mLsfLogDirectory + "/" + workerLogFile; + } + environment.put("DA_LOG_FILE", workerLogFile); + + CommandRunner runner = new CommandRunner(); + Writer output = new LsfOutputFilter(); + runner.setStandardOutputDestination(output); + runner.setStandardErrorDestination(output); + String[] commandArray = command.toArray(new String[command.size()]); + String[] environmentArray = createEnvironmentArray(environment); + int status = runner.runCommand(commandArray, environmentArray, null); + if (status != 0) { + throw new RuntimeException("Error starting worker: " + status); + } + } + + private String[] createEnvironmentArray(Map map) { + if (map == null) { + return null; + } + int index = 0; + String[] array = new String[map.size()]; + for (Map.Entry entry : map.entrySet()) { + array[index++] = entry.getKey() + "=" + entry.getValue(); + } + return array; + } + + private String getCurrentHost() { + try { + return InetAddress.getLocalHost().getCanonicalHostName(); + } catch (Exception exc) { + throw wrapException(exc); + } + } + + private void waitForCompletion() { + DistributedCallServer server = mDistributedCallServer; + while (true) { + if (server.isQueueEmpty()) { + break; + } + try { + Thread.sleep(1000); + } catch (InterruptedException exc) { + // ignore + } + } + } + + protected void callDistributed(String methodName, Object... methodArgs) { + callDistributed(null, methodName, methodArgs); + } + + protected void callDistributed(Integer workerId, String methodName, Object... 
methodArgs) { + if (workerId == null) { + workerId = ANY; + } + try { + DistributedCallMessage message = new DistributedCallMessage(); + message.setSenderWorkerId(getWorkerId()); + message.setSenderProcessId(getProcessId()); + message.setReceiverWorkerId(workerId); + message.setMethodName(methodName); + message.setMethodArgs(methodArgs); + mDistributedCallService.writeMessage(message); + } catch (Throwable exc) { + throw wrapException(exc); + } + } + + private void callMethod(String methodName, Object[] methodArgs) { + try { + Object target = this; + Class targetClass = target.getClass(); + Method targetMethod = findMethod(targetClass, methodName); + if (targetMethod == null) { + throw new RuntimeException("Cannot find target method: " + methodName); + } + targetMethod.invoke(target, methodArgs); + } catch (Throwable exc) { + throw wrapException(exc); + } + } + + private Method findMethod(Class clazz, String methodName) throws Exception { + Method result = null; + Method[] methods = clazz.getDeclaredMethods(); + for (int i = 0; i < methods.length; i++) { + if (methods[i].getName().equals(methodName)) { + if (result != null) { + throw new RuntimeException("Duplicate method name: " + methodName); + } + result = methods[i]; + } + } + return result; + } + + private RuntimeException wrapException(Throwable exception) { + if (exception instanceof RuntimeException) { + return (RuntimeException) exception; + } else { + return new RuntimeException(exception.getMessage(), exception); + } + } + + private void startWorkerThread() { + if (mWorkerThread != null) { + throw new IllegalStateException("WorkerThread is running"); + } + mWorkerThread = new WorkerThread(); + mWorkerThread.start(); + } + + private void stopWorkerThread() { + if (mWorkerThread == null) { + throw new IllegalStateException("WorkerThread is running"); + } + mWorkerThread.stopThread(); + } + + private class WorkerThread extends Thread { + + WorkerThread() { + setDaemon(true); + } + + public void run() { + 
try { + DistributedCallService service = mDistributedCallService; + while (true) { + if (isInterrupted()) { + System.out.println("#DBG: Worker isInterrupted"); + throw new InterruptedException(); + } + DistributedCallMessage message = + service.acceptMessage(getWorkerId(), getProcessId()); + if (message == null) { + Thread.sleep(1000); + } else { + processMessage(message); + } + } + } catch (InterruptedException exc) { + // Interruption terminates this thread. + // System.out.println("#DBG: Worker caught InterruptedException"); + } catch (Throwable exc) { + if (isDisconnectException(exc)) { + report("Server disconnected"); + } else { + reportError("Exception in WorkerThread: " + exc.getMessage(), exc); + System.exit(1); + } + } + report("WorkerThread terminated"); + } + + void stopThread() { + // System.out.println("#DBG: About to interrupt worker..."); + interrupt(); + // System.out.println("#DBG: Joining worker..."); + try { + join(); + } catch (InterruptedException exc) { + // ignore + } + } + + private boolean isDisconnectException(Throwable exc) { + if (exc instanceof java.rmi.ConnectException) { + return true; + } else if (exc instanceof java.rmi.NoSuchObjectException) { + return true; + } else if (exc instanceof java.rmi.UnmarshalException && + exc.getCause() != null && + exc.getCause() instanceof EOFException) { + return true; + } else { + return false; + } + } + } + + private void processMessage(DistributedCallMessage message) { + try { + Integer workerId = message.getReceiverWorkerId(); + if (workerId == null || !workerId.equals(getWorkerId())) { + reportError("Invalid worker ID in message: " + message); + return; + } + callMethod(message.getMethodName(), message.getMethodArgs()); + } catch (Throwable exc) { + reportError("Exception running message: " + message, exc); + } finally { + completeMessage(message); + } + } + + private void completeMessage(DistributedCallMessage message) { + try { + DistributedCallService service = mDistributedCallService; + 
service.completeMessage(getWorkerId(), getProcessId(), message.getCallId()); + } catch (Throwable exc) { + reportError("Exception completing message: " + message, exc); + } + } + + protected void report(String message) { + String identity = + getAlgorithmName() + " " + + getWorkerId() + "/" + getProcessId(); + System.out.println("# " + identity + " : " + message); + } + + protected void reportError(String message) { + reportError(message, null); + } + + protected void reportError(String message, Throwable exception) { + String identity = + getAlgorithmName() + " " + + getWorkerId() + "/" + getProcessId(); + System.out.println("Error" + + " [" + identity + "]" + + ": " + message); + if (exception != null) { + System.out.println(" with exception: " + exception.getMessage()); + exception.printStackTrace(System.out); + } + } + + private String getClassName() { + String name = getClass().getName(); + return name.substring(name.lastIndexOf('.')+1); + } + + public String toString() { + StringBuilder builder = new StringBuilder(); + builder.append("DistributedAlgorithm"); + builder.append("("); + builder.append("" + getAlgorithmName()); + builder.append(","); + builder.append("" + getWorkerId()); + builder.append(","); + builder.append("" + getProcessId()); + builder.append(","); + builder.append("" + getMaximumWorkerCount()); + builder.append(","); + builder.append("" + getLsfQueue()); + builder.append(","); + builder.append("" + mIsRunning); + builder.append(")"); + return builder.toString(); + } + + // This class is used only during in-process execution/testing/debugging. 
+ private class InProcessWorker extends Thread { + + InProcessWorker() { + setDaemon(true); + } + + public void run() { + report("InProcessWorker starting"); + try { + String serverAddress = getServerHost() + ":" + getServerPort(); + String url = "rmi://" + serverAddress + "/DistributedCallService"; + DistributedCallService server = + (DistributedCallService) java.rmi.Naming.lookup(url); + DistributedAlgorithm algorithm = server.getAlgorithm(); + algorithm.setServerHost(getServerHost()); + algorithm.setServerPort(getServerPort()); + algorithm.runWorker(2, 1); + } catch (Throwable exc) { + reportError("Exception in InProcessWorker: " + exc.getMessage(), exc); + System.exit(1); + } + report("InProcessWorker terminated"); + } + } + + private static class LsfOutputFilter + extends FilterWriter { + + LsfOutputFilter() { + super(new PrintWriter(System.out, true)); + } + + public void write(int ch) + throws IOException { + if (mAtLineStart) { + out.write("# "); + mAtLineStart = false; + } + out.write(ch); + mAtLineStart = (ch == '\n'); + } + + public void write(String s, int off, int len) + throws IOException { + write(s.toCharArray(), off, len); + } + + public void write(char[] a, int off, int len) + throws IOException { + for (int i = 0; i < len; i++) { + write(a[off+i]); + } + } + + private boolean mAtLineStart = true; + } + + + private transient int mMaximumWorkerCount = 0; + private transient String mLsfQueue = null; + private transient String mLsfLogDirectory = null; + private transient boolean mEnableGcLogging = false; + private transient boolean mIsRunning = false; + private transient int mWorkerId = 0; + private transient int mProcessId = 0; + private transient WorkerThread mWorkerThread = null; + private transient String mAlgorithmName = null; + private transient String mServerHost = null; + private transient int mServerPort = 0; + private transient DistributedCallService mDistributedCallService = null; + private transient DistributedCallServer 
mDistributedCallServer = null; +} diff --git a/lib/edu/mit/broad/dcp/DistributedAlgorithmWorker.java b/lib/edu/mit/broad/dcp/DistributedAlgorithmWorker.java new file mode 100644 index 0000000000..dcee13eb80 --- /dev/null +++ b/lib/edu/mit/broad/dcp/DistributedAlgorithmWorker.java @@ -0,0 +1,134 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2007 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + */ +package edu.mit.broad.dcp; + +import java.util.*; + +/** + * Command line driver for distributed worker invocation. + */ +public class DistributedAlgorithmWorker +{ + public static void main(String[] args) + throws Exception { + new DistributedAlgorithmWorker().run(args); + } + + private void run(String[] args) + throws Exception { + + if (!parseArguments(args)) { + System.exit(1); + } + System.out.println("# DistributedAlgorithmWorker"); + System.out.println("# Started at " + new Date()); + runDistributedWorker(); + System.out.println("# Ended at " + new Date()); + } + + private boolean parseArguments(String[] args) { + + int argpos = 0; + int argsleft = 0; + + while (argpos < args.length) { + argsleft = args.length - argpos; + String arg = args[argpos]; + if (arg.equals("-serverHost") && argsleft > 1) { + argpos++; + mServerHost = args[argpos++]; + } else if (arg.equals("-serverPort") && argsleft > 1) { + argpos++; + mServerPort = Integer.parseInt(args[argpos++]); + } else if (arg.equals("-workerId") && argsleft > 1) { + argpos++; + mWorkerId = new Integer(args[argpos++]); + } else if (arg.equals("-processId") && argsleft > 1) { + argpos++; + mProcessId = new Integer(args[argpos++]); + } else if (arg.equals("-debug")) { + argpos++; + mDebug = true; + continue; + } 
else if (arg.equals("-verbose")) { + argpos++; + mVerbose = true; + continue; + } else if (arg.startsWith("-")) { + usage(); + return false; + } else { + break; + } + } + + argsleft = args.length - argpos; + if (argsleft != 0) { + usage(); + return false; + } + + return true; + } + + private void usage() { + System.out.println("Usage: DistributedWorkerMain ..."); + System.out.println(" -serverHost "); + System.out.println(" -serverPort "); + System.out.println(" -workerId "); + System.out.println(" -processId "); + System.out.println(" -verbose"); + System.out.println(" -debug"); + } + + private void runDistributedWorker() + throws Exception { + + DistributedAlgorithm algorithm = null; + String serverAddress = getServerHost() + ":" + getServerPort(); + try { + String url = "rmi://" + serverAddress + "/DistributedCallService"; + DistributedCallService server = + (DistributedCallService) java.rmi.Naming.lookup(url); + algorithm = server.getAlgorithm(); + } catch (java.rmi.ConnectException exc) { + System.out.println("# Server " + serverAddress + " not responding."); + return; + } + + algorithm.setServerHost(getServerHost()); + algorithm.setServerPort(getServerPort()); + algorithm.runWorker(getWorkerId(), getProcessId()); + } + + private Integer getWorkerId() { + return mWorkerId; + } + + private Integer getProcessId() { + return mProcessId; + } + + private String getServerHost() { + return mServerHost; + } + + private int getServerPort() { + return mServerPort; + } + + + private boolean mDebug = false; + private boolean mVerbose = false; + private String mServerHost = null; + private int mServerPort = 0; + private Integer mWorkerId = null; + private Integer mProcessId = null; +} diff --git a/lib/edu/mit/broad/dcp/DistributedCallServer.java b/lib/edu/mit/broad/dcp/DistributedCallServer.java new file mode 100644 index 0000000000..995eff5717 --- /dev/null +++ b/lib/edu/mit/broad/dcp/DistributedCallServer.java @@ -0,0 +1,133 @@ +/* + * The Broad Institute + * SOFTWARE 
COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + */ +package edu.mit.broad.dcp; + + +import edu.mit.broad.dcp.message.*; + +import java.rmi.server.UnicastRemoteObject; +import java.util.*; + +public class DistributedCallServer + extends UnicastRemoteObject + implements DistributedCallService +{ + public DistributedCallServer() + throws java.rmi.RemoteException { + } + + public void setAlgorithm(DistributedAlgorithm algorithm) { + mAlgorithm = algorithm; + } + + public DistributedAlgorithm getAlgorithm() { + return mAlgorithm; + } + + public long writeMessage(DistributedCallMessage message) { + message.setCallStatus(CallStatus.PENDING); + message.setCallId(generateCallId()); + if (message.getReceiverWorkerId().equals(0)) { + synchronized (mMessageQueue) { + mMessageQueue.addLast(message); + } + } else { + synchronized (mMessageQueue) { + mMessageQueue.addFirst(message); + } + } + return message.getCallId(); + } + + public DistributedCallMessage acceptMessage(int workerId, int processId) { + if (workerId <= 0) { + throw new IllegalArgumentException("Invalid worker ID: " + workerId); + } + if (processId <= 0) { + throw new IllegalArgumentException("Invalid process ID: " + processId); + } + synchronized (mMessageQueue) { + Iterator iterator = mMessageQueue.iterator(); + while (iterator.hasNext()) { + DistributedCallMessage message = iterator.next(); + if (message.getCallStatus() != CallStatus.PENDING) { + continue; + } + int receiverId = message.getReceiverWorkerId(); + if (receiverId == workerId || + (receiverId == 0 && workerId > 1)) { + message.setCallStatus(CallStatus.PROCESSING); + message.setReceiverWorkerId(workerId); + 
message.setReceiverProcessId(processId); + return message; + } + } + } + + return null; + } + + public void completeMessage(int workerId, int processId, long callId) { + if (workerId <= 0) { + throw new IllegalArgumentException("Invalid worker ID: " + workerId); + } + if (processId <= 0) { + throw new IllegalArgumentException("Invalid process ID: " + processId); + } + if (callId <= 0) { + throw new IllegalArgumentException("Invalid call ID: " + callId); + } + synchronized (mMessageQueue) { + Iterator iterator = mMessageQueue.iterator(); + while (iterator.hasNext()) { + DistributedCallMessage message = iterator.next(); + if (message.getCallId().longValue() == callId) { + if (message.getCallStatus() != CallStatus.PROCESSING) { + throw new IllegalStateException("Call #" + callId + " not in state PROCESSING"); + } + if (!message.getReceiverWorkerId().equals(workerId)) { + throw new IllegalStateException("Call #" + callId + " assigned to worker " + message.getReceiverWorkerId() + " not worker " + workerId); + } + if (!message.getReceiverProcessId().equals(processId)) { + throw new IllegalStateException("Call #" + callId + " assigned to process " + message.getReceiverProcessId() + " not process " + processId); + } + iterator.remove(); + return; + } + } + } + + throw new IllegalArgumentException("Unrecognized call ID " + callId); + } + + public boolean isQueueEmpty() { + synchronized (mMessageQueue) { + return mMessageQueue.isEmpty(); + } + } + + public void stop() { + try { + UnicastRemoteObject.unexportObject(this, false); + } catch (java.rmi.NoSuchObjectException exc) { + throw new RuntimeException("Exception unexporting object: " + exc.getMessage(), + exc); + } + } + + private synchronized long generateCallId() { + return ++mCallIdGenerator; + } + + private long mCallIdGenerator = 0; + private DistributedAlgorithm mAlgorithm = null; + private LinkedList mMessageQueue = + new LinkedList(); +} diff --git a/lib/edu/mit/broad/dcp/DistributedCallService.java 
b/lib/edu/mit/broad/dcp/DistributedCallService.java new file mode 100644 index 0000000000..202b25f42c --- /dev/null +++ b/lib/edu/mit/broad/dcp/DistributedCallService.java @@ -0,0 +1,25 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + */ +package edu.mit.broad.dcp; + +import edu.mit.broad.dcp.message.*; + +public interface DistributedCallService + extends java.rmi.Remote +{ + public DistributedAlgorithm getAlgorithm() + throws java.rmi.RemoteException; + public long writeMessage(DistributedCallMessage message) + throws java.rmi.RemoteException; + public DistributedCallMessage acceptMessage(int workerId, int processId) + throws java.rmi.RemoteException; + public void completeMessage(int workerId, int processId, long callId) + throws java.rmi.RemoteException; +} diff --git a/lib/edu/mit/broad/dcp/message/DistributedCallMessage.java b/lib/edu/mit/broad/dcp/message/DistributedCallMessage.java new file mode 100644 index 0000000000..1b0fa0a4d3 --- /dev/null +++ b/lib/edu/mit/broad/dcp/message/DistributedCallMessage.java @@ -0,0 +1,90 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2007 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+ */ +package edu.mit.broad.dcp.message; + +import edu.mit.broad.dcp.CallStatus; + +public class DistributedCallMessage + extends DistributedMessage +{ + public DistributedCallMessage() { + } + + public Long getCallId() { + return mCallId; + } + + public void setCallId(Long value) { + mCallId = value; + } + + public CallStatus getCallStatus() { + return mCallStatus; + } + + public void setCallStatus(CallStatus value) { + mCallStatus = value; + } + + public String getMethodName() { + return mMethodName; + } + + public void setMethodName(String value) { + mMethodName = value; + } + + public Object[] getMethodArgs() { + return mMethodArgs; + } + + public void setMethodArgs(Object[] value) { + mMethodArgs = value; + } + + public String toString() { + StringBuilder builder = new StringBuilder(); + builder.append("DistributedCallMessage"); + builder.append("("); + builder.append("" + getSenderWorkerId()); + builder.append(","); + builder.append("" + getSenderProcessId()); + builder.append(","); + builder.append("" + getReceiverWorkerId()); + builder.append(","); + builder.append("" + getReceiverProcessId()); + builder.append(","); + builder.append("" + mCallId); + builder.append(","); + builder.append("" + mCallStatus); + builder.append(","); + builder.append("" + mMethodName); + builder.append(","); + if (mMethodArgs == null) { + builder.append("" + mMethodArgs); + } else { + builder.append("["); + for (int i = 0; i < mMethodArgs.length; i++) { + if (i > 0) { + builder.append(","); + } + builder.append("" + mMethodArgs[i]); + } + builder.append("]"); + } + builder.append(")"); + return builder.toString(); + } + + public Long mCallId; + public CallStatus mCallStatus; + public String mMethodName; + public Object[] mMethodArgs; +} diff --git a/lib/edu/mit/broad/dcp/message/DistributedMessage.java b/lib/edu/mit/broad/dcp/message/DistributedMessage.java new file mode 100644 index 0000000000..a5e837a69d --- /dev/null +++ b/lib/edu/mit/broad/dcp/message/DistributedMessage.java 
@@ -0,0 +1,54 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2007 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + */ +package edu.mit.broad.dcp.message; + + +public class DistributedMessage +{ + public DistributedMessage() { + } + + public Integer getSenderWorkerId() { + return mSenderWorkerId; + } + + public void setSenderWorkerId(Integer value) { + mSenderWorkerId = value; + } + + public Integer getSenderProcessId() { + return mSenderProcessId; + } + + public void setSenderProcessId(Integer value) { + mSenderProcessId = value; + } + + public Integer getReceiverWorkerId() { + return mReceiverWorkerId; + } + + public void setReceiverWorkerId(Integer value) { + mReceiverWorkerId = value; + } + + public Integer getReceiverProcessId() { + return mReceiverProcessId; + } + + public void setReceiverProcessId(Integer value) { + mReceiverProcessId = value; + } + + public Integer mSenderWorkerId; + public Integer mSenderProcessId; + public Integer mReceiverWorkerId; + public Integer mReceiverProcessId; +} diff --git a/lib/edu/mit/broad/picard/PicardException.java b/lib/edu/mit/broad/picard/PicardException.java new file mode 100644 index 0000000000..4e36ba6484 --- /dev/null +++ b/lib/edu/mit/broad/picard/PicardException.java @@ -0,0 +1,27 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ +package edu.mit.broad.picard; + +/** + * Basic Picard runtime exception that, for now, does nothing much + * + * @author Kathleen Tibbetts + */ +public class PicardException extends RuntimeException +{ + public PicardException(String message) { + super(message); + } + + public PicardException(String message, Throwable throwable) { + super(message, throwable); + } + +} diff --git a/lib/edu/mit/broad/picard/aligner/AbstractBaseAligner.java b/lib/edu/mit/broad/picard/aligner/AbstractBaseAligner.java new file mode 100644 index 0000000000..54f0ab9aa4 --- /dev/null +++ b/lib/edu/mit/broad/picard/aligner/AbstractBaseAligner.java @@ -0,0 +1,97 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.picard.aligner; + +import edu.mit.broad.picard.io.IoUtil; + +import java.io.File; +import java.io.IOException; +import java.util.Map; +import java.util.List; + +/** + * Abstract base class for use by Aligner implementations. Provides a constructor and + * accessors for common inputs and outputs. 
+ * + * @author Kathleen Tibbetts + */ +public abstract class AbstractBaseAligner implements Aligner { + + private final Stringency stringency; // The stringency of the alignment + private final File readsBamFile; // The BAM file containing the read data + private final String outputPrefix; // The directory and file name prefix for outputs + private final String referenceFileDir; // The directory where the reference file can be found + private final int clipPoints[]; // The clip points to use + private final Integer expectedInsertSize; // Expected insert size; null for non-paired-end lanes + private final Integer readsToAlign; // The number of reads to align (all if null) + private final boolean pairedReads; // Whether this is a paired-end run + private final int readLength; + // Parameters specific to the Aligner implementation being used + private final Map customParametersMap; + + /** + * Constructor that sets every parameter. + * + * @param stringency the stringency of the alignment + * @param readsBamFile the BAM file containing the reads + * @param outputPrefix the directory and filename prefix for output + * @param referenceFileDir the directory where the reference file is located + * @param clipPoints the clip points + * @param expectedInsertSize the expected insert size (null for non-PE lanes) + * @param readsToAlign the number of reads to align + * @param customParametersMap parameters specific to the Aligner implementation + */ + public AbstractBaseAligner(Stringency stringency, File readsBamFile, String outputPrefix, + String referenceFileDir, int clipPoints[], Integer expectedInsertSize, + Integer readsToAlign, Map customParametersMap, + boolean pairedReads, int readLength) { + + // First, a little validation + if (clipPoints != null && clipPoints.length != 4) { + throw new IllegalArgumentException("Length of clipPoints array argument must be 4."); + } + IoUtil.assertFileIsReadable(readsBamFile); + + this.stringency = stringency; + this.readsBamFile = 
readsBamFile; + this.outputPrefix = outputPrefix; + this.referenceFileDir = referenceFileDir; + this.clipPoints = clipPoints != null ? clipPoints : new int[4]; + this.expectedInsertSize = expectedInsertSize; + this.readsToAlign = readsToAlign; + this.customParametersMap = customParametersMap; + this.pairedReads = pairedReads; + this.readLength = readLength; + } + + /** + * Utility method for deleting a list of files, to be used by the + * cleanup method of sub-classes + * + * @param files the list of files to delete + */ + protected final void deleteFiles(List files) { + for (File f : files) { + f.delete(); + } + } + + // Accessors + protected final Stringency getStringency() { return stringency; } + protected final File getReadsBamFile() { return readsBamFile; } + protected final String getOutputPrefix() { return outputPrefix; } + protected final String getReferenceFileDir() { return referenceFileDir; } + protected final int[] getClipPoints() { return clipPoints; } + protected final Integer getExpectedInsertSize() { return expectedInsertSize; } + protected final Integer getReadsToAlign() { return readsToAlign; } + protected final Map getCustomParametersMap() { return customParametersMap; } + protected final boolean isPairedReads() { return pairedReads; } + protected final int getReadLength() { return readLength; } +} diff --git a/lib/edu/mit/broad/picard/aligner/Aligner.java b/lib/edu/mit/broad/picard/aligner/Aligner.java new file mode 100644 index 0000000000..d0fdf47deb --- /dev/null +++ b/lib/edu/mit/broad/picard/aligner/Aligner.java @@ -0,0 +1,45 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ +package edu.mit.broad.picard.aligner; + +/** + * API for aligners. Clients must call these methods in order, as each depends on + * the previous one, but they may call them multiple times and need not call them all. + * This allows steps to be rerun and also lets the caller review intermediate files + * when troubleshooting. + * + * @author Kathleen Tibbetts + */ +public interface Aligner { + + public static enum Stringency{ low, high }; + + /** + * Prepares all the necessary inputs for the alignment process from a BAM file of read data. + */ + public void prepareInputs(); + + /** + * Does the alignment and produces output in the underlying form of the aligner. + */ + public void align(); + + /** + * Converts the output of the aligner to BAM format + */ + public void prepareOutput(); + + /** + * Cleans up intermediate files (the files created in by and for the underlying aligner by the + * prepareInputs() and align() methods. Does not clean up the original source files or the final BAM file. + */ + public void cleanup(); + +} diff --git a/lib/edu/mit/broad/picard/aligner/maq/BamToBfqWriter.java b/lib/edu/mit/broad/picard/aligner/maq/BamToBfqWriter.java new file mode 100644 index 0000000000..1f3cd55ac8 --- /dev/null +++ b/lib/edu/mit/broad/picard/aligner/maq/BamToBfqWriter.java @@ -0,0 +1,319 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ +package edu.mit.broad.picard.aligner.maq; + +import edu.mit.broad.sam.SAMFileReader; +import edu.mit.broad.sam.SAMRecord; +import edu.mit.broad.sam.util.BinaryCodec; +import edu.mit.broad.picard.io.IoUtil; +import edu.mit.broad.picard.PicardException; +import edu.mit.broad.picard.filter.*; +import edu.mit.broad.picard.util.PeekableIterator; +import edu.mit.broad.picard.util.Log; +import edu.mit.broad.picard.sam.ReservedTagConstants; + +import java.io.File; +import java.util.List; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.Arrays; + +/** + * Class to take unmapped reads in BAM file format and create Maq binary fastq format file(s) -- + * one or two of them, depending on whether it's a paired-end read. This relies on the unmapped + * BAM file having all paired reads together in order. + */ +public class BamToBfqWriter { + + private final File bamFile; + private final String outputPrefix; + private boolean pairedReads = false; + private int wrote = 0; + private int increment = 1; + private int chunk = 0; + private BinaryCodec codec1; + private BinaryCodec codec2; + private final Log log = Log.getInstance(BamToBfqWriter.class); + + /** + * Constructor + * + * @param bamFile the BAM file to read from + * @param outputPrefix the directory and file prefix for the binary fastq files + * @param total the total number of records that should be written, drawn evenly + * from throughout the file (null for all). 
+ * @param chunk the maximum number of records that should be written to any one file + * @param pairedReads whether these reads are from a paired-end run + */ + public BamToBfqWriter(File bamFile, String outputPrefix, Integer total, Integer chunk, boolean pairedReads) { + this.bamFile = bamFile; + this.outputPrefix = outputPrefix; + this.pairedReads = pairedReads; + if (total != null) { + double writeable = (double)countWritableRecords(); + this.increment = (int)Math.floor(writeable/total.doubleValue()); + } + if (chunk != null) { + this.chunk = chunk; + } + } + + /** + * Constructor + * + * @param bamFile the BAM file to read from + * @param outputPrefix the directory and file prefix for the binary fastq files + * @param pairedReads whether these reads are from a paired-end run + */ + public BamToBfqWriter(File bamFile, String outputPrefix, boolean pairedReads) { + this(bamFile, outputPrefix, null, null, pairedReads); + } + + /** + * Writes the binary fastq file(s) to the output directory + */ + public void writeBfqFiles() { + + Iterator iterator = (new SAMFileReader(IoUtil.openFileForReading(this.bamFile))).iterator(); + + // Filter out noise reads and reads that fail the quality filter + TagFilter tagFilter = new TagFilter(ReservedTagConstants.XN, 1); + FailsVendorReadQualityFilter qualityFilter = new FailsVendorReadQualityFilter(); + + if (!pairedReads) { + writeSingleEndBfqs(iterator, Arrays.asList(tagFilter, qualityFilter)); + codec1.close(); + } + else { + writePairedEndBfqs(iterator, tagFilter, qualityFilter); + codec1.close(); + codec2.close(); + } + log.info("Wrote " + wrote + " bfq records."); + + } + + /** + * Path for writing bfqs for paired-end reads + * + * @param iterator the iterator with the SAM Records to write + * @param tagFilter the filter for noise reads + * @param qualityFilter the filter for PF reads + */ + private void writePairedEndBfqs(Iterator iterator, TagFilter tagFilter, + FailsVendorReadQualityFilter qualityFilter) { + // Open the
codecs for writing + int fileIndex = 0; + initializeNextBfqFiles(fileIndex++); + + int records = 0; + + while (iterator.hasNext()) { + SAMRecord first = iterator.next(); + if (!iterator.hasNext()) { + throw new PicardException("Mismatched number of records in " + this.bamFile.getAbsolutePath()); + } + SAMRecord second = iterator.next(); + if (!second.getReadName().equals(first.getReadName()) || + first.getFirstOfPairFlag() == second.getFirstOfPairFlag()) { + throw new PicardException("Unmatched read pairs in " + this.bamFile.getAbsolutePath() + + ": " + first.getReadName() + ", " + second.getReadName() + "."); + } + + // If both are noise reads, filter them out + if (tagFilter.filterOut(first) && tagFilter.filterOut(second)) { + // skip it + } + // If either fails to pass filter, then exclude them as well + else if (qualityFilter.filterOut(first) || qualityFilter.filterOut(second)) { + // skip it + } + // Otherwise, write them out + else { + records++; + if (records % increment == 0) { + first.setReadName(first.getReadName() + "#0/1"); + writeFastqRecord(first.getFirstOfPairFlag() ? codec1 : codec2, first); + second.setReadName(second.getReadName() + "#0/2"); + writeFastqRecord(second.getFirstOfPairFlag() ? 
codec1 : codec2, second); + wrote++; + if (wrote % 1000000 == 0) { + log.info(wrote + " records written."); + } + if (chunk > 0 && wrote % chunk == 0) { + initializeNextBfqFiles(fileIndex++); + } + } + } + } + } + + /** + * Path for writing bfqs for single-end reads + * + * @param iterator the iterator with the SAM Records to write + * @param filters the list of filters to be applied + */ + private void writeSingleEndBfqs(Iterator iterator, List filters) { + + // Open the codecs for writing + int fileIndex = 0; + initializeNextBfqFiles(fileIndex++); + + int records = 0; + + FilteringIterator it = new FilteringIterator(iterator, new AggregateFilter(filters)); + while (it.hasNext()) { + SAMRecord record = it.next(); + records++; + if (records % increment == 0) { + + writeFastqRecord(codec1, record); + wrote++; + if (wrote % 1000000 == 0) { + log.info(wrote + " records processed."); + } + if (chunk > 0 && wrote % chunk == 0) { + initializeNextBfqFiles(fileIndex++); + } + } + } + } + + /** + * Closes the open bfq file(s), if any, and opens the new one(s) + * + * @param fileIndex the index (counter) of the files to write + */ + private void initializeNextBfqFiles(int fileIndex) { + // Close the codecs if they were writing before + if (codec1 != null) { + codec1.close(); + if (pairedReads) { + codec2.close(); + } + } + + // Open new file, using the fileIndex.
+ File bfq1 = getOutputFile(this.outputPrefix , 1, fileIndex); + codec1 = new BinaryCodec(IoUtil.openFileForWriting(bfq1)); + log.info("Now writing to file " + bfq1.getAbsolutePath()); + if (pairedReads) { + File bfq2 = getOutputFile(this.outputPrefix , 2, fileIndex); + codec2 = new BinaryCodec(IoUtil.openFileForWriting(bfq2)); + log.info("Now writing to file " + bfq2.getAbsolutePath()); + } + } + + /** + * Writes out a SAMRecord in Maq fastq format + * + * @param codec the code to write to + * @param rec the SAMRecord to write + */ + private void writeFastqRecord(BinaryCodec codec, SAMRecord rec) { + + // Writes the length of the read name and then the name (null-terminated) + codec.writeString(rec.getReadName(), true, true); + + char seqs[] = rec.getReadString().toCharArray(); + char quals[] = rec.getBaseQualityString().toCharArray(); + + // Write the length of the sequence + codec.writeInt(seqs.length); + + // Calculate and write the sequence and qualities + byte seqsAndQuals[] = new byte[seqs.length]; + + for (int i = 0; i < seqs.length; i++) { + int quality = Math.min(quals[i]-33, 63); + int base; + switch(seqs[i]) { + case 'A': + case 'a': + base = 0; + break; + case 'C': + case 'c': + base = 1; + break; + case 'G': + case 'g': + base = 2; + break; + case 'T': + case 't': + base = 3; + break; + case 'N': + case 'n': + case '.': + base = 0; + quality = 0; + break; + default: + throw new PicardException("Unknown base when writing bfq file: " + seqs[i]); + } + seqsAndQuals[i] = (byte) (base << 6 | quality); + } + codec.writeBytes(seqsAndQuals); + } + + private int countWritableRecords() { + int count = 0; + PeekableIterator it = new PeekableIterator((new SAMFileReader(IoUtil.openFileForReading(this.bamFile))).iterator()); + if (!this.pairedReads) { + // Filter out noise reads and reads that fail the quality filter + List filters = new ArrayList(); + filters.add(new TagFilter(ReservedTagConstants.XN, 1)); + filters.add(new FailsVendorReadQualityFilter()); + 
FilteringIterator itr = new FilteringIterator(it, new AggregateFilter(filters)); + while (itr.hasNext()) { + itr.next(); + count++; + } + } + else { + while (it.hasNext()) { + SAMRecord first = it.next(); + SAMRecord second = it.next(); + // If both are noise reads, filter them out + if (first.getAttribute(ReservedTagConstants.XN) != null && + second.getAttribute(ReservedTagConstants.XN) != null) { + // skip it + } + // If either fails to pass filter, then exclude them as well + else if (first.getReadFailsVendorQualityCheckFlag() || second.getReadFailsVendorQualityCheckFlag() ) { + // skip it + } + // Otherwise, write them out + else { + count++; + } + } + } + it.close(); + return count; + } + + /** + * Constructs the name for the output file and returns the file + * + * @param outputPrefix the directory and file prefix for the output bfq file + * @param read whether this is the file for the first or second read + * @return a new File object for the bfq file. + */ + private File getOutputFile(String outputPrefix, int read, int index) { + File result = new File(outputPrefix + "." + index + "." + read + ".bfq"); + IoUtil.assertFileIsWritable(result); + return result; + } + +} diff --git a/lib/edu/mit/broad/picard/aligner/maq/MapFileIterator.java b/lib/edu/mit/broad/picard/aligner/maq/MapFileIterator.java new file mode 100644 index 0000000000..af55741853 --- /dev/null +++ b/lib/edu/mit/broad/picard/aligner/maq/MapFileIterator.java @@ -0,0 +1,357 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ +package edu.mit.broad.picard.aligner.maq; + +import edu.mit.broad.sam.*; +import edu.mit.broad.sam.util.CloseableIterator; +import edu.mit.broad.sam.util.BinaryCodec; +import edu.mit.broad.sam.util.StringUtil; +import edu.mit.broad.picard.io.IoUtil; +import edu.mit.broad.picard.PicardException; +import edu.mit.broad.picard.util.SamPairUtil; + +import java.io.File; +import java.io.BufferedInputStream; +import java.util.*; + +/** + * Reads a Maq map file and returns an an iterator of SAMRecords and a populated header + * + * IMPORTANT! Even though the reads in the map file are in coordinate order, this iterator + * will not necessarily return them in that order. For paired reads, both will be + * returned only after *both* records have been seen. + * + * @author Kathleen Tibbetts + */ +public class MapFileIterator implements CloseableIterator { + + public static final int MATE_UNMAPPED_FLAG = 64; + public static final int READ_UNMAPPED_FLAG = 192; + + private static final int READ_NAME_LENGTH = 36; + private static final int MAP_FORMAT = -1; + private static final int MAX_READ_LENGTH = 128; + + private static final byte ACGT[] = {'A', 'C', 'G', 'T'}; + + public static final String PROGRAM_RECORD = "0"; + + private long recordCount = 0L; + private int recordsRead = 0; + private BinaryCodec mapCodec; + private final SAMFileHeader header; + private final boolean pairedReads; + private final boolean jumpingLibrary; + private final List next = new ArrayList(); + private final Map pending = new HashMap(); + private final List mapFiles = new LinkedList(); + + /** + * Constructor. 
Opens the map file, reads the record count and header from it, + * creates the SAMFileHeader, and queues up the first read + * + * @param mapFile The Maq map file to read + * @param commandLine The command line used to invoke Maq (for the header) + * @param pairedReads Whether this is a paired-end run + */ + public MapFileIterator(String commandLine, boolean pairedReads, boolean jumpingLibrary, File... mapFile) { + if (mapFile.length == 0) { + throw new IllegalArgumentException("At least one map file must be provided."); + } + mapFiles.addAll(Arrays.asList(mapFile)); + + this.pairedReads = pairedReads; + this.jumpingLibrary = jumpingLibrary; + + header = new SAMFileHeader(); + header.setSortOrder(SAMFileHeader.SortOrder.coordinate); + SAMProgramRecord program = new SAMProgramRecord(PROGRAM_RECORD); + program.setProgramVersion(MaqConstants.getProgramVersion()); + program.setCommandLine(commandLine); + header.addProgramRecord(program); + + queueNextMapFile(); + } + + /** + * Queues up the next map file + * + * @return true if there's another map file to iterate over + */ + private boolean queueNextMapFile() { + + // Close the old file + if (mapCodec != null) { + mapCodec.close(); + } + + // If there are no more map files, return false + if (mapFiles.size() == 0) { + return false; + } + + // Otherwise, open the next file and reset the recordsRead count + mapCodec = new BinaryCodec(new BufferedInputStream(IoUtil.openFileForReading(mapFiles.remove(0)))); + int format = mapCodec.readInt(); + if (format != MAP_FORMAT) { + mapCodec.close(); + throw new PicardException("Unrecognized Maq map file format: " + format); + } + recordsRead = 0; + + + // Read the sequences out of the map file and set them on the header + int sequenceCount = mapCodec.readInt(); + List sequences = new ArrayList(); + for (int i = 0; i < sequenceCount; i++) { + int length = mapCodec.readInt(); + // Write the sequence name, trimming off the null terminator + sequences.add(new
SAMSequenceRecord(mapCodec.readString(length).substring(0, length-1))); + } + if (header.getSequences() == null || header.getSequences().size() == 0) { + header.setSequences(sequences); + } + else { + // TODO: Check that the sequences match and throw an exception if they don't + } + recordCount = mapCodec.readLong(); + + readNext(); + return true; + } + + /** + * Closes the BinaryCodec reading the map file + */ + public void close() { + mapCodec.close(); + } + + /** + * @return true if the iteration has more elements + */ + public boolean hasNext() { + return next.size() > 0; + } + + /** + * @return the next SAMRecord in the iteration + * @throws NoSuchElementException if this is called when hasNext() returns false + */ + public SAMRecord next() { + if (!hasNext()) { + throw new NoSuchElementException("No more elements in this iteration"); + } + SAMRecord result = next.remove(0); + readNext(); + return result; + } + + /** + * Reads the next element from the map file. If we are done with it, we put it in the next + * list; if we are waiting to see its mate, we put it in the pending map. Calls itself + * repeatedly until there is at least one element in next. + */ + private void readNext() { + + // If there's already a record queued up, just return + if (next.size() > 0) { + return; + } + + // If we've read all there is, then any remaining records in the pending map should be returned. + // If this is not a PE run, then the pending map will be empty and we're done.
+ if (recordsRead == recordCount) { + if (pending.size() > 0) { + StringBuffer sb = new StringBuffer(); + for (String item : pending.keySet()) { + sb.append(item).append("\n"); + } + throw new PicardException("MapFileIterator pending map should have been empty but contained " + + "the following records: " + sb.toString()); + } + queueNextMapFile(); + return; + } + + // Otherwise, we read until there is at least one record in the next list + readMapRecord(); + if (next.size() == 0) { + readNext(); + } + } + + /** + * Reads one record from the map file and throws it onto the pending map or the next list, + * depending on whether we have already seen its mate + */ + private void readMapRecord() { + + // Now that we've got all the data from the binary file, write a SAMRecord and add it to + // the new BAM file + SAMRecord record = new SAMRecord(); + record.setAttribute(SAMTag.PG.toString(), PROGRAM_RECORD); + record.setReadPairedFlag(this.pairedReads); + + // the last base is the single-end mapping quality. + byte seqsAndQuals[] = new byte[MAX_READ_LENGTH-1]; + mapCodec.readBytes(seqsAndQuals); + + byte singleEndMappingQualityOrIndelLength = mapCodec.readByte(); + + // the length of the read + int readLength = mapCodec.readUByte(); + setSeqsAndQuals(seqsAndQuals, readLength, record); + + // the final mapping quality (unless flag below is 130, then it is the + // position of the indel (or 0 if no indel) + int mappingQuality = mapCodec.readUByte(); + + // mismatches in the 28bp (higher 4 bits) and mismatches (lower 4 bits) + mapCodec.readUByte(); + // sum of errors of the best hit + mapCodec.readUByte(); + // counts of all 0- and 1-mismatch hits on the reference + mapCodec.readUByte(); + mapCodec.readUByte(); + + // A bitwise flag. 
See the Maq docs for its full meaning + int flag = mapCodec.readUByte(); + + // the lower mapQ of the two ends (equals map_qual if unpaired); if flag is 130: mapQ of its mate + int altQual = mapCodec.readUByte(); + + // Index of the sequence for this read + record.setReferenceIndex((int)mapCodec.readUInt(), getHeader()); + + // Start position and strand + long pos = mapCodec.readUInt(); + int startPos = ((int)((pos>>1)& 0x7FFFFFFF)) + 1; + record.setAlignmentStart(startPos); + record.setReadNegativeStrandFlag((pos&1) == 1); + + // offset of the mate (zero if unpaired, or two ends mapped to different chr) + mapCodec.readInt(); + + // The read name + byte nameBytes[] = new byte[READ_NAME_LENGTH]; + mapCodec.readBytes(nameBytes); + String name = StringUtil.bytesToString(nameBytes).trim(); + if (this.pairedReads) { + if (name.endsWith("/1")) { + record.setFirstOfPairFlag(true); + record.setSecondOfPairFlag(false); + } + else if (name.endsWith("/2")) { + record.setFirstOfPairFlag(false); + record.setSecondOfPairFlag(true); + } + else { + throw new PicardException("Unrecognized ending for paired read name: " + name); + } + name = name.substring(0, name.length()-2); + } + record.setReadName(name); + + + if (flag != 130 || singleEndMappingQualityOrIndelLength == 0) { // No indel + record.setCigarString(readLength + "M"); + record.setMappingQuality(mappingQuality); + } + else { // Indel + int indelPos = mappingQuality; + String cigar = indelPos + "M" + Math.abs(singleEndMappingQualityOrIndelLength); + int remaining = readLength - indelPos; + if (singleEndMappingQualityOrIndelLength > 0) { + cigar += "I" + (remaining - singleEndMappingQualityOrIndelLength) + "M"; + } + else { + cigar += "D" + remaining + "M"; + } + record.setCigarString(cigar); + // In the docs, it look like there is a mapping quality for the mate, do we use that? 
record.setMappingQuality(altQual); + } + + if (!pairedReads) { + record.setProperPairFlag(false); + next.add(record); + } + else { + record.setMateUnmappedFlag(flag == MATE_UNMAPPED_FLAG); + SAMRecord mate = pending.remove(record.getReadName()); + + if (mate != null) { + boolean proper = SamPairUtil.isProperPair(record, mate, jumpingLibrary); + record.setProperPairFlag(proper); + mate.setProperPairFlag(proper); + + SamPairUtil.setMateInfo(record, mate); + + int insertSize = SamPairUtil.computeInsertSize(record, mate); + record.setInferredInsertSize(insertSize); + mate.setInferredInsertSize(insertSize); + + if (!mate.getMateUnmappedFlag()) { + next.add(record); + } + if (!record.getMateUnmappedFlag()) { + next.add(mate); + } + } + else { + pending.put(record.getReadName(), record); + } + } + + // TODO: Figure out what to do about noise reads long-term + // Note that it is possible that we have lost a "Noise read" annotation at this point. Since + // we try to map a pair if only one of the reads is classified as "noise", then for any paired + // reads where one was a noise read and one was not, we will lose the noise annotation on the + // one noisy read. We have discussed either re-doing the noise evaluation here, modifying the + // read name to carry the noise flag through Maq, or changing what reads we give to Maq.
+ + recordsRead++; + + } + + /** + * Decodes the sequence and the qualities and sets them on the SAMrecords + * + * @param seqsAndQuals the list of seqs and quals + * @param readLength the length of the read + * @param sam the SAMRecord to populate + */ + private void setSeqsAndQuals(byte seqsAndQuals[], int readLength, SAMRecord sam) { + byte sequence[] = new byte[readLength]; + byte qualities[] = new byte[readLength]; + for (int i = 0; i < readLength; i++) { + byte b = seqsAndQuals[i]; + qualities[i] = (byte)(b & 0x3F); + if (b == 0) { + sequence[i] = 'N'; + } + else { + sequence[i] = ACGT[(seqsAndQuals[i] >> 6) & 3]; + } + } + sam.setReadBases(sequence); + sam.setBaseQualities(qualities); + } + + /** + * @throws UnsupportedOperationException -- not implemented + */ + public void remove() { + throw new UnsupportedOperationException("remove() not supported in MapFileIterator"); + } + + public SAMFileHeader getHeader() { return header; } +} diff --git a/lib/edu/mit/broad/picard/aligner/maq/MaqAligner.java b/lib/edu/mit/broad/picard/aligner/maq/MaqAligner.java new file mode 100644 index 0000000000..6c1890818b --- /dev/null +++ b/lib/edu/mit/broad/picard/aligner/maq/MaqAligner.java @@ -0,0 +1,211 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ +package edu.mit.broad.picard.aligner.maq; + +import edu.mit.broad.picard.aligner.Aligner; +import edu.mit.broad.picard.aligner.AbstractBaseAligner; +import edu.mit.broad.picard.PicardException; +import edu.mit.broad.picard.util.Log; + +import java.io.File; +import java.io.FilenameFilter; +import java.util.*; + +/** + * Maq implementation of the Aligner interface + */ +public class MaqAligner extends AbstractBaseAligner implements Aligner { + + // Constants related to Maq output files + public static final String MAQ_MAP_SUFFIX = ".out.aln.map"; + public static final String MAQ_LOG_SUFFIX = ".out.map.log"; + + // Internal constant for multi-plexing lane data + private static final int READ_CHUNK_SIZE = 2000000; + + public static final String REFERENCE_FILE_SUFFIX = ".bfa"; + + private final Log log = Log.getInstance(MaqAligner.class); + + private String commandLine = null; + + + /** + * Constructor that sets every parameter. All other constructors delegate to this one. + * + * @param stringency the stringency of the alignment + * @param readsBamFile the BAM file containing the reads + * @param outputPrefix the directory and filename prefix for output + * @param referenceFileDir the directory where the reference file is located + * @param clipPoints the clip points + * @param expectedInsertSize the expected insert size (null for non-PE lanes) + * @param readsToAlign the number of reads to align + * @param customParametersMap parameters specific to the Aligner implementation + */ + public MaqAligner(Stringency stringency, File readsBamFile, String outputPrefix, + String referenceFileDir, int clipPoints[], Integer expectedInsertSize, + Integer readsToAlign, Map customParametersMap, + boolean pairedReads, int readLength) { + + super(stringency, readsBamFile, outputPrefix, referenceFileDir, clipPoints, + expectedInsertSize, readsToAlign, customParametersMap, pairedReads, readLength); + } + + /** + * Prepares all the necessary inputs for the alignment process from a 
BAM file of read data. + */ + public void prepareInputs() { + log.info("Preparing Maq inputs."); + BamToBfqWriter writer = new BamToBfqWriter(this.getReadsBamFile(), this.getOutputPrefix(), + this.getReadsToAlign(), READ_CHUNK_SIZE, isPairedReads()); + writer.writeBfqFiles(); + } + + /** + * Does the alignment and produces output in the underlying form of the aligner. + */ + public void align() { + log.info("Running Maq alignment."); + + // Temporary hack until we get the multi-tasking code from Seva + List mapFileNames = new ArrayList(); // All map files that we will merge together at the end + + String maqParams = MaqConstants.SWITCH_RANDOM_SEED + " " + MaqConstants.DEFAULT_RANDOM_SEED; + + if (this.getStringency() == Stringency.high) { + maqParams += " " + MaqConstants.SWITCH_MAX_OUTER_DISTANCE + " " + Math.round( + this.getExpectedInsertSize() * MaqConstants.HIGH_STRINGENCY_MAX_OUTER_DISTANCE_MULTIPLIER); + maqParams += " " + MaqConstants.SWITCH_SUM_MISMATCHES + " " + + MaqConstants.HIGH_STRINGENCY_SUM_MISMATCHES; + } + else { + maqParams += " " + MaqConstants.SWITCH_MAX_OUTER_DISTANCE + " " + + MaqConstants.LOW_STRINGENCY_MAX_OUTER_DISTANCE; + // For low stringency, get at least 30 bases and then let half of what's remaining mismatch + int maxMisMatches = (this.getReadLength() - 30)/2; + maqParams += " " + MaqConstants.SWITCH_SUM_MISMATCHES + " " + + (maxMisMatches * MaqConstants.LOW_STRINGENCY_QUALITY_FOR_MISMATCHES); + } + + String referenceFile = new File(this.getReferenceFileDir()).listFiles(new FilenameFilter() { + public boolean accept(File dir, String name) { + return name.endsWith(REFERENCE_FILE_SUFFIX); + } + })[0].getAbsolutePath(); + + ProcessBuilder builder; + + // Map the bfq files, individually or in pairs + SortedSet bfqs = new TreeSet(this.getBfqFiles()); + for (Iterator it = bfqs.iterator(); it.hasNext();) { + + String read1bfq = it.next().getAbsolutePath(); + String read2bfq = (this.isPairedReads()) ? 
it.next().getAbsolutePath() : ""; + + String outputFileBase = read1bfq.substring(0, read1bfq.lastIndexOf('.')-2); + String mapFile = outputFileBase + MAQ_MAP_SUFFIX; + String logFile = outputFileBase + MAQ_LOG_SUFFIX; + + String command = MaqConstants.MAQ_HOME + MaqConstants.MAQ_COMMAND + " " + MaqConstants.MAP_COMMAND + + " " + maqParams + " " + mapFile + " " + referenceFile + " " + read1bfq + " " + read2bfq + + " 2> " + logFile; + setCommandLine(getCommandLine() == null ? command : getCommandLine() + ";" + command); + log.info("Executing command: " + command); + try { + builder = new ProcessBuilder(command.split(" ")); + Process p = builder.start(); + p.waitFor(); + } + catch (Exception e) { + throw new PicardException("Error starting Maq process", e); + } + + mapFileNames.add(mapFile); + } + + // If there's more than one map file, then merge them. + String finalFileName = this.getOutputPrefix() + "." + this.getStringency() + MAQ_MAP_SUFFIX; + if (mapFileNames.size() > 1) { + String command = MaqConstants.MAQ_HOME + MaqConstants.MAQ_COMMAND + " " + + MaqConstants.MERGE_COMMAND + " " + finalFileName; + for (String name : mapFileNames) { + command += " " + name; + } + setCommandLine(getCommandLine() == null ? 
command : getCommandLine() + ";" + command); + log.info("Executing command: " + command); + + try { + builder = new ProcessBuilder(command.split(" ")); + Process p = builder.start(); + p.waitFor(); + } + catch (Exception e) { + throw new PicardException("Error starting Maq process", e); + } + } + else { // Otherwise rename the single map file so we can find it later + File f = new File(mapFileNames.get(0)); + if (!f.renameTo(new File(finalFileName))) { + throw new PicardException("Error renaming " + f.getAbsolutePath() + " to " + finalFileName); + } + } + } + + /** + * Converts the output of the aligner to BAM format + */ + public void prepareOutput() { + log.info("Preparing output from Maq alignment."); + // TODO: MaqToBam + } + + /** + * Cleans up intermediate files (the files created in by and for the underlying aligner by the + * prepareInputs() and align() methods. Does not clean up the original source files or the final BAM file. + */ + public void cleanup() { + log.info("Cleaning up Maq intermediate files."); + this.deleteFiles(getBfqFiles()); +// this.deleteFiles(getMaqAlignmentFiles()); + } + + /** + * Returns a list of zero to two BFQ files, depending on whether they are there + * and whether it was a paired-end run or not + * + * @return a list of BFQ files + */ + private List getBfqFiles() { + File dir = new File(this.getOutputPrefix().substring(0, this.getOutputPrefix().lastIndexOf("/"))); + return Arrays.asList(dir.listFiles(new FilenameFilter() { + public boolean accept(File dir, String name) { + return name.endsWith(".bfq"); + } + })); + } + + /** + * Returns the Maq map files + * + * @return a list of Maq .map files + */ + private List getMaqAlignmentFiles() { + File dir = new File(this.getOutputPrefix().substring(0, this.getOutputPrefix().lastIndexOf("/"))); + return Arrays.asList(dir.listFiles(new FilenameFilter() { + public boolean accept(File dir, String name) { + // TODO: Add the text files if we do not read the binary map files + return 
name.endsWith(MAQ_MAP_SUFFIX) || name.endsWith(MAQ_LOG_SUFFIX); + } + })); + } + + public String getCommandLine() { return commandLine; } + public void setCommandLine(String commandLine) { this.commandLine = commandLine; } +} diff --git a/lib/edu/mit/broad/picard/aligner/maq/MaqConstants.java b/lib/edu/mit/broad/picard/aligner/maq/MaqConstants.java new file mode 100644 index 0000000000..b5e4b9b59b --- /dev/null +++ b/lib/edu/mit/broad/picard/aligner/maq/MaqConstants.java @@ -0,0 +1,39 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.picard.aligner.maq; + +/** + * Utility class to hold Maq-related constants (program name, location, switches, etc) + */ +public class MaqConstants { + // General Maq constants + public static final String PROGRAM_NAME = "Maq"; + public static final String PROGRAM_VERSION = "0.7.1"; + public static final String MAQ_HOME = "/seq/dirseq/maq-0.7.1/"; + + // Command-related constants + public static final String MAQ_COMMAND = "maq"; + public static final String MAP_COMMAND = "map"; + public static final String MERGE_COMMAND = "mapmerge"; + + // Constants related to Maq map switches + public static final String SWITCH_SUM_MISMATCHES = "-e"; + public static final int HIGH_STRINGENCY_SUM_MISMATCHES = 100; + public static final int LOW_STRINGENCY_QUALITY_FOR_MISMATCHES = 30; + + public static final String SWITCH_MAX_OUTER_DISTANCE = "-a"; + public static final int LOW_STRINGENCY_MAX_OUTER_DISTANCE = 1500; + public static final double HIGH_STRINGENCY_MAX_OUTER_DISTANCE_MULTIPLIER = 1.5d; + + public static final String SWITCH_RANDOM_SEED = "-s"; + public static final int 
DEFAULT_RANDOM_SEED = 0; + + public static String getProgramVersion() { return PROGRAM_VERSION; } +} diff --git a/lib/edu/mit/broad/picard/aligner/maq/MaqMapMerger.java b/lib/edu/mit/broad/picard/aligner/maq/MaqMapMerger.java new file mode 100644 index 0000000000..3b82cc1063 --- /dev/null +++ b/lib/edu/mit/broad/picard/aligner/maq/MaqMapMerger.java @@ -0,0 +1,125 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.picard.aligner.maq; + +import edu.mit.broad.picard.io.IoUtil; +import edu.mit.broad.picard.util.StringSortingCollectionFactory; +import edu.mit.broad.picard.util.Log; +import edu.mit.broad.picard.PicardException; +import edu.mit.broad.sam.util.SortingCollection; +import edu.mit.broad.sam.util.BinaryCodec; +import edu.mit.broad.sam.util.CloseableIterator; +import edu.mit.broad.sam.*; + +import java.io.File; +import java.io.BufferedInputStream; +import java.util.List; +import java.util.ArrayList; +import java.util.Iterator; +import java.nio.ByteBuffer; + +/** + * Class to write a BAM file that includes the results from a Maq .map file along with the unaligned + * reads from the original BAM file. 
+ * + * Information on the meaning of the elements of the map file is drawn from the Maq documentation + * on this page: http://maq.sourceforge.net/maqmap_format.shtml + */ +public class MaqMapMerger { + + private final File mapFile; + private final File sourceBamFile; + private final File targetBamFile; + private final boolean pairedReads; + private final Log log = Log.getInstance(MaqMapMerger.class); + private String commandLine = null; + private List sequences = new ArrayList(); + + + /** + * Constructor + * + * @param mapFile The Maq map file to parse + * @param sourceBamFile The BAM file that was used as the input to the Maq aligner, which will + * include info on all the reads that did not map + * @param targetBamFile The file to which to write the merged output + */ + public MaqMapMerger(File mapFile, File sourceBamFile, File targetBamFile, boolean pairedReads) { + IoUtil.assertFileIsReadable(mapFile); + IoUtil.assertFileIsReadable(sourceBamFile); + IoUtil.assertFileIsWritable(targetBamFile); + this.mapFile = mapFile; + this.sourceBamFile = sourceBamFile; + this.targetBamFile = targetBamFile; + this.pairedReads = pairedReads; + } + + /** + * Merges the alignment from the map file with the remaining records from the source BAM file. 
+ */ + public void mergeAlignment() { + log.info("Processing map file: " + mapFile.getAbsolutePath()); + // Write the header + MapFileIterator it = new MapFileIterator(getCommandLine(), this.pairedReads, false, this.mapFile); + SAMFileHeader header = it.getHeader(); + SAMFileWriter writer = new SAMFileWriterFactory().makeBAMWriter(header, false, targetBamFile); + + // Write the alignments + SortingCollection readNames = writeAlignments(it, writer); + + // We're done with the map file, so close it + it.close(); + writeUnalignedReads(writer, readNames.iterator()); + + // Now close the writer + writer.close(); + } + + + private void writeUnalignedReads(SAMFileWriter writer, CloseableIterator nameIterator) { + + int skipCount = 0; + SAMFileReader reader = new SAMFileReader(IoUtil.openFileForReading(this.sourceBamFile)); + CloseableIterator bamRecords = reader.iterator(); + + String readName = nameIterator.hasNext() ? nameIterator.next() : null; + while(bamRecords.hasNext()) { + SAMRecord rec = bamRecords.next(); + if (rec.getReadName().equals(readName)) { + // skip it and pull the next name off the name iterator + readName = nameIterator.hasNext() ? 
nameIterator.next() : null; + skipCount++; + } + else { + writer.addAlignment(rec); + } + } +System.out.println("Skipped " + skipCount + " already-aligned records."); + bamRecords.close(); + nameIterator.close(); + } + + private SortingCollection writeAlignments(MapFileIterator iterator, SAMFileWriter writer) { + +int wrote = 0; + SortingCollection readNames = StringSortingCollectionFactory.newCollection(); + while (iterator.hasNext()) { + SAMRecord record = iterator.next(); + readNames.add(record.getReadName()); + writer.addAlignment(record); +wrote++; + } +System.out.println("Wrote " + wrote + " alignment records."); + return readNames; + } + + public void setCommandLine(String commandLine) { this.commandLine = commandLine; } + public String getCommandLine() { return this.commandLine; } +} diff --git a/lib/edu/mit/broad/picard/aligner/maq/RunMaq.java b/lib/edu/mit/broad/picard/aligner/maq/RunMaq.java new file mode 100644 index 0000000000..bc3741b029 --- /dev/null +++ b/lib/edu/mit/broad/picard/aligner/maq/RunMaq.java @@ -0,0 +1,133 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2008 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ +package edu.mit.broad.picard.aligner.maq; + +import edu.mit.broad.picard.cmdline.CommandLineProgram; +import edu.mit.broad.picard.cmdline.Usage; +import edu.mit.broad.picard.cmdline.Option; +import edu.mit.broad.picard.aligner.Aligner; + +import java.io.File; +import java.util.Map; +import java.util.List; +import java.util.HashMap; +import java.util.ArrayList; + +/** + * CommandLineProgram to invoke the Maq aligner + * + * @author Kathleen Tibbetts + */ +public class RunMaq extends CommandLineProgram { + private static final String PROGRAM_VERSION = "1.0"; + + // The following attributes define the command-line arguments + @Usage + public String USAGE = + "Usage: " + getClass().getName() + " [options]\n\n" + + "Invoke the Maq aligner.\n" + + "Version: " + PROGRAM_VERSION +"\n"; + + @Option(shortName="I", doc="The BAM file to parse.", optional=true) + public File INPUT; + @Option(shortName="O", doc="The directory and file prefix for all output.", optional=false) + public String OUTPUT; + @Option(shortName="L", doc="The read length.", optional=false) + public Integer READ_LENGTH; + @Option(shortName="S", doc="Stringency of the alignment.", optional=true) + public Aligner.Stringency STRINGENCY; + @Option(shortName="R", doc="Directory where the reference file is located.", optional=true) + public String REFERENCE; + @Option(shortName="C", doc="Clip points for the alignment.", optional=true, minElements=0, maxElements=4) + public List CLIP_POINT = new ArrayList(); + @Option(shortName="E", doc="Expected insert size.", optional=true) + public Integer EXPECTED_INSERT_SIZE; + @Option(doc="Whether this is a paired-end run.", optional=false) + public Boolean PE; + @Option(shortName="NUM", doc="Number of reads to align (null = all).", optional=true) + public Integer READS_TO_ALIGN; + @Option(shortName="CUSTOM", doc="Custom parameter in the form name=value.", optional=true) + public List CUSTOM_PARAMETER = new ArrayList(); + @Option(shortName="PREP", 
doc="Whether to prepare inputs for the alignement.", optional=true) + public Boolean PREPARE = true; + @Option(doc="Whether to do the alignement.", optional=true) + public Boolean ALIGN = true; + @Option(shortName="BAM", doc="Whether to generate a BAM file from the alignment output.", optional=true) + public Boolean BAM_OUTPUT = true; + @Option(doc="Whether to clean up intermediate input and output.", optional=true) + public Boolean CLEANUP = true; + + protected int doWork() { + int clipPoints[] = null; + if (CLIP_POINT != null) { + clipPoints = new int[4]; + int index=0; + for (Integer i : CLIP_POINT) { + clipPoints[index++] = i; + } + } + Map params = null; + if (CUSTOM_PARAMETER != null) { + params = new HashMap(); + for (String param : CUSTOM_PARAMETER) { + String nameAndVal[] = param.split("="); + params.put(nameAndVal[0], nameAndVal[1]); + } + } + Aligner aligner = new MaqAligner(STRINGENCY, INPUT, OUTPUT, REFERENCE, clipPoints, + EXPECTED_INSERT_SIZE, READS_TO_ALIGN, params, PE, READ_LENGTH); + if (PREPARE) { + aligner.prepareInputs(); + } + if (ALIGN) { + aligner.align(); + } + if (BAM_OUTPUT) { + aligner.prepareOutput(); + } + if (CLEANUP) { + aligner.cleanup(); + } + return 0; + } + + /** + * This is kind of a mess. Almost everything is optional, since you don't have to do all of the steps in the + * alignment. 
+ * @return + */ + protected boolean customCommandLineValidation() { + if (PREPARE) { + if( INPUT == null) { + System.err.println("ERROR: INPUT must be specified when preparing inputs for the alignment."); + return false; + } + if (CLIP_POINT.size() != 0 && CLIP_POINT.size() != 4) { + System.err.println("ERROR: You must supply either 0 or 4 values for CLIP_POINT: " + CLIP_POINT.size()); + return false; + } + } + if (ALIGN) { + if (STRINGENCY == null) { + System.err.println("ERROR: STRINGENCY must be specified when doing an alignment."); + return false; + } + if (REFERENCE == null) { + System.err.println("ERROR: REFERENCE must be specified when doing an alignment."); + return false; + } + } + return true; + } + + public static void main(String[] argv) { + System.exit(new RunMaq().instanceMain(argv)); + } +} diff --git a/lib/edu/mit/broad/picard/cmdline/CommandLineParseException.java b/lib/edu/mit/broad/picard/cmdline/CommandLineParseException.java new file mode 100644 index 0000000000..cfe74bbccf --- /dev/null +++ b/lib/edu/mit/broad/picard/cmdline/CommandLineParseException.java @@ -0,0 +1,27 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2008 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ +package edu.mit.broad.picard.cmdline; + +public class CommandLineParseException extends RuntimeException{ + public CommandLineParseException() { + } + + public CommandLineParseException(String s) { + super(s); + } + + public CommandLineParseException(String s, Throwable throwable) { + super(s, throwable); + } + + public CommandLineParseException(Throwable throwable) { + super(throwable); + } +} diff --git a/lib/edu/mit/broad/picard/cmdline/CommandLineParser.java b/lib/edu/mit/broad/picard/cmdline/CommandLineParser.java new file mode 100644 index 0000000000..69b681abb4 --- /dev/null +++ b/lib/edu/mit/broad/picard/cmdline/CommandLineParser.java @@ -0,0 +1,638 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2008 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.picard.cmdline; + +import java.io.*; +import java.lang.reflect.Constructor; +import java.lang.reflect.Field; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.ParameterizedType; +import java.lang.reflect.Type; +import java.util.*; + +import edu.mit.broad.picard.util.StringUtil; +import edu.mit.broad.picard.PicardException; + +/** + * Annotation-driven utility for parsing command-line arguments, checking for errors, and producing usage message. + * + * This class supports options of the form KEY=VALUE, plus positional arguments. Positional arguments must not contain + * an equal sign lest they be mistaken for a KEY=VALUE pair. + * + * The caller must supply an object that both defines the command line and has the parsed options set into it. + * For each possible KEY=VALUE option, there must be a public data member annotated with @Option. 
The KEY name is + * the name of the data member. An abbreviated name may also be specified with the shortName attribute of @Option. + * If the data member is a List, then the option may be specified multiple times. The type of the data member, + * or the type of the List element must either have a ctor T(String), or must be an Enum. List options must + * be initialized by the caller with some kind of list. Any other option that is non-null is assumed to have the given + * value as a default. If an option has no default value, and does not have the optional attribute of @Option set, + * is required. For List options, minimum and maximum number of elements may be specified in the @Option annotation. + * + * A single List data member may be annotated with the @PositionalArguments. This behaves similarly to a Option + * with List data member: the caller must initialize the data member, the type must be constructable from String, and + * min and max number of elements may be specified. If no @PositionalArguments annotation appears in the object, + * then it is an error for the command line to contain positional arguments. + * + * A single String public data member may be annotated with @Usage. This string, if present, is used to + * construct the usage message. Details about the possible options are automatically appended to this string. + * If @Usage does not appear, a boilerplate usage message is used. + */ +public class CommandLineParser { + // For formatting option section of usage message. + private static final int OPTION_COLUMN_WIDTH = 30; + private static final int DESCRIPTION_COLUMN_WIDTH = 50; + + private static final Boolean[] TRUE_FALSE_VALUES = {Boolean.TRUE, Boolean.FALSE}; + + // Use these if no @Usage annotation + private static final String defaultUsagePreamble = "Usage: program [options...]\n"; + private static final String defaultUsagePreambleWithPositionalArguments = + "Usage: program [options...] 
[positional-arguments...]\n"; + private static final String OPTIONS_FILE = "OPTIONS_FILE"; + + /** + * A typical command line program will call this to get the beginning of the usage message, + * and then append a description of the program, like this: + * + * \@Usage(programVersion=PROGRAM_VERSION) + * public String USAGE = CommandLineParser.getStandardUsagePreamble(getClass()) + "Frobnicates the freebozzle." + */ + public static String getStandardUsagePreamble(Class mainClass) { + return "USAGE: " + mainClass.getName() + " [options]\n\n"; + } + + // This is the object that the caller has provided that contains annotations, + // and into which the values will be assigned. + private final Object callerOptions; + + private String usagePreamble; + // null if no @PositionalArguments annotation + private Field positionalArguments; + private int minPositionalArguments; + private int maxPositionalArguments; + + // List of all the data members with @Option annotation + private final List optionDefinitions = new ArrayList(); + + // Maps long name, and short name, if present, to an option definition that is + // also in the optionDefinitions list. + private final Map optionMap = new HashMap(); + + // For printing error messages when parsing command line. + private PrintStream messageStream; + + // In case implementation wants to get at arg for some reason. + private String[] argv; + + + /** + * This attribute is here just to facilitate printing usage for OPTIONS_FILE + */ + public File IGNORE_THIS_PROPERTY; + + /** + * Prepare for parsing command line arguments, by validating annotations. + * @param callerOptions This object contains annotations that define the acceptable command-line options, + * and ultimately will receive the settings when a command line is parsed. 
+ */ + public CommandLineParser(final Object callerOptions) { + this.callerOptions = callerOptions; + + for (final Field field : this.callerOptions.getClass().getFields()) { + if (field.getAnnotation(PositionalArguments.class) != null) { + handlePositionalArgumentAnnotation(field); + } + if (field.getAnnotation(Usage.class) != null) { + handleUsageAnnotation(field); + } + if (field.getAnnotation(Option.class) != null) { + handleOptionAnnotation(field); + } + } + + if (usagePreamble == null) { + if (positionalArguments == null) { + usagePreamble = defaultUsagePreamble; + } else { + usagePreamble = defaultUsagePreambleWithPositionalArguments; + } + } + } + + /** + * Print a usage message based on the options object passed to the ctor. + * @param stream Where to write the usage message. + */ + public void usage(final PrintStream stream) { + stream.print(usagePreamble); + if (!optionDefinitions.isEmpty()) { + stream.println("\nOptions:\n"); + for (final OptionDefinition optionDefinition : optionDefinitions) { + printOptionUsage(stream, optionDefinition); + } + } + final Field fileField; + try { + fileField = getClass().getField("IGNORE_THIS_PROPERTY"); + } catch (NoSuchFieldException e) { + throw new PicardException("Should never happen", e); + } + final OptionDefinition optionsFileOptionDefinition = + new OptionDefinition(fileField, OPTIONS_FILE, "", + "File of OPTION_NAME=value pairs. No positional parameters allowed. Unlike command-line options, " + + "unrecognized options are ignored. " + "A single-valued option set in an options file may be overridden " + + "by a subsequent command-line option. " + + "A line starting with '#' is considered a comment.", false, true, 0, Integer.MAX_VALUE, null, new String[0]); + printOptionUsage(stream, optionsFileOptionDefinition); + } + + /** + * Parse command-line options, and store values in callerOptions object passed to ctor. + * @param messageStream Where to write error messages. + * @param args Command line tokens. 
+ * @return true if command line is valid. + */ + public boolean parseOptions(final PrintStream messageStream, final String[] args) { + this.argv = args; + this.messageStream = messageStream; + for (final String arg: args) { + if (arg.equals("-h") || arg.equals("--help")) { + usage(messageStream); + return false; + } + final String[] pair = arg.split("=", 2); + if (pair.length == 2) { + if (pair[0].equals(OPTIONS_FILE)) { + if (!parseOptionsFile(pair[1])) { + messageStream.println(); + usage(messageStream); + return false; + } + } else { + if (!parseOption(pair[0], pair[1], false)) { + messageStream.println(); + usage(messageStream); + return false; + } + } + } else if (!parsePositionalArgument(arg)) { + messageStream.println(); + usage(messageStream); + return false; + } + } + if (!checkNumArguments()) { + messageStream.println(); + usage(messageStream); + return false; + } + return true; + } + + /** + * After command line has been parsed, make sure that all required options have values, and that + * lists with minimum # of elements have sufficient. 
+ * @return true if valid + */ + private boolean checkNumArguments() { + try { + for (final OptionDefinition optionDefinition : optionDefinitions) { + StringBuilder mutextOptionNames = new StringBuilder(); + for (String mutexOption : optionDefinition.mutuallyExclusive) { + OptionDefinition mutextOptionDef = optionMap.get(mutexOption); + if (mutextOptionDef != null && mutextOptionDef.hasBeenSet) { + mutextOptionNames.append(" ").append(mutextOptionDef.name); + } + } + if (optionDefinition.hasBeenSet && mutextOptionNames.length() > 0) { + messageStream.println("ERROR: Option '" + optionDefinition.name + + "' cannot be used in conjunction with option(s)" + + mutextOptionNames.toString()); + return false; + } + if (optionDefinition.isCollection) { + final Collection c = (Collection)optionDefinition.field.get(callerOptions); + if (c.size() < optionDefinition.minElements) { + messageStream.println("ERROR: Option '" + optionDefinition.name + "' must be specified at least " + + optionDefinition.minElements + " times."); + return false; + } + } else if (!optionDefinition.optional && !optionDefinition.hasBeenSet && mutextOptionNames.length() == 0) { + messageStream.print("ERROR: Option '" + optionDefinition.name + "' is required"); + if (optionDefinition.mutuallyExclusive.isEmpty()) { + messageStream.println("."); + } else { + messageStream.println(" unless any of " + optionDefinition.mutuallyExclusive + " are specified."); + } + return false; + } + } + if (positionalArguments != null) { + final Collection c = (Collection)positionalArguments.get(callerOptions); + if (c.size() < minPositionalArguments) { + messageStream.println("ERROR: At least " + minPositionalArguments + + " positional arguments must be specified."); + return false; + } + } + return true; + } catch (IllegalAccessException e) { + // Should never happen because lack of publicness has already been checked. 
+ throw new RuntimeException(e); + } + } + + private boolean parsePositionalArgument(final String stringValue) { + if (positionalArguments == null) { + messageStream.println("ERROR: Invalid argument '" + stringValue + "'."); + return false; + } + final Object value; + try { + value = constructFromString(getUnderlyingType(positionalArguments), stringValue); + } catch (CommandLineParseException e) { + messageStream.println("ERROR: " + e.getMessage()); + return false; + } + final Collection c; + try { + c = (Collection)positionalArguments.get(callerOptions); + } catch (IllegalAccessException e) { + throw new RuntimeException(e); + } + if (c.size() >= maxPositionalArguments) { + messageStream.println("ERROR: No more than " + maxPositionalArguments + + " positional arguments may be specified on the command line."); + return false; + } + c.add(value); + return true; + } + + private boolean parseOption(String key, final String stringValue, final boolean optionsFile) { + key = key.toUpperCase(); + final OptionDefinition optionDefinition = optionMap.get(key); + if (optionDefinition == null) { + if (optionsFile) { + // Silently ignore unrecognized option from options file + return true; + } + messageStream.println("ERROR: Unrecognized option: " + key); + return false; + } + if (!optionDefinition.isCollection) { + if (optionDefinition.hasBeenSet && !optionDefinition.hasBeenSetFromOptionsFile) { + messageStream.println("ERROR: Option '" + key + "' cannot be specified more than once."); + return false; + } + } + final Object value; + try { + value = constructFromString(getUnderlyingType(optionDefinition.field), stringValue); + } catch (CommandLineParseException e) { + messageStream.println("ERROR: " + e.getMessage()); + return false; + } + try { + if (optionDefinition.isCollection) { + final Collection c = (Collection)optionDefinition.field.get(callerOptions); + if (c.size() >= optionDefinition.maxElements) { + messageStream.println("ERROR: Option '" + key + "' cannot be used 
more than " + + optionDefinition.maxElements + " times."); + return false; + } + c.add(value); + } else { + optionDefinition.field.set(callerOptions, value); + optionDefinition.hasBeenSet = true; + optionDefinition.hasBeenSetFromOptionsFile = optionsFile; + } + } catch (IllegalAccessException e) { + // Should never happen because we only iterate through public fields. + throw new RuntimeException(e); + } + return true; + } + + /** + * Parsing of options from file is looser than normal. Any unrecognized options are + * ignored, and a single-valued option that is set in a file may be overridden by a + * subsequent appearance of that option. + * A line that starts with '#' is ignored. + * @param optionsFile + * @return false if a fatal error occurred + */ + private boolean parseOptionsFile(final String optionsFile) { + try { + final BufferedReader reader = new BufferedReader(new FileReader(optionsFile)); + String line; + while ((line = reader.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + final String[] pair = line.split("=", 2); + if (pair.length == 2) { + if (!parseOption(pair[0], pair[1], true)) { + messageStream.println(); + usage(messageStream); + return false; + } + } else { + messageStream.println("Strange line in OPTIONS_FILE " + optionsFile + ": " + line); + usage(messageStream); + return false; + } + } + reader.close(); + return true; + + } catch (IOException e) { + throw new PicardException("I/O error loading OPTIONS_FILE=" + optionsFile, e); + } + } + + private void printOptionUsage(final PrintStream stream, final OptionDefinition optionDefinition) { + final String type = getUnderlyingType(optionDefinition.field).getSimpleName(); + String optionLabel = optionDefinition.name + "=" + type; + stream.print(optionLabel); + if (optionDefinition.shortName.length() > 0) { + stream.println(); + } + if (optionDefinition.shortName.length() > 0) { + optionLabel = optionDefinition.shortName + "=" + type; + stream.print(optionLabel); + } + int 
numSpaces = OPTION_COLUMN_WIDTH - optionLabel.length(); + if (optionLabel.length() > OPTION_COLUMN_WIDTH) { + stream.println(); + numSpaces = OPTION_COLUMN_WIDTH; + } + printSpaces(stream, numSpaces); + final StringBuilder sb = new StringBuilder(); + if (optionDefinition.doc.length() > 0) { + sb.append(optionDefinition.doc); + sb.append(" "); + } + if (optionDefinition.optional && !optionDefinition.isCollection) { + sb.append("Default value: "); + sb.append(optionDefinition.defaultValue); + sb.append(". "); + } else if (!optionDefinition.isCollection){ + sb.append("Required. "); + } + Object[] enumConstants = getUnderlyingType(optionDefinition.field).getEnumConstants(); + if (enumConstants == null && getUnderlyingType(optionDefinition.field) == Boolean.class) { + enumConstants = TRUE_FALSE_VALUES; + } + if (enumConstants != null) { + sb.append("Possible values: {"); + for (int i = 0; i < enumConstants.length; ++i) { + if (i > 0) { + sb.append(", "); + } + sb.append(enumConstants[i].toString()); + } + sb.append("} "); + } + if (optionDefinition.isCollection) { + if (optionDefinition.minElements == 0) { + if (optionDefinition.maxElements == Integer.MAX_VALUE) { + sb.append("This option may be specified 0 or more times."); + } else { + sb.append("This option must be specified no more than " + optionDefinition.maxElements + "times."); + } + } else if (optionDefinition.maxElements == Integer.MAX_VALUE) { + sb.append("This option must be specified at least " + optionDefinition.minElements + " times."); + } else { + sb.append("This option may be specified between " + optionDefinition.minElements + + " and " + optionDefinition.maxElements + " times."); + } + } + if (!optionDefinition.mutuallyExclusive.isEmpty()) { + sb.append(" Cannot be used in conjuction with option(s)"); + for (String option : optionDefinition.mutuallyExclusive) { + OptionDefinition mutextOptionDefinition = optionMap.get(option); + sb.append(" ").append(mutextOptionDefinition.name); + if 
(mutextOptionDefinition.shortName.length() > 0) { + sb.append(" (").append(mutextOptionDefinition.shortName).append(")"); + } + } + } + final String wrappedDescription = StringUtil.wordWrap(sb.toString(), DESCRIPTION_COLUMN_WIDTH); + final String[] descriptionLines = wrappedDescription.split("\n"); + for (int i = 0; i < descriptionLines.length; ++i) { + if (i > 0) { + printSpaces(stream, OPTION_COLUMN_WIDTH); + } + stream.println(descriptionLines[i]); + } + stream.println(); + } + + private void printSpaces(final PrintStream stream, final int numSpaces) { + final StringBuilder sb = new StringBuilder(); + for (int i = 0; i < numSpaces; ++i) { + sb.append(" "); + } + stream.print(sb); + } + + private void handleOptionAnnotation(final Field field) { + try { + final Option optionAnnotation = field.getAnnotation(Option.class); + final boolean isCollection = isCollectionField(field); + if (isCollection) { + if (optionAnnotation.maxElements() == 0) { + throw new CommandLineParserDefinitionException("@Option member " + field.getName() + + "has maxElements = 0"); + } + if (optionAnnotation.minElements() > optionAnnotation.maxElements()) { + throw new CommandLineParserDefinitionException("In @Option member " + field.getName() + + ", minElements cannot be > maxElements"); + } + } + if (!canBeMadeFromString(getUnderlyingType(field))) { + throw new CommandLineParserDefinitionException("@Option member " + field.getName() + + " must have a String ctor or be an enum"); + } + + final OptionDefinition optionDefinition = new OptionDefinition(field, + field.getName(), + optionAnnotation.shortName(), + optionAnnotation.doc(), optionAnnotation.optional() || (field.get(callerOptions) != null), + isCollection, optionAnnotation.minElements(), + optionAnnotation.maxElements(), field.get(callerOptions), + optionAnnotation.mutex()); + + for (String option : optionAnnotation.mutex()) { + OptionDefinition mutextOptionDef = optionMap.get(option); + if (mutextOptionDef != null) { + 
mutextOptionDef.mutuallyExclusive.add(field.getName()); + } + } + if (optionMap.containsKey(optionDefinition.name)) { + throw new CommandLineParserDefinitionException(optionDefinition.name + " has already been used"); + } + optionMap.put(optionDefinition.name, optionDefinition); + if (optionDefinition.shortName.length() > 0) { + if (optionMap.containsKey(optionDefinition.shortName)) { + throw new CommandLineParserDefinitionException(optionDefinition.shortName + " has already been used"); + } + optionMap.put(optionDefinition.shortName, optionDefinition); + } + optionDefinitions.add(optionDefinition); + } catch (IllegalAccessException e) { + throw new CommandLineParserDefinitionException(field.getName() + + " must have public visibility to have @Option annotation"); + } + } + + private void handleUsageAnnotation(final Field field) { + if (usagePreamble != null) { + throw new CommandLineParserDefinitionException + ("@Usage cannot be used more than once in an option class."); + } + try { + usagePreamble = (String)field.get(callerOptions); + final Usage usageAnnotation = field.getAnnotation(Usage.class); + if (usageAnnotation.programVersion().length() > 0) { + usagePreamble += "Version: " + usageAnnotation.programVersion() + "\n"; + } + } catch (IllegalAccessException e) { + throw new CommandLineParserDefinitionException("@Usage data member must be public"); + } catch (ClassCastException e) { + throw new CommandLineParserDefinitionException + ("@Usage can only be applied to a String data member."); + } + } + + private void handlePositionalArgumentAnnotation(final Field field) { + if (positionalArguments != null) { + throw new CommandLineParserDefinitionException + ("@PositionalArguments cannot be used more than once in an option class."); + } + positionalArguments = field; + if (!isCollectionField(field)) { + throw new CommandLineParserDefinitionException("@PositionalArguments must be applied to a Collection"); + } + + if (!canBeMadeFromString(getUnderlyingType(field))) 
{ + throw new CommandLineParserDefinitionException("@PositionalParameters member " + field.getName() + + "does not have a String ctor"); + } + + final PositionalArguments positionalArgumentsAnnotation = field.getAnnotation(PositionalArguments.class); + minPositionalArguments = positionalArgumentsAnnotation.minElements(); + maxPositionalArguments = positionalArgumentsAnnotation.maxElements(); + if (minPositionalArguments > maxPositionalArguments) { + throw new CommandLineParserDefinitionException("In @PositionalArguments, minElements cannot be > maxElements"); + } + } + + private boolean isCollectionField(final Field field) { + try { + field.getType().asSubclass(Collection.class); + return true; + } catch (ClassCastException e) { + return false; + } + } + + private Class getUnderlyingType(final Field field) { + if (isCollectionField(field)) { + final ParameterizedType clazz = (ParameterizedType)(field.getGenericType()); + final Type[] genericTypes = clazz.getActualTypeArguments(); + if (genericTypes.length != 1) { + throw new CommandLineParserDefinitionException("Strange collection type for field " + field.getName()); + } + return (Class)genericTypes[0]; + + } else { + return field.getType(); + } + } + + // True if clazz is an enum, or if it has a ctor that takes a single String argument. 
+ private boolean canBeMadeFromString(final Class clazz) { + if (clazz.isEnum()) { + return true; + } + try { + clazz.getConstructor(String.class); + return true; + } catch (NoSuchMethodException e) { + return false; + } + } + + private Object constructFromString(final Class clazz, final String s) { + try { + if (clazz.isEnum()) { + try { + return Enum.valueOf(clazz, s); + } catch (IllegalArgumentException e) { + throw new CommandLineParseException("'" + s + "' is not a valid value for " + + clazz.getSimpleName() + ".", e); + } + } + final Constructor ctor = clazz.getConstructor(String.class); + return ctor.newInstance(s); + } catch (NoSuchMethodException e) { + // Shouldn't happen because we've checked for presence of ctor + throw new CommandLineParseException(e); + } catch (InstantiationException e) { + throw new CommandLineParseException("Abstract class '" + clazz.getSimpleName() + + "'cannot be used for an option value type.", e); + } catch (IllegalAccessException e) { + throw new CommandLineParseException("String constructor for option value type '" + clazz.getSimpleName() + + "' must be public.", e); + } catch (InvocationTargetException e) { + throw new CommandLineParseException("Problem constructing " + clazz.getSimpleName() + " from the string '" + s + "'.", + e.getCause()); + } + } + + public String[] getArgv() { + return argv; + } + + private class OptionDefinition { + final Field field; + final String name; + final String shortName; + final String doc; + final boolean optional; + final boolean isCollection; + final int minElements; + final int maxElements; + final String defaultValue; + boolean hasBeenSet = false; + boolean hasBeenSetFromOptionsFile = false; + Set mutuallyExclusive; + + private OptionDefinition(final Field field, final String name, final String shortName, final String doc, final boolean optional, final boolean collection, + final int minElements, final int maxElements, final Object defaultValue, String[] mutuallyExclusive) { + this.field 
= field; + this.name = name.toUpperCase(); + this.shortName = shortName.toUpperCase(); + this.doc = doc; + this.optional = optional; + isCollection = collection; + this.minElements = minElements; + this.maxElements = maxElements; + if (defaultValue != null) { + this.defaultValue = defaultValue.toString(); + } else { + this.defaultValue = "null"; + } + this.mutuallyExclusive = new HashSet(Arrays.asList(mutuallyExclusive)); + } + } +} diff --git a/lib/edu/mit/broad/picard/cmdline/CommandLineParserDefinitionException.java b/lib/edu/mit/broad/picard/cmdline/CommandLineParserDefinitionException.java new file mode 100644 index 0000000000..088755e2a1 --- /dev/null +++ b/lib/edu/mit/broad/picard/cmdline/CommandLineParserDefinitionException.java @@ -0,0 +1,27 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2008 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/
+package edu.mit.broad.picard.cmdline;
+
+/**
+ * Signals a problem with the <em>definition</em> of a command line -- e.g. malformed or
+ * inconsistent annotations on a CommandLineProgram -- as opposed to a bad value supplied
+ * by the user at runtime.
+ *
+ * NOTE(review): inferred from the class name and the sibling CommandLineParseException;
+ * confirm against the throw sites in CommandLineParser. Unchecked (extends RuntimeException)
+ * because a bad definition is a programming error, not a recoverable condition.
+ */
+public class CommandLineParserDefinitionException extends RuntimeException {
+    public CommandLineParserDefinitionException() {
+    }
+
+    public CommandLineParserDefinitionException(String s) {
+        super(s);
+    }
+
+    public CommandLineParserDefinitionException(String s, Throwable throwable) {
+        super(s, throwable);
+    }
+
+    public CommandLineParserDefinitionException(Throwable throwable) {
+        super(throwable);
+    }
+}
diff --git a/lib/edu/mit/broad/picard/cmdline/CommandLineProgram.java b/lib/edu/mit/broad/picard/cmdline/CommandLineProgram.java
new file mode 100644
index 0000000000..10ee7635f4
--- /dev/null
+++ b/lib/edu/mit/broad/picard/cmdline/CommandLineProgram.java
@@ -0,0 +1,141 @@
+/*
+* The Broad Institute
+* SOFTWARE COPYRIGHT NOTICE AGREEMENT
+* This software and its documentation are copyright 2009 by the
+* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+*
+* This software is supplied without any warranty or guaranteed support whatsoever. Neither
+* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+*/
+package edu.mit.broad.picard.cmdline;
+
+import edu.mit.broad.picard.util.Log;
+import edu.mit.broad.picard.util.StringUtil;
+import edu.mit.broad.picard.metrics.Header;
+import edu.mit.broad.picard.metrics.StringHeader;
+import edu.mit.broad.picard.metrics.MetricsFile;
+import edu.mit.broad.picard.metrics.MetricBase;
+
+import java.io.File;
+import java.util.Date;
+import java.util.List;
+import java.util.ArrayList;
+
+/**
+ * Abstract class to facilitate writing command-line programs.
+ *
+ * To use:
+ *
+ * 1. Extend this class with a concrete class that has data members annotated with @Option, @PositionalArguments
+ * and/or @Usage annotations.
+ *
+ * 2. If there is any custom command-line validation, override customCommandLineValidation().
When this method is + * called, the command line has been parsed and set into the data members of the concrete class. + * + * 3. Implement a method doWork(). This is called after successful comand-line processing. The value it returns is + * the exit status of the program. It is assumed that the concrete class emits any appropriate error message before + * returning non-zero. doWork() may throw unchecked exceptions, which are caught and reported appropriately. + * + * 4. Implement the following static method in the concrete class: + * + * public static void main(String[] argv) { + System.exit(new MyConcreteClass().instanceMain(argv)); + } + + + */ +public abstract class CommandLineProgram { + + @Option + public File TMP_DIR = new File(System.getProperty("java.io.tmpdir"), System.getProperty("user.name")); + + @Option(doc = "Control verbosity of logging") + public Log.LogLevel VERBOSITY = Log.LogLevel.INFO; + + @Option(doc = "Whether to suppress job-summary info on System.out") + public Boolean QUIET = false; + + private final String standardUsagePreamble = CommandLineParser.getStandardUsagePreamble(getClass()); + + /** + * Initialized in parseArgs. Subclasses may want to access this to do + * their own validation, and then print usage using clp. + */ + protected CommandLineParser clp; + + private final List
defaultHeaders = new ArrayList
(); + + /** + * Do the work after command line has been parsed. + * RuntimeException may be thrown by this method, and are reported appropriately. + * @return program exit status. + */ + protected abstract int doWork(); + + public int instanceMain(final String[] argv) { + // Build the default headers + final Date startDate = new Date(); + final String cmdline = getClass().getName() + " " + StringUtil.join(" ", argv); + this.defaultHeaders.add(new StringHeader(cmdline)); + this.defaultHeaders.add(new StringHeader("Started on: " + startDate)); + + if (!parseArgs(argv)) { + return 1; + } + + Log.setGlobalLogLevel(VERBOSITY); + + if (!TMP_DIR.exists()) { + // Intentially not checking the return value, because it may be that the program does not + // need a tmp_dir. If this fails, the problem will be discovered downstream. + TMP_DIR.mkdir(); + } + System.setProperty("java.io.tmpdir", TMP_DIR.getAbsolutePath()); + if (!QUIET) { + System.out.println("[" + new Date() + "] " + cmdline); + } + final int ret = doWork(); + if (!QUIET) { + System.out.println("[" + new Date() + "] " + getClass().getName() + " done."); + System.out.println("Runtime.totalMemory()=" + Runtime.getRuntime().totalMemory()); + } + return ret; + } + + /** + * Put any custom command-line validation in an override of this method. + * clp is initialized at this point and can be used to print usage and access argv. + * Any options set by command-line parser can be validated. + * @return true if command line is valid. + */ + protected boolean customCommandLineValidation() { + return true; + } + + /** + * + * @return true if command line is valid + */ + protected boolean parseArgs(final String[] argv) { + clp = new CommandLineParser(this); + final boolean ret = clp.parseOptions(System.err, argv); + if (!ret) { + return false; + } + return customCommandLineValidation(); + } + + /** Gets a MetricsFile with default headers already written into it. 
*/ + protected MetricsFile getMetricsFile() { + final MetricsFile file = new MetricsFile(); + for (final Header h : this.defaultHeaders) { + file.addHeader(h); + } + + return file; + } + + public String getStandardUsagePreamble() { + return standardUsagePreamble; + } +} diff --git a/lib/edu/mit/broad/picard/cmdline/CommandLineUtils.java b/lib/edu/mit/broad/picard/cmdline/CommandLineUtils.java new file mode 100644 index 0000000000..0702f3bc70 --- /dev/null +++ b/lib/edu/mit/broad/picard/cmdline/CommandLineUtils.java @@ -0,0 +1,39 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.picard.cmdline; + +import java.io.*; +import java.util.regex.Pattern; + +public class CommandLineUtils { + /** Regex for splitting on spaces. */ + public static final Pattern SPACE_SPLITTER = Pattern.compile(" "); + + // Regexes to split things apart on white space + public static final Pattern TAB_SPLITTER = Pattern.compile("\\t"); + + /** Checks that a file exists and is readable, and then returns a buffered reader for it. */ + public static BufferedReader getReader(File file) throws IOException { + return new BufferedReader(new InputStreamReader(getInputStream(file))); + } + + /** Checks that a file exists and is readable, and then returns a input stream for it. 
*/ + public static InputStream getInputStream(File file) throws IOException { + if (!file.exists()) { + throw new RuntimeException("Specified file does not exist: " + file); + } + + if (!file.canRead()) { + throw new RuntimeException("Specified file is not readable: " + file); + } + + return new FileInputStream(file); + } +} diff --git a/lib/edu/mit/broad/picard/cmdline/Option.java b/lib/edu/mit/broad/picard/cmdline/Option.java new file mode 100644 index 0000000000..b7ffebdd9a --- /dev/null +++ b/lib/edu/mit/broad/picard/cmdline/Option.java @@ -0,0 +1,60 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2008 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.picard.cmdline; + +import java.lang.annotation.Documented; +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +/** + * Used to annotate which fields of a CommandLineProgram are options given at the command line. + * If a command line call looks like "cmd option=foo x=y bar baz" the CommandLineProgram + * would have annotations on fields to handle the values of option and x. All options + * must be in the form name=value on the command line. The java type of the option + * will be inferred from the type of the field or from the generic type of the collection + * if this option is allowed more than once. The type must be an enum or + * have a constructor with a single String parameter. 
+ * + * @author Alec Wysoker + */ +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.FIELD) +@Documented +public @interface Option { + /** The name of the option as it would appear on the command line. */ + String shortName() default ""; + + /** Text that appears for this option in text describing usage of the command line program. */ + String doc() default ""; + + /** + * If set to false, an exception will be thrown if the option is not specified. + * If 2 options are mutually exclusive and both have optional=false it will be + * interpreted as one or the other is required and an exception will only be thrown if + * neither are specified. + */ + boolean optional() default false; + + /** + * Array of option names that cannot be used in conjunction with this one. + * If 2 options are mutually exclusive and both have optional=false it will be + * interpreted as one OR the other is required and an exception will only be thrown if + * neither are specified. + */ + String[] mutex() default {}; + + /** The minimum number of times that this option is required. */ + int minElements() default 0; + + /** The maximum number of times this option is allowed. */ + int maxElements() default Integer.MAX_VALUE; +} diff --git a/lib/edu/mit/broad/picard/cmdline/PositionalArguments.java b/lib/edu/mit/broad/picard/cmdline/PositionalArguments.java new file mode 100644 index 0000000000..f45301439a --- /dev/null +++ b/lib/edu/mit/broad/picard/cmdline/PositionalArguments.java @@ -0,0 +1,38 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2008 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ +package edu.mit.broad.picard.cmdline; + +import java.lang.annotation.Documented; +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +/** + * Used to annotate which field of a CommandLineProgram should store parameters given at the + * command line which are not options. Fields with this annotation must be a Collection + * (and probably should be a List if order is important). + * If a command line call looks like "cmd option=foo x=y bar baz" the values "bar" and "baz" + * would be added to the collection with this annotation. The java type of the arguments + * will be inferred from the generic type of the collection. The type must be an enum or + * have a constructor with a single String parameter. + * + * @author Alec Wysoker + */ +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.FIELD) +@Documented +public @interface PositionalArguments { + /** The minimum number of arguments required. */ + int minElements() default 0; + + /** The maximum number of arguments allowed. */ + int maxElements() default Integer.MAX_VALUE; +} diff --git a/lib/edu/mit/broad/picard/cmdline/Usage.java b/lib/edu/mit/broad/picard/cmdline/Usage.java new file mode 100644 index 0000000000..13aef94671 --- /dev/null +++ b/lib/edu/mit/broad/picard/cmdline/Usage.java @@ -0,0 +1,26 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2008 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ +package edu.mit.broad.picard.cmdline; + +import java.lang.annotation.Documented; +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +/** + * Annotates the field that contains text to be displayed in a usage message. + */ +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.FIELD) +@Documented +public @interface Usage { + String programVersion() default ""; +} diff --git a/lib/edu/mit/broad/picard/directed/ArachneMapToIntervalList.java b/lib/edu/mit/broad/picard/directed/ArachneMapToIntervalList.java new file mode 100644 index 0000000000..75fb98b16b --- /dev/null +++ b/lib/edu/mit/broad/picard/directed/ArachneMapToIntervalList.java @@ -0,0 +1,62 @@ +package edu.mit.broad.picard.directed; + +import edu.mit.broad.picard.cmdline.CommandLineProgram; +import edu.mit.broad.picard.cmdline.Option; +import edu.mit.broad.picard.io.IoUtil; +import edu.mit.broad.picard.util.BasicTextFileParser; +import edu.mit.broad.picard.util.Interval; +import edu.mit.broad.picard.util.FormatUtil; +import edu.mit.broad.sam.SAMFileReader; +import edu.mit.broad.sam.SAMFileHeader; +import edu.mit.broad.sam.SAMSequenceRecord; + +import java.io.File; +import java.util.List; + +/** + * Converts an arachne style map file to the new interval list format. + * + * @author Tim Fennell + */ +public class ArachneMapToIntervalList extends CommandLineProgram { + @Option(shortName="M", doc="The path to an archne style map file") public File MAP; + @Option(shortName="SD", doc="A sequence dictionary in SAM or BAM format") public File SEQUENCE_DICTIONARY; + @Option(shortName="O", doc="The output file to write the interval list to") public File OUTPUT; + @Option(shortName="P", doc="Prefix to use when generating names") public String PREFIX; + + /** Stock main method. 
*/ + public static void main(String[] argv) { + System.exit(new ArachneMapToIntervalList().instanceMain(argv)); + } + + protected int doWork() { + IoUtil.assertFileIsReadable(MAP); + IoUtil.assertFileIsReadable(SEQUENCE_DICTIONARY); + IoUtil.assertFileIsWritable(OUTPUT); + + SAMFileReader sam = new SAMFileReader(SEQUENCE_DICTIONARY); + SAMFileHeader header = sam.getFileHeader(); + List seqs = header.getSequences(); + IntervalList list = new IntervalList(header); + + BasicTextFileParser parser = new BasicTextFileParser(true, 3, MAP); + FormatUtil format = new FormatUtil(); + int i=1; + + while (parser.hasNext()) { + String[] fields = parser.next(); + int seqIndex = format.parseInt(fields[0]); + int start = format.parseInt(fields[1]) + 1; + int end = format.parseInt(fields[2]) + 1; + String seq = seqs.get(seqIndex).getSequenceName(); + + Interval interval = new Interval(seq, start, end, false, PREFIX + "_" + i++); + list.add(interval); + } + + list.sort(); + list.write(OUTPUT); + + return 0; + } +} diff --git a/lib/edu/mit/broad/picard/directed/CalculateHsMetrics.java b/lib/edu/mit/broad/picard/directed/CalculateHsMetrics.java new file mode 100644 index 0000000000..d3be86825f --- /dev/null +++ b/lib/edu/mit/broad/picard/directed/CalculateHsMetrics.java @@ -0,0 +1,51 @@ +package edu.mit.broad.picard.directed; + +import edu.mit.broad.picard.cmdline.CommandLineProgram; +import edu.mit.broad.picard.cmdline.Option; +import edu.mit.broad.picard.cmdline.Usage; +import edu.mit.broad.picard.io.IoUtil; +import edu.mit.broad.picard.metrics.MetricsFile; +import edu.mit.broad.sam.SAMFileReader; + +import java.io.File; + +/** + * Calculates a set of HS metrics from a sam or bam file. 
+ * + * @author Tim Fennell + */ +public class CalculateHsMetrics extends CommandLineProgram { + @Usage public final String USAGE = + "Calculates a set of Hybrid Selection specific metrics from an aligned SAM" + + "or BAM file."; + @Option(shortName="BI") public File BAIT_INTERVALS; + @Option(shortName="TI") public File TARGET_INTERVALS; + @Option(shortName="I") public File INPUT; + @Option(shortName="M") public File METRICS_FILE; + + /** Stock main method. */ + public static void main(String[] argv) { + System.exit(new CalculateHsMetrics().instanceMain(argv)); + } + + /** + * Asserts that files are readable and writable and then fires off an + * HsMetricsCalculator instance to do the real work. + */ + protected int doWork() { + IoUtil.assertFileIsReadable(BAIT_INTERVALS); + IoUtil.assertFileIsReadable(TARGET_INTERVALS); + IoUtil.assertFileIsReadable(INPUT); + IoUtil.assertFileIsWritable(METRICS_FILE); + + HsMetricsCalculator calculator = new HsMetricsCalculator(BAIT_INTERVALS, TARGET_INTERVALS); + SAMFileReader sam = new SAMFileReader(INPUT); + calculator.analyze(sam.iterator()); + + MetricsFile metrics = getMetricsFile(); + metrics.addMetric(calculator.getMetrics()); + + metrics.write(METRICS_FILE); + return 0; + } +} diff --git a/lib/edu/mit/broad/picard/directed/GenomeMask.java b/lib/edu/mit/broad/picard/directed/GenomeMask.java new file mode 100644 index 0000000000..27be5df717 --- /dev/null +++ b/lib/edu/mit/broad/picard/directed/GenomeMask.java @@ -0,0 +1,52 @@ +package edu.mit.broad.picard.directed; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.BitSet; +import java.util.SortedMap; +import java.util.TreeMap; + +/** + * Utility class to store coordinates of interest in per-sequence bitmasks. + */ +public class GenomeMask { + + // if memory usage becomes a problem... 
this could be changed to a SparseBitSet
+    // http://java.sun.com/developer/onlineTraining/collections/magercises/BitSet/index.html
+    // Per-contig bitmasks, keyed by contig index (TreeMap keeps the keys sorted ascending).
+    private SortedMap data = new TreeMap();
+
+
+    /** Creates an empty mask containing no contigs. */
+    public GenomeMask() {
+    }
+
+    /**
+     * Returns true if the bit for the given position on the given contig is set;
+     * false if the bit is clear or no mask exists for that contig.
+     */
+    public boolean get(int contig, int position) {
+        BitSet bits = data.get(contig);
+        return (bits != null) && bits.get(position);
+    }
+
+    /** Returns the underlying BitSet for the given contig, or null if the contig has no mask. */
+    public BitSet get(int contig) {
+        return data.get(contig);
+    }
+
+    /**
+     * Get an existing BitSet for the given contig, or create one if not already present. This is
+     * useful when initializing a GenomeMask from an external source.
+     * @param contig which BitSet
+     * @param numBits if there was not already a BitSet for this contig, one is created and initialized to this size.
+     * @return the BitSet for the given contig, creating one if necessary
+     */
+    public BitSet getOrCreate(int contig, int numBits) {
+        BitSet ret = data.get(contig);
+        if (ret == null) {
+            ret = new BitSet(numBits);
+            data.put(contig, ret);
+        }
+        return ret;
+    }
+
+    /**
+     * Returns the largest contig index that has a mask.
+     * Note: TreeMap.lastKey() throws NoSuchElementException if no contigs have been
+     * added, so callers must not invoke this on an empty mask.
+     */
+    public int getMaxContig() {
+        return data.lastKey();
+    }
+}
diff --git a/lib/edu/mit/broad/picard/directed/GenomeMaskFactory.java b/lib/edu/mit/broad/picard/directed/GenomeMaskFactory.java
new file mode 100644
index 0000000000..ba81a7eb6e
--- /dev/null
+++ b/lib/edu/mit/broad/picard/directed/GenomeMaskFactory.java
@@ -0,0 +1,47 @@
+/*
+* The Broad Institute
+* SOFTWARE COPYRIGHT NOTICE AGREEMENT
+* This software and its documentation are copyright 2009 by the
+* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+*
+* This software is supplied without any warranty or guaranteed support whatsoever. Neither
+* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+*/ +package edu.mit.broad.picard.directed; + +import edu.mit.broad.sam.SAMFileHeader; +import edu.mit.broad.picard.util.Interval; +import edu.mit.broad.picard.io.IoUtil; + +import java.util.List; +import java.util.BitSet; +import java.io.File; + +/** + * Create a GenomeMask from an IntervalList or a file containing an IntervalList + */ +public class GenomeMaskFactory { + + public GenomeMask makeGenomeMaskFromIntervalList(IntervalList intervalList) { + if (intervalList.getHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) { + intervalList.sort(); + } + List uniqueIntervals = intervalList.getUniqueIntervals(); + GenomeMask ret = new GenomeMask(); + + SAMFileHeader samHeader = intervalList.getHeader(); + + for (Interval interval : uniqueIntervals) { + // TODO: Maybe figure out more intelligently how big the bitset might be? + BitSet bitSet = ret.getOrCreate(samHeader.getSequenceIndex(interval.getSequence()), interval.getEnd() + 1); + bitSet.set(interval.getStart(), interval.getEnd() + 1); + } + return ret; + } + + public GenomeMask makeGenomeMaskFromIntervalList(File intervalListFile) { + IoUtil.assertFileIsReadable(intervalListFile); + IntervalList intervalList = IntervalList.fromFile(intervalListFile); + return makeGenomeMaskFromIntervalList(intervalList); + } +} diff --git a/lib/edu/mit/broad/picard/directed/HsMetrics.java b/lib/edu/mit/broad/picard/directed/HsMetrics.java new file mode 100644 index 0000000000..74817f9198 --- /dev/null +++ b/lib/edu/mit/broad/picard/directed/HsMetrics.java @@ -0,0 +1,108 @@ +package edu.mit.broad.picard.directed; + +import edu.mit.broad.picard.metrics.MetricBase; + +/** + * The set of metrics captured that are specific to a hybrid selection analysis. + * + * @author Tim Fennell + */ +public class HsMetrics extends MetricBase { + /** The name of the bait set used in the hybrid selection. */ + public String BAIT_SET; + + /** The number of bases in the reference genome used for alignment. 
*/
+    public long GENOME_SIZE;
+
+    /** The number of bases which have one or more baits on top of them. */
+    public long BAIT_TERRITORY;
+
+    /** The unique number of target bases in the experiment where target is usually exons etc. */
+    public long TARGET_TERRITORY;
+
+    /** Target territory / bait territory. 1 == perfectly efficient, 0.5 = half of baited bases are not target. */
+    public double BAIT_DESIGN_EFFICIENCY;
+
+    /** The total number of reads in the SAM or BAM file examined. */
+    public int TOTAL_READS;
+
+    /** The number of reads that pass the vendor's filter. */
+    public int PF_READS;
+
+    /** The number of PF reads that are not marked as duplicates. */
+    public int PF_UNIQUE_READS;
+
+    /** PF reads / total reads. The percent of reads passing filter. */
+    public double PCT_PF_READS;
+
+    /** PF Unique Reads / Total Reads. */
+    public double PCT_PF_UQ_READS;
+
+    /** The number of PF reads that are aligned with mapping quality > 0 to the reference genome. */
+    public int PF_READS_ALIGNED;
+
+    /** PF Reads Aligned / PF Reads. */
+    public double PCT_PF_READS_ALIGNED;
+
+    /** The number of bases in the PF aligned reads that are mapped to a reference base. Accounts for clipping and gaps. */
+    public int PF_BASES_ALIGNED;
+
+    /** The number of PF aligned bases that mapped to a baited region of the genome. */
+    public long ON_BAIT_BASES;
+
+    /** The number of PF aligned bases that mapped to within a fixed interval of a baited region, but not on a baited region. */
+    public long NEAR_BAIT_BASES;
+
+    /** The number of PF aligned bases that mapped to neither on nor near a bait. */
+    public long OFF_BAIT_BASES;
+
+    /** The number of PF aligned bases that mapped to a targeted region of the genome. */
+    public long ON_TARGET_BASES;
+
+    /** On+Near Bait Bases / PF Bases Aligned. */
+    public double PCT_SELECTED_BASES;
+
+    /** The percentage of aligned PF bases that mapped neither on nor near a bait.
*/ + public double PCT_OFF_BAIT; + + /** The percentage of on+near bait bases that are on as opposed to near. */ + public double ON_BAIT_VS_SELECTED; + + /** The mean coverage of all baits in the experiment. */ + public double MEAN_BAIT_COVERAGE; + + /** The mean coverage of targets that recieved at least coverage depth = 2 at one base. */ + public double MEAN_TARGET_COVERAGE; + + /** The fold by which the baited region has been amplified above genomic background. */ + public double FOLD_ENRICHMENT; + + /** The number of targets that did not reach coverage=2 over any base. */ + public double ZERO_CVG_TARGETS_PCT; + + /** + * The fold over-coverage necessary to raise 80% of bases in "non-zero-cvg" targets to + * the mean coverage level in those targets. + */ + public double FOLD_80_BASE_PENALTY; + + + /** + * Calculates the metrics in this class that can be derived from other metrics in the class. + */ + public void calculateDerivedMetrics() { + BAIT_DESIGN_EFFICIENCY = (double) TARGET_TERRITORY / (double) BAIT_TERRITORY; + + PCT_PF_READS = PF_READS / (double) TOTAL_READS; + PCT_PF_UQ_READS = PF_UNIQUE_READS / (double) TOTAL_READS; + PCT_PF_READS_ALIGNED = PF_READS_ALIGNED / (double) PF_UNIQUE_READS; + + double denominator = (ON_BAIT_BASES + NEAR_BAIT_BASES + OFF_BAIT_BASES); + + PCT_SELECTED_BASES = (ON_BAIT_BASES + NEAR_BAIT_BASES) / denominator; + PCT_OFF_BAIT = OFF_BAIT_BASES / denominator; + ON_BAIT_VS_SELECTED = ON_BAIT_BASES / (double) (ON_BAIT_BASES + NEAR_BAIT_BASES); + MEAN_BAIT_COVERAGE = ON_BAIT_BASES / (double) BAIT_TERRITORY; + FOLD_ENRICHMENT = (ON_BAIT_BASES/ denominator) / ((double) BAIT_TERRITORY / GENOME_SIZE); + } +} diff --git a/lib/edu/mit/broad/picard/directed/HsMetricsCalculator.java b/lib/edu/mit/broad/picard/directed/HsMetricsCalculator.java new file mode 100644 index 0000000000..a454642a79 --- /dev/null +++ b/lib/edu/mit/broad/picard/directed/HsMetricsCalculator.java @@ -0,0 +1,207 @@ +package edu.mit.broad.picard.directed; + +import 
edu.mit.broad.picard.util.*; +import edu.mit.broad.sam.SAMFileReader; +import edu.mit.broad.sam.SAMRecord; +import edu.mit.broad.sam.AlignmentBlock; +import edu.mit.broad.sam.SAMSequenceRecord; + +import java.util.*; +import java.io.*; + +/** + * Calculates HS metrics for a given SAM or BAM file. Requires the input of a list of + * target intervals and a list of bait intervals. Can be invoked either on an entire + * iterator of SAMRecords or be passed SAMRecords one at a time. + * + * @author Tim Fennell + */ +public class HsMetricsCalculator { + // What is considered "near" to the bait + private static final int NEAR_BAIT_DISTANCE = 250; + private static final Log log = Log.getInstance(HsMetricsCalculator.class); + + // Holds file names and other parameter related junk + private SAMFileReader sam; + private File baitFile; + private File targetFile; + private IntervalList baits; + private IntervalList targets; + + // Overlap detector for finding overlaps between reads and the experimental targets + private OverlapDetector targetDetector = new OverlapDetector(0,0); + + // Overlap detector for finding overlaps between the reads and the baits (and the near bait space) + private OverlapDetector baitDetector = new OverlapDetector(-NEAR_BAIT_DISTANCE,0); + + // A Map to accumulate per-bait-region (i.e. merge of overlapping baits) coverage. */ + private Map coverageByTarget = new HashMap(); + + private HsMetrics metrics = new HsMetrics(); + + /** + * Constructor that parses the squashed reference to genome reference file and stores the + * information in a map for later use. 
+ */ + public HsMetricsCalculator(File baits, File targets) { + this.baitFile = baits; + this.targetFile = targets; + this.baits = IntervalList.fromFile(baits); + this.targets = IntervalList.fromFile(targets); + + this.metrics.BAIT_SET = baits.getName(); + int tmp = this.metrics.BAIT_SET.indexOf("."); + if (tmp > 0) { + this.metrics.BAIT_SET = this.metrics.BAIT_SET.substring(0, tmp); + } + + List uniqueBaits = this.baits.getUniqueIntervals(); + this.baitDetector.addAll(uniqueBaits, uniqueBaits); + this.metrics.BAIT_TERRITORY = Interval.countBases(uniqueBaits); + + List uniqueTargets = this.targets.getUniqueIntervals(); + this.targetDetector.addAll(uniqueTargets, uniqueTargets); + this.metrics.TARGET_TERRITORY = Interval.countBases(uniqueTargets); + + for (SAMSequenceRecord seq : this.baits.getHeader().getSequences()) { + this.metrics.GENOME_SIZE += seq.getSequenceLength(); + } + + // Populate the coverage by target map + for (Interval target : this.targets.getIntervals()) { + this.coverageByTarget.put(target, new Coverage(target, 0)); + } + } + + /** Iterates over all records in the file and collects metrics. */ + public void analyze(Iterator records) { + int i = 0; + while (records.hasNext()) { + analyze(records.next()); + + if (++i % 1000000 == 0) { + log.info("Processed " + i + " records so far."); + } + } + } + + /** Adds information about an individual SAMRecord to the statistics. 
*/ + public void analyze(SAMRecord rec) { + // Just plain avoid records that are marked as not-primary + if (rec.getNotPrimaryAlignmentFlag()) return; + + this.metrics.TOTAL_READS += 1; + + // Check for PF reads + if (rec.getReadFailsVendorQualityCheckFlag()) { + return; + } + else { + ++this.metrics.PF_READS; + } + + // Check for reads that are marked as duplicates + if (rec.getDuplicateReadFlag()) { + return; + } + else { + ++this.metrics.PF_UNIQUE_READS; + } + + // Don't bother with reads that didn't align uniquely + if (rec.getReadUnmappedFlag() || rec.getMappingQuality() == 0) { + return; + } + + this.metrics.PF_READS_ALIGNED += 1; + for (AlignmentBlock block : rec.getAlignmentBlocks()) { + this.metrics.PF_BASES_ALIGNED += block.getLength(); + } + + Interval read = new Interval(rec.getReferenceName(), rec.getAlignmentStart(), rec.getAlignmentEnd()); + + // Find the target overlaps + Collection targets = this.targetDetector.getOverlaps(read); + if (targets != null && !targets.isEmpty()) { + for (Interval target : targets) { + Coverage coverage = this.coverageByTarget.get(target); + + for (AlignmentBlock block : rec.getAlignmentBlocks()) { + int end = CoordMath.getEnd(block.getReferenceStart(), block.getLength()); + for (int pos=block.getReferenceStart(); pos<=end; ++ pos) { + if (pos >= target.getStart() && pos <= target.getEnd()) { + ++this.metrics.ON_TARGET_BASES; + coverage.addBase(pos - target.getStart()); + } + } + } + } + } + + // Now do the bait overlaps + int mappedBases = 0; + for (AlignmentBlock block : rec.getAlignmentBlocks()) mappedBases += block.getLength(); + Collection baits = this.baitDetector.getOverlaps(read); + int onBaitBases = 0; + + if (baits != null && !baits.isEmpty()) { + for (Interval bait : baits) { + for (AlignmentBlock block : rec.getAlignmentBlocks()) { + int end = CoordMath.getEnd(block.getReferenceStart(), block.getLength()); + + for (int pos=block.getReferenceStart(); pos<=end; ++pos) { + if (pos >= bait.getStart() && pos <= 
bait.getEnd()) ++onBaitBases; + } + } + } + + this.metrics.ON_BAIT_BASES += onBaitBases; + this.metrics.NEAR_BAIT_BASES += (mappedBases - onBaitBases); + } + else { + this.metrics.OFF_BAIT_BASES += mappedBases; + } + + } + + /** Calculates a few last summary metrics and then returns the metrics calculated. */ + public HsMetrics getMetrics() { + this.metrics.calculateDerivedMetrics(); + calculateTargetCoverageMetrics(); + return this.metrics; + } + + /** Calculates how much additional sequencing is needed to raise 80% of bases to the mean for the lane. */ + private void calculateTargetCoverageMetrics() { + short[] depths = new short[(int) this.metrics.TARGET_TERRITORY]; // may not use entire array + int zeroCoverageTargets = 0; + int depthIndex = 0; + double totalCoverage = 0; + int basesConsidered = 0; + + for (Coverage c : this.coverageByTarget.values()) { + if (!c.hasCoverage()) { + ++zeroCoverageTargets; + continue; + } + + final short[] targetDepths = c.getDepths(); + basesConsidered += targetDepths.length; + + for (short depth : targetDepths) { + depths[depthIndex++] = depth; + totalCoverage += depth; + } + } + + this.metrics.MEAN_TARGET_COVERAGE = totalCoverage / basesConsidered; + + // Sort the array (ASCENDING) and then find the base the coverage value that lies at the 80% + // line, which is actually at 20% into the array now + Arrays.sort(depths); + int indexOf80thPercentile = (depths.length - basesConsidered) + (int) (basesConsidered * 0.2); + int coverageAt80thPercentile = depths[indexOf80thPercentile]; + this.metrics.FOLD_80_BASE_PENALTY = this.metrics.MEAN_TARGET_COVERAGE / coverageAt80thPercentile; + this.metrics.ZERO_CVG_TARGETS_PCT = zeroCoverageTargets / (double) this.targets.getIntervals().size(); + } +} diff --git a/lib/edu/mit/broad/picard/directed/IntervalList.java b/lib/edu/mit/broad/picard/directed/IntervalList.java new file mode 100644 index 0000000000..087537c0a2 --- /dev/null +++ b/lib/edu/mit/broad/picard/directed/IntervalList.java @@ 
package edu.mit.broad.picard.directed;

import edu.mit.broad.picard.util.Interval;
import edu.mit.broad.picard.util.FormatUtil;
import edu.mit.broad.picard.io.IoUtil;
import edu.mit.broad.picard.PicardException;
import edu.mit.broad.sam.SAMFileHeader;
import edu.mit.broad.sam.SAMTextHeaderCodec;
import edu.mit.broad.sam.util.StringLineReader;

import java.util.*;
import java.io.*;

/**
 * Represents a list of intervals against a reference sequence that can be written to
 * and read from a file. The file format is relatively simple and reflects the SAM
 * alignment format to a degree.
 *
 * A SAM style header must be present in the file which lists the sequence records
 * against which the intervals are described. After the header the file then contains
 * records one per line in text format with the following values tab-separated:
 * - Sequence name
 * - Start position (1-based)
 * - End position (1-based, end inclusive)
 * - Strand (either + or -)
 * - Interval name (an, ideally unique, name for the interval)
 *
 * @author Tim Fennell
 */
public class IntervalList implements Iterable<Interval> {
    private SAMFileHeader header;
    private List<Interval> intervals = new ArrayList<Interval>();

    /**
     * Constructs a new interval list using the supplied header information.
     * @throws IllegalArgumentException if header is null
     */
    public IntervalList(SAMFileHeader header) {
        if (header == null) {
            throw new IllegalArgumentException("SAMFileHeader must be supplied.");
        }
        this.header = header;
    }

    /** Gets the header (if there is one) for the interval list. */
    public SAMFileHeader getHeader() { return header; }

    /** Returns an iterator over the intervals. */
    public Iterator<Interval> iterator() { return this.intervals.iterator(); }

    /** Adds an interval to the list of intervals. */
    public void add(Interval interval) { this.intervals.add(interval); }

    /** Sorts the internal collection of intervals by coordinate and records that order in the header. */
    public void sort() {
        Collections.sort(this.intervals, new IntervalCoordinateComparator(this.header));
        this.header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    }

    /** Gets the set of intervals as held internally (unmodifiable view). */
    public List<Interval> getIntervals() {
        return Collections.unmodifiableList(this.intervals);
    }

    /**
     * Merges the list of intervals and then reduces them down where regions overlap
     * or are directly adjacent to one another. During this process the "merged" interval
     * will retain the strand and name of the 5' most interval merged.
     *
     * NOTE(review): assumes the intervals are coordinate-sorted (see sort()); merging
     * only considers neighbouring entries.
     *
     * @return the set of unique intervals condensed from the contained intervals
     */
    public List<Interval> getUniqueIntervals() {
        List<Interval> unique = new ArrayList<Interval>();

        // Fix: the original called iterator.next() unconditionally and threw
        // NoSuchElementException when the list was empty.
        if (this.intervals.isEmpty()) {
            return unique;
        }

        ListIterator<Interval> iterator = this.intervals.listIterator();
        Interval previous = iterator.next();

        while (iterator.hasNext()) {
            Interval next = iterator.next();
            if (previous.intersects(next) || previous.abuts(next)) {
                // Extend the running interval; keep the 5'-most strand and name
                previous = new Interval(previous.getSequence(),
                                        previous.getStart(),
                                        Math.max(previous.getEnd(), next.getEnd()),
                                        previous.isNegativeStrand(),
                                        previous.getName());
            }
            else {
                unique.add(previous);
                previous = next;
            }
        }

        unique.add(previous);

        return unique;
    }

    /** Gets the (potentially redundant) sum of the length of the intervals in the list. */
    public long getBaseCount() {
        return Interval.countBases(this.intervals);
    }

    /** Gets the count of unique bases represented by the intervals in the list. */
    public long getUniqueBaseCount() {
        return Interval.countBases(getUniqueIntervals());
    }

    /**
     * Parses an interval list from a file.
     * @param file the file containing the intervals
     * @return an IntervalList object that contains the headers and intervals from the file
     * @throws IllegalStateException if the file has no @-prefixed header
     * @throws PicardException on malformed records or I/O errors
     */
    public static IntervalList fromFile(File file) {
        BufferedReader in = new BufferedReader(new InputStreamReader(IoUtil.openFileForReading(file)));

        try {
            // Setup a reader and parse the header
            StringBuilder builder = new StringBuilder(4096);
            String line = null;

            while ((line = in.readLine()) != null) {
                if (line.startsWith("@")) {
                    builder.append(line).append('\n');
                }
                else {
                    break;
                }
            }

            if (builder.length() == 0) {
                throw new IllegalStateException("Interval list file must contain header: " + file.getAbsolutePath());
            }

            StringLineReader headerReader = new StringLineReader(builder.toString());
            SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
            IntervalList list = new IntervalList(codec.decode(headerReader, file));

            // 'line' now holds the first non-header line, or null if the file contained
            // only a header. Fix: the original do/while dereferenced line without a null
            // check and NPE'd on header-only files.
            FormatUtil format = new FormatUtil();
            while (line != null) {
                if (line.trim().length() > 0) { // skip over blank lines
                    // Make sure we have the right number of fields
                    String[] fields = line.split("\t");
                    if (fields.length != 5) {
                        throw new PicardException("Invalid interval record contains " +
                                                  fields.length + " fields: " + line);
                    }

                    // Then parse them out
                    String seq = fields[0];
                    int start = format.parseInt(fields[1]);
                    int end = format.parseInt(fields[2]);

                    boolean negative;
                    if (fields[3].equals("-")) negative = true;
                    else if (fields[3].equals("+")) negative = false;
                    else throw new IllegalArgumentException("Invalid strand field: " + fields[3]);

                    String name = fields[4];

                    list.intervals.add(new Interval(seq, start, end, negative, name));
                }

                line = in.readLine();
            }

            return list;
        }
        catch (IOException ioe) {
            throw new PicardException("Error parsing interval list file: " + file.getAbsolutePath(), ioe);
        }
        finally {
            try { in.close(); } catch (Exception e) { /* do nothing */ }
        }
    }

    /**
     * Writes out the list of intervals to the supplied file.
     * @param file a file to write to. If exists it will be overwritten.
     * @throws PicardException on I/O errors
     */
    public void write(File file) {
        try {
            BufferedWriter out = new BufferedWriter(new OutputStreamWriter(IoUtil.openFileForWriting(file)));
            // Fix: close the writer in finally so it is not leaked when an
            // exception is thrown mid-write.
            try {
                FormatUtil format = new FormatUtil();

                // Write out the header
                if (this.header != null) {
                    SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
                    codec.encode(out, this.header);
                }

                // Write out the intervals
                for (Interval interval : this) {
                    out.write(interval.getSequence());
                    out.write('\t');
                    out.write(format.format(interval.getStart()));
                    out.write('\t');
                    out.write(format.format(interval.getEnd()));
                    out.write('\t');
                    out.write(interval.isPositiveStrand() ? '+' : '-');
                    out.write('\t');
                    out.write(interval.getName());
                    out.newLine();
                }

                out.flush();
            }
            finally {
                out.close();
            }
        }
        catch (IOException ioe) {
            throw new PicardException("Error writing out interval list to file: " + file.getAbsolutePath(), ioe);
        }
    }
}

/**
 * Comparator that orders intervals based on their sequence index, by coordinate
 * then by strand and finally by name.
 */
class IntervalCoordinateComparator implements Comparator<Interval> {
    private SAMFileHeader header;

    /** Constructs a comparator using the supplied sequence header. */
    IntervalCoordinateComparator(SAMFileHeader header) {
        this.header = header;
    }

    public int compare(Interval lhs, Interval rhs) {
        int lhsIndex = this.header.getSequenceIndex(lhs.getSequence());
        int rhsIndex = this.header.getSequenceIndex(rhs.getSequence());
        // Subtraction is safe here: sequence indices and genomic coordinates are
        // non-negative ints, so the difference cannot overflow.
        int retval = lhsIndex - rhsIndex;

        if (retval == 0) retval = lhs.getStart() - rhs.getStart();
        if (retval == 0) retval = lhs.getEnd() - rhs.getEnd();
        if (retval == 0) {
            if (lhs.isPositiveStrand() && rhs.isNegativeStrand()) retval = -1;
            else if (lhs.isNegativeStrand() && rhs.isPositiveStrand()) retval = 1;
        }
        if (retval == 0) {
            retval = lhs.getName().compareTo(rhs.getName());
        }

        return retval;
    }
}
+ */ +public class AggregateFilter implements SamRecordFilter { + + private final List filters; + + /** + * Constructor + * @param filters the list of filters that this Aggregator applies + */ + public AggregateFilter(List filters) { + this.filters = filters; + } + + /** + * Determines whether a SAMRecord matches this filter + * + * @param record the SAMRecord to evaluate + * @return true if the SAMRecord matches at least one filter, otherwise false + */ + public boolean filterOut(SAMRecord record) { + for (SamRecordFilter filter : filters) { + if (filter.filterOut(record)) { + return true; + } + } + return false; + } +} diff --git a/lib/edu/mit/broad/picard/filter/FailsVendorReadQualityFilter.java b/lib/edu/mit/broad/picard/filter/FailsVendorReadQualityFilter.java new file mode 100644 index 0000000000..3e0c9bb3ff --- /dev/null +++ b/lib/edu/mit/broad/picard/filter/FailsVendorReadQualityFilter.java @@ -0,0 +1,28 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ +package edu.mit.broad.picard.filter; + +import edu.mit.broad.sam.SAMRecord; + +/** + * Filter for filtering out reads that do not pass the quality filter + */ +public class FailsVendorReadQualityFilter implements SamRecordFilter { + + /** + * Determines whether a SAMRecord matches this filter + * + * @param record the SAMRecord to evaluate + * @return true if the SAMRecord matches the filter, otherwise false + */ + public boolean filterOut(SAMRecord record) { + return record.getReadFailsVendorQualityCheckFlag(); + } +} diff --git a/lib/edu/mit/broad/picard/filter/FilteringIterator.java b/lib/edu/mit/broad/picard/filter/FilteringIterator.java new file mode 100644 index 0000000000..375036394c --- /dev/null +++ b/lib/edu/mit/broad/picard/filter/FilteringIterator.java @@ -0,0 +1,94 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2008 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.picard.filter; + +import edu.mit.broad.sam.SAMRecord; +import edu.mit.broad.sam.util.CloseableIterator; +import edu.mit.broad.picard.util.CloserUtil; + +import java.util.Iterator; +import java.util.NoSuchElementException; + +/** + * Filtering Iterator which takes a filter and an iterator and iterates + * through only those records which are not rejected by the filter. 
+ * + * @author Kathleen Tibbetts + */ +public class FilteringIterator implements CloseableIterator { + + private final Iterator iterator; + private final SamRecordFilter filter; + private SAMRecord next = null; + + /** + * Constructor + * + * @param iterator the backing iterator + * @param filter the filter (which may be a FilterAggregator) + */ + public FilteringIterator(Iterator iterator, SamRecordFilter filter) { + this.iterator = iterator; + this.filter = filter; + next = getNextRecord(); + } + + /** + * Returns true if the iteration has more elements. + * + * @return true if the iteration has more elements. Otherwise returns false. + */ + public boolean hasNext() { + return next != null; + } + + /** + * Returns the next element in the iteration. + * + * @return the next element in the iteration + * @throws java.util.NoSuchElementException + */ + public SAMRecord next() { + if (next == null) { + throw new NoSuchElementException("Iterator has no more elements."); + } + SAMRecord result = next; + next = getNextRecord(); + return result; + } + + /** + * Required method for Iterator API. 
+ * + * @throws UnsupportedOperationException + */ + public void remove() { + throw new UnsupportedOperationException("Remove() not supported by FilteringIterator"); + } + + public void close() { + CloserUtil.close(iterator); + } + + /** + * Gets the next record from the underlying iterator that passes the filter + * + * @return SAMRecord the next filter-passing record + */ + private SAMRecord getNextRecord() { + while (iterator.hasNext()) { + SAMRecord record = iterator.next(); + if (!filter.filterOut(record)) { + return next; + } + } + return null; + } +} diff --git a/lib/edu/mit/broad/picard/filter/SamRecordFilter.java b/lib/edu/mit/broad/picard/filter/SamRecordFilter.java new file mode 100644 index 0000000000..d8936ca8aa --- /dev/null +++ b/lib/edu/mit/broad/picard/filter/SamRecordFilter.java @@ -0,0 +1,26 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2008 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ +package edu.mit.broad.picard.filter; + +import edu.mit.broad.sam.SAMRecord; + +/** + * API for filtering SAMRecords + */ +public interface SamRecordFilter { + + /** + * Determines whether a SAMRecord matches this filter + * + * @param record the SAMRecord to evaluate + * @return true if the SAMRecord matches the filter, otherwise false + */ + public boolean filterOut(SAMRecord record); +} diff --git a/lib/edu/mit/broad/picard/filter/SolexaNoiseFilter.java b/lib/edu/mit/broad/picard/filter/SolexaNoiseFilter.java new file mode 100644 index 0000000000..9969ae2e3a --- /dev/null +++ b/lib/edu/mit/broad/picard/filter/SolexaNoiseFilter.java @@ -0,0 +1,37 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2008 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.picard.filter; + +import edu.mit.broad.picard.util.SequenceUtil; +import edu.mit.broad.sam.SAMRecord; + +/** + * Filter to determine whether a read is "noisy" due to a poly-A run that is a sequencing artifact. + * Currently we filter out only reads that are composed entirely of As. 
+ */ +public class SolexaNoiseFilter implements SamRecordFilter { + + /** + * Determines whether a SAMRecord matches this filter + * + * @param record the SAMRecord to evaluate + * @return true if the SAMRecord matches the filter, otherwise false + */ + public boolean filterOut(SAMRecord record) { + byte sequence[] = record.getReadBases(); + for (byte base : sequence) { + if (base != 'A' && base != 'a' && + !SequenceUtil.isNoCall(base)) { + return false; + } + } + return true; + } +} diff --git a/lib/edu/mit/broad/picard/filter/TagFilter.java b/lib/edu/mit/broad/picard/filter/TagFilter.java new file mode 100644 index 0000000000..f35957ba09 --- /dev/null +++ b/lib/edu/mit/broad/picard/filter/TagFilter.java @@ -0,0 +1,56 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ +package edu.mit.broad.picard.filter; + +import edu.mit.broad.sam.SAMRecord; + +import java.util.List; +import java.util.Arrays; + +/** + * Filter class for matching tag attributes in SAMRecords + */ +public class TagFilter implements SamRecordFilter { + + private final String tag; // The key of the tag to match + private final List values; // The list of matching values + + /** + * Constructor for a single value + * + * @param tag the key of the tag to match + * @param value the value to match + */ + public TagFilter(String tag, Object value) { + this.tag = tag; + this.values = Arrays.asList(value); + } + + /** + * Constructor for multiple values + * + * @param tag the key of the tag to match + * @param values the matching values + */ + public TagFilter(String tag, List values) { + this.tag = tag; + this.values = values; + } + + /** + * Determines whether a SAMRecord matches this filter + * + * @param record the SAMRecord to evaluate + * @return true if the SAMRecord matches the filter, otherwise false + */ + public boolean filterOut(SAMRecord record) { + return values.contains(record.getAttribute(tag)); + } + } diff --git a/lib/edu/mit/broad/picard/genotype/GeliException.java b/lib/edu/mit/broad/picard/genotype/GeliException.java new file mode 100644 index 0000000000..5d6fed76c1 --- /dev/null +++ b/lib/edu/mit/broad/picard/genotype/GeliException.java @@ -0,0 +1,30 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ + +package edu.mit.broad.picard.genotype; + +import edu.mit.broad.picard.PicardException; + +/** + * Generic exception thrown by GELI format machinery. 
+ * + * @author Doug Voet + */ +public class GeliException extends PicardException { + + public GeliException(String message, Throwable throwable) { + super(message, throwable); + } + + public GeliException(String message) { + super(message); + } + +} diff --git a/lib/edu/mit/broad/picard/genotype/GeliFileConstants.java b/lib/edu/mit/broad/picard/genotype/GeliFileConstants.java new file mode 100644 index 0000000000..6f14962511 --- /dev/null +++ b/lib/edu/mit/broad/picard/genotype/GeliFileConstants.java @@ -0,0 +1,20 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ + +package edu.mit.broad.picard.genotype; + +/** + * Misc constants for GELI format + * + * @author Doug Voet + */ +public interface GeliFileConstants { + public static final byte[] GELI_MAGIC = "GELI".getBytes(); +} diff --git a/lib/edu/mit/broad/picard/genotype/GeliFileReader.java b/lib/edu/mit/broad/picard/genotype/GeliFileReader.java new file mode 100644 index 0000000000..de72b1639d --- /dev/null +++ b/lib/edu/mit/broad/picard/genotype/GeliFileReader.java @@ -0,0 +1,103 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. 
package edu.mit.broad.picard.genotype;


import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import edu.mit.broad.sam.SAMFileHeader;
import edu.mit.broad.sam.util.BlockCompressedInputStream;
import edu.mit.broad.sam.util.CloseableIterator;
import edu.mit.broad.sam.util.RuntimeIOException;


/**
 * Class for reading GELI (GEnotype LIkelihood) files.
 *
 * @author Doug Voet
 */
public class GeliFileReader implements Iterable<GenotypeLikelihoods>
{
    private ReaderImplementation mReader = null;

    /**
     * Internal interface for SAM/BAM file reader implementations.
     * Implemented as an abstract class to enforce better access control.
     */
    static abstract class ReaderImplementation {
        abstract SAMFileHeader getFileHeader();
        abstract CloseableIterator<GenotypeLikelihoods> getIterator();
        abstract void close();
    }


    /**
     * Opens a reader over an arbitrary stream. The stream remains owned by the
     * caller and is not closed on validation failure.
     * @throws GeliException if the stream is not block-compressed GELI data
     */
    public GeliFileReader(final InputStream stream) {
        try {
            final BufferedInputStream bufferedStream = toBufferedStream(stream);
            if (isValidGELIFile(bufferedStream)) {
                mReader = new GeliFileReaderImplementation(bufferedStream);
            } else {
                throw new GeliException("Unrecognized file format");
            }
        } catch (IOException e) {
            throw new RuntimeIOException(e);
        }
    }

    /**
     * Opens a reader over a file. A temporary probe stream is used to validate the
     * format and is always closed — the original leaked it when isValidGELIFile
     * threw an IOException.
     * @throws GeliException if the file is not block-compressed GELI data
     */
    public GeliFileReader(final File file) {
        try {
            final boolean valid;
            final BufferedInputStream bufferedStream =
                new BufferedInputStream(new FileInputStream(file));
            try {
                valid = isValidGELIFile(bufferedStream);
            } finally {
                bufferedStream.close();
            }

            if (valid) {
                mReader = new GeliFileReaderImplementation(file);
            } else {
                throw new GeliException("Unrecognized file format");
            }
        } catch (IOException e) {
            throw new RuntimeIOException(e);
        }
    }

    /** Closes the underlying reader; safe to call more than once. */
    public void close() {
        if (mReader != null) {
            mReader.close();
        }
        mReader = null;
    }

    public SAMFileHeader getFileHeader() {
        return mReader.getFileHeader();
    }

    public CloseableIterator<GenotypeLikelihoods> iterator() {
        return mReader.getIterator();
    }

    // A valid GELI file is simply a valid block-compressed (BGZF) stream here;
    // the GELI magic is checked later when the header is read.
    private boolean isValidGELIFile(final InputStream stream)
        throws IOException {
        return BlockCompressedInputStream.isValidFile(stream);
    }

    private BufferedInputStream toBufferedStream(final InputStream stream) {
        if (stream instanceof BufferedInputStream) {
            return (BufferedInputStream) stream;
        } else {
            return new BufferedInputStream(stream);
        }
    }
}
+ */ +class GeliFileReaderImplementation extends GeliFileReader.ReaderImplementation { + + private boolean mIsSeekable = false; + private BinaryCodec mStream = null; + private final BlockCompressedInputStream mCompressedInputStream; + private SAMFileHeader mFileHeader = null; + private long mFirstRecordPointer = 0; + private CloseableIterator mCurrentIterator = null; + + + GeliFileReaderImplementation(final InputStream stream) + throws IOException { + mIsSeekable = false; + mCompressedInputStream = new BlockCompressedInputStream(stream); + mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream)); + readHeader(null); + } + + GeliFileReaderImplementation(final File file) + throws IOException { + mIsSeekable = true; + mCompressedInputStream = new BlockCompressedInputStream(file); + mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream)); + readHeader(file); + mFirstRecordPointer = mCompressedInputStream.getFilePointer(); + } + + void close() { + if (mStream != null) { + mStream.close(); + } + mStream = null; + mFileHeader = null; + } + + SAMFileHeader getFileHeader() { + return mFileHeader; + } + + CloseableIterator getIterator() { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (mIsSeekable) { + try { + mCompressedInputStream.seek(mFirstRecordPointer); + } catch (IOException exc) { + throw new RuntimeException(exc.getMessage(), exc); + } + } + mCurrentIterator = new GELIFileIterator(); + return mCurrentIterator; + } + + private void readHeader(final File file) + throws IOException { + + final byte[] buffer = new byte[4]; + mStream.readBytes(buffer); + if (!Arrays.equals(buffer, GeliFileConstants.GELI_MAGIC)) { + throw new IOException("Invalid GELI file header"); + } + + final int headerTextLength = mStream.readInt(); + final String textHeader = mStream.readString(headerTextLength); + mFileHeader = 
new SAMTextHeaderCodec().decode(new StringLineReader(textHeader), + file); + + final int sequenceCount = mStream.readInt(); + if (sequenceCount != mFileHeader.getSequences().size()) { + throw new GeliException("Number of sequences in text header (" + mFileHeader.getSequences().size() + + ") != number of sequences in binary header (" + sequenceCount + ") for file " + file); + } + for (int i = 0; i < sequenceCount; i++) { + readSequenceRecord(file); +// final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i); +// if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) { +// throw new GELIException("For sequence " + i + ", text and binary have different names in file " + +// file); +// } +// if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) { +// throw new GELIException("For sequence " + i + ", text and binary have different lengths in file " + +// file); +// } + } + } + + private SAMSequenceRecord readSequenceRecord(final File file) { + final int nameLength = mStream.readInt(); + if (nameLength <= 1) { + throw new GeliException("Invalid BAM file header: missing sequence name in file " + file); + } + final String sequenceName = mStream.readString(nameLength - 1); + // Skip the null terminator + mStream.readByte(); + final int sequenceLength = mStream.readInt(); + final SAMSequenceRecord record = new SAMSequenceRecord(sequenceName); + record.setSequenceLength(sequenceLength); + return record; + } + + private class GELIFileIterator + implements CloseableIterator { + + private GenotypeLikelihoods mNextRecord = null; + private final GenotypeLikelihoodsCodec likelihoodsCodec = new GenotypeLikelihoodsCodec(); + + + GELIFileIterator() { + this(true); + } + + GELIFileIterator(final boolean advance) { + likelihoodsCodec.setInputStream(mStream.getInputStream()); + if (advance) { + advance(); + } + } + + public void close() { + if (this != mCurrentIterator) { + throw new IllegalStateException("Attempt to 
close non-current iterator"); + } + mCurrentIterator = null; + } + + public boolean hasNext() { + return (mNextRecord != null); + } + + public GenotypeLikelihoods next() { + final GenotypeLikelihoods result = mNextRecord; + advance(); + return result; + } + + public void remove() { + throw new UnsupportedOperationException("Not supported: remove"); + } + + void advance() { + try { + mNextRecord = getNextRecord(); + } catch (IOException exc) { + throw new RuntimeException(exc.getMessage(), exc); + } + } + + GenotypeLikelihoods getNextRecord() + throws IOException { + return likelihoodsCodec.decode(); + } + } +} diff --git a/lib/edu/mit/broad/picard/genotype/GeliFileWriter.java b/lib/edu/mit/broad/picard/genotype/GeliFileWriter.java new file mode 100644 index 0000000000..84196b2392 --- /dev/null +++ b/lib/edu/mit/broad/picard/genotype/GeliFileWriter.java @@ -0,0 +1,168 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.picard.genotype; + +import java.io.DataOutputStream; +import java.io.File; +import java.io.StringWriter; + +import edu.mit.broad.picard.genotype.GenotypeLikelihoods.GenotypeLikelihoodsComparator; +import edu.mit.broad.sam.SAMFileHeader; +import edu.mit.broad.sam.SAMSequenceRecord; +import edu.mit.broad.sam.SAMTextHeaderCodec; +import edu.mit.broad.sam.SAMFileHeader.SortOrder; +import edu.mit.broad.sam.util.BinaryCodec; +import edu.mit.broad.sam.util.BlockCompressedOutputStream; +import edu.mit.broad.sam.util.SortingCollection; + +/** + * Class for writing GELI (GEnotype LIkelihood) files. 
+ */ +public class GeliFileWriter { + private static final int MAX_RECORDS_IN_RAM = 1000000; + private SAMFileHeader.SortOrder sortOrder = SortOrder.coordinate; + private SAMFileHeader header; + private SortingCollection likelihoodsSorter; + + // These two fields are for validating presorted records. + private GenotypeLikelihoods prevLikelihoods; + private GenotypeLikelihoodsComparator presortedComparator; + + // If true, records passed to addAlignment are already in the order specified by sortOrder + private boolean presorted; + protected final BinaryCodec outputBinaryCodec; + private GenotypeLikelihoodsCodec genotypeLikelihoodsCodec = null; + + public GeliFileWriter(final File path) { + this(path, false); + } + + public GeliFileWriter(final File path, boolean presorted) { + outputBinaryCodec = new BinaryCodec(new DataOutputStream(new BlockCompressedOutputStream(path))); + outputBinaryCodec.setOutputFileName(path.toString()); + this.presorted = presorted; + } + + /** + * Must be called before addAlignment. 
+ * @param header + */ + public void setHeader(final SAMFileHeader header) + { + this.header = header; + header.setSortOrder(sortOrder); + final StringWriter headerTextBuffer = new StringWriter(); + new SAMTextHeaderCodec().encode(headerTextBuffer, header); + final String headerText = headerTextBuffer.toString(); + + writeHeader(headerText); + + if (presorted) { + presortedComparator = makeComparator(); + } else if (!sortOrder.equals(SAMFileHeader.SortOrder.unsorted)) { + likelihoodsSorter = SortingCollection.newInstance(GenotypeLikelihoods.class, + new GenotypeLikelihoodsCodec(), makeComparator(), MAX_RECORDS_IN_RAM); + } + } + + protected SAMFileHeader getHeader() { + return header; + } + + private GenotypeLikelihoodsComparator makeComparator() { + return new GenotypeLikelihoodsComparator(); + } + + public void addGenotypeLikelihoods(GenotypeLikelihoods genotypeLikelihoods) + { + if (presorted) { + assertPresorted(genotypeLikelihoods); + writeGenotypeLikelihoods(genotypeLikelihoods); + } else { + likelihoodsSorter.add(genotypeLikelihoods); + } + } + + private void assertPresorted(final GenotypeLikelihoods genotypeLikelihoods) { + if (prevLikelihoods != null) { + if (presortedComparator.compare(prevLikelihoods, genotypeLikelihoods) > 0) { + throw new IllegalArgumentException("GenotypeLikelihoods added out of order in GELIFileWriterImpl.addGenotypeLikelihoods for " + + getFilename() + ". Sort order is " + this.sortOrder + ". 
Offending records are at [" + + prevLikelihoods.getReferenceIndex() + ":" + prevLikelihoods.getPosition() + "] and [" + + genotypeLikelihoods.getReferenceIndex() + ":" + genotypeLikelihoods.getPosition() + "]"); + } + } + prevLikelihoods = genotypeLikelihoods; + } + + public final void close() + { + if (likelihoodsSorter != null) { + for (final GenotypeLikelihoods genotypeLikelihoods : likelihoodsSorter) { + writeGenotypeLikelihoods(genotypeLikelihoods); + } + likelihoodsSorter.cleanup(); + } + finish(); + } + + private void prepareToWriteAlignments() { + if (genotypeLikelihoodsCodec == null) { + genotypeLikelihoodsCodec = new GenotypeLikelihoodsCodec(); + genotypeLikelihoodsCodec.setOutputStream(outputBinaryCodec.getOutputStream()); + } + } + + /** + * Writes the record to disk. Sort order has been taken care of by the time + * this method is called. + * @param alignment + */ + protected void writeGenotypeLikelihoods(GenotypeLikelihoods genotypeLikelihoods) { + prepareToWriteAlignments(); + genotypeLikelihoodsCodec.encode(genotypeLikelihoods); + } + + /** + * Write the header to disk. Header object is available via getHeader(). + * @param textHeader for convenience if the implementation needs it. + */ + protected void writeHeader(final String textHeader) { + outputBinaryCodec.writeBytes(GeliFileConstants.GELI_MAGIC); + + // calculate and write the length of the SAM file header text and the header text + outputBinaryCodec.writeInt(textHeader.length()); + outputBinaryCodec.writeBytes(textHeader.getBytes()); + + // write the sequences binarily. 
This is redundant with the text header + outputBinaryCodec.writeInt(getHeader().getSequences().size()); + for (final SAMSequenceRecord sequenceRecord: getHeader().getSequences()) { + outputBinaryCodec.writeInt(sequenceRecord.getSequenceName().length() + 1); + outputBinaryCodec.writeBytes(sequenceRecord.getSequenceName().getBytes()); + outputBinaryCodec.writeByte(0); + outputBinaryCodec.writeInt(sequenceRecord.getSequenceLength()); + } + } + + /** + * Do any required flushing here. + */ + protected void finish() { + outputBinaryCodec.close(); + } + + /** + * For producing error messages. + * @return Output filename, or null if there isn't one. + */ + protected String getFilename() { + return outputBinaryCodec.getOutputFileName(); + } +} diff --git a/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoods.java b/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoods.java new file mode 100644 index 0000000000..d19a637c44 --- /dev/null +++ b/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoods.java @@ -0,0 +1,164 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ + +package edu.mit.broad.picard.genotype; + +import java.util.Arrays; +import java.util.Comparator; + +/** + * Data object for Genotype Likelihoods. One object represents one row in a GELI file. 
+ * + * @author Doug Voet + */ +public class GenotypeLikelihoods { + /** this is a guess at how much memory an instance of this object occupies */ + public static final int OBJECT_SIZE_BYTES = 150; + + public static final int AA_GENOTYPE = 0; + public static final int AC_GENOTYPE = 1; + public static final int AG_GENOTYPE = 2; + public static final int AT_GENOTYPE = 3; + public static final int CC_GENOTYPE = 4; + public static final int CG_GENOTYPE = 5; + public static final int CT_GENOTYPE = 6; + public static final int GG_GENOTYPE = 7; + public static final int GT_GENOTYPE = 8; + public static final int TT_GENOTYPE = 9; + + private static final char[][] GENOTYPES = { + "AA".toCharArray(), + "AC".toCharArray(), + "AG".toCharArray(), + "AT".toCharArray(), + "CC".toCharArray(), + "CG".toCharArray(), + "CT".toCharArray(), + "GG".toCharArray(), + "GT".toCharArray(), + "TT".toCharArray() + }; + + /** compares first by reference index then by position */ + public static class GenotypeLikelihoodsComparator implements Comparator { + @Override + public int compare(GenotypeLikelihoods thing1, GenotypeLikelihoods thing2) { + long refCompare = thing1.referenceIndex - thing2.referenceIndex; + if (refCompare == 0) { + long posCompare = thing1.position - thing2.position; + return (int) posCompare; + } else { + return (int) refCompare; + } + } + } + + + private long referenceIndex; + private long position; + private byte referenceBase; + private int numReads; + private short maxMappingQuality; + private float[] likelihoods = new float[10]; + private byte bestLikelihoodIndex = -1; // stored as byte to reduce memory footprint + private byte secondBestLikelihoodIndex = -1; // stored as byte to reduce memory footprint + + public static int getLikelihoodIndex(char[] genotype) { + char first = Character.isLowerCase(genotype[0]) ? Character.toUpperCase(genotype[0]) : genotype[0]; + char second = Character.isLowerCase(genotype[1]) ? 
Character.toUpperCase(genotype[1]) : genotype[1]; + if (first > second) { + char temp = first; + first = second; + second = temp; + } + for (int i=0; i>> 32)); + result = prime * result + referenceBase; + result = prime * result + (int) (referenceIndex ^ (referenceIndex >>> 32)); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + GenotypeLikelihoods other = (GenotypeLikelihoods) obj; + if (!Arrays.equals(likelihoods, other.likelihoods)) + return false; + if (maxMappingQuality != other.maxMappingQuality) + return false; + if (numReads != other.numReads) + return false; + if (position != other.position) + return false; + if (referenceBase != other.referenceBase) + return false; + if (referenceIndex != other.referenceIndex) + return false; + return true; + } + + public long getReferenceIndex() { return referenceIndex; } + public void setReferenceIndex(long sequenceIndex) { this.referenceIndex = sequenceIndex; } + public long getPosition() { return position; } + public void setPosition(long position) { this.position = position; } + public byte getReferenceBase() { return referenceBase; } + public void setReferenceBase(byte referenceBase) { this.referenceBase = referenceBase; } + public int getNumReads() { return numReads; } + public void setNumReads(int numReads) { this.numReads = numReads; } + public short getMaxMappingQuality() { return maxMappingQuality; } + public void setMaxMappingQuality(short maxMappingQuality) { this.maxMappingQuality = maxMappingQuality; } + float[] getLikelihoods() { return likelihoods; } + public int getBestLikelihoodIndex() { return bestLikelihoodIndex; } + public void setBestLikelihoodIndex(int bestLikelihoodIndex) { this.bestLikelihoodIndex = (byte) bestLikelihoodIndex; } + public int getSecondBestLikelihoodIndex() { return secondBestLikelihoodIndex; } + public void 
setSecondBestLikelihoodIndex(int secondBestLikelihoodIndex) { this.secondBestLikelihoodIndex = (byte) secondBestLikelihoodIndex; } +} diff --git a/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoodsCodec.java b/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoodsCodec.java new file mode 100644 index 0000000000..aa06799415 --- /dev/null +++ b/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoodsCodec.java @@ -0,0 +1,126 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.picard.genotype; + +import java.io.InputStream; +import java.io.OutputStream; + +import edu.mit.broad.sam.util.BinaryCodec; +import edu.mit.broad.sam.util.RuntimeEOFException; +import edu.mit.broad.sam.util.SortingCollection; + +public class GenotypeLikelihoodsCodec implements SortingCollection.Codec { + private static final int SIG_FIG_MULTIPLIER = 100; + private static final short BLOCK_SIZE = 12 + 10 * 4; + + private OutputStream os; + private InputStream is; + private BinaryCodec binaryCodec; + + /** Returns a new genotype likelihood codec. */ + public SortingCollection.Codec clone() { + return new GenotypeLikelihoodsCodec(); + } + + /** + * Write object to OutputStream. 
+ * + * @param genotypeLikelihoods what to write + */ + public void encode(final GenotypeLikelihoods genotypeLikelihoods) { + this.binaryCodec.writeShort(BLOCK_SIZE); + this.binaryCodec.writeUInt(genotypeLikelihoods.getReferenceIndex()); + this.binaryCodec.writeUInt(genotypeLikelihoods.getPosition()); + this.binaryCodec.writeByte(genotypeLikelihoods.getReferenceBase()); + this.binaryCodec.writeUShort(genotypeLikelihoods.getNumReads()); + this.binaryCodec.writeByte(genotypeLikelihoods.getMaxMappingQuality()); + + for (int i = 0; i < genotypeLikelihoods.getLikelihoods().length; i++) { + writeLikelihood(genotypeLikelihoods.getLikelihoods()[i]); + } + } + + /** + * Read the next record from the input stream and convert into a java object. + * + * @return null if no more records. Should throw exception if EOF is encountered in the middle of + * a record. + */ + public GenotypeLikelihoods decode() { + int recordLength = 0; + try { + recordLength = this.binaryCodec.readShort(); + } catch (RuntimeEOFException e) { + return null; + } + if (recordLength != BLOCK_SIZE) { + throw new GeliException("Invalid record length: " + recordLength); + } + + final GenotypeLikelihoods genotypeLikelihoods = new GenotypeLikelihoods(); + genotypeLikelihoods.setReferenceIndex(this.binaryCodec.readUInt()); + genotypeLikelihoods.setPosition(this.binaryCodec.readUInt()); + genotypeLikelihoods.setReferenceBase(this.binaryCodec.readByte()); + genotypeLikelihoods.setNumReads(this.binaryCodec.readUShort()); + genotypeLikelihoods.setMaxMappingQuality(this.binaryCodec.readByte()); + + int bestIndex = -1; + int secondBestIndex = -1; + for (int i = 0; i < genotypeLikelihoods.getLikelihoods().length; i++) { + float likelihood = readLikelihood(); + genotypeLikelihoods.getLikelihoods()[i] = likelihood; + + if (bestIndex == -1 || genotypeLikelihoods.getLikelihood(bestIndex) < likelihood) { + secondBestIndex = bestIndex; + bestIndex = i; + } else if (secondBestIndex == -1 || 
genotypeLikelihoods.getLikelihood(secondBestIndex) < likelihood) { + secondBestIndex = i; + } + } + genotypeLikelihoods.setBestLikelihoodIndex(bestIndex); + genotypeLikelihoods.setSecondBestLikelihoodIndex(secondBestIndex); + + return genotypeLikelihoods; + } + + /** + * Where to write encoded output + * + * @param os + */ + public void setOutputStream(final OutputStream os) { + this.os = os; + this.binaryCodec = new BinaryCodec(os); + } + + /** + * Where to read encoded input from + * + * @param is + */ + public void setInputStream(final InputStream is) { + this.is = is; + this.binaryCodec = new BinaryCodec(is); + } + + private void writeLikelihood(float likelihood) { + float shiftedLikelihood = likelihood * SIG_FIG_MULTIPLIER; + this.binaryCodec.writeInt((int) Math.round(shiftedLikelihood)); + } + + /** + * @return + */ + private float readLikelihood() { + float likelihood = (float) this.binaryCodec.readInt() / SIG_FIG_MULTIPLIER; + return likelihood; + } + +} diff --git a/lib/edu/mit/broad/picard/genotype/caller/AbstractAlleleCaller.java b/lib/edu/mit/broad/picard/genotype/caller/AbstractAlleleCaller.java new file mode 100644 index 0000000000..3893e7bd1f --- /dev/null +++ b/lib/edu/mit/broad/picard/genotype/caller/AbstractAlleleCaller.java @@ -0,0 +1,192 @@ +package edu.mit.broad.picard.genotype.caller; + +import edu.mit.broad.picard.sam.SamLocusIterator; +import edu.mit.broad.sam.SAMFileHeader; +import edu.mit.broad.picard.reference.ReferenceSequenceFile; +import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory; +import edu.mit.broad.picard.reference.ReferenceSequence; +import edu.mit.broad.picard.PicardException; + +import java.io.IOException; +import java.io.BufferedWriter; +import java.io.File; +import java.util.SortedSet; +import java.util.List; + +/** + * Base class for AlleleCallers. 
Handles efficient access to the reference, output of data to a + * standard file format, and application of priors + */ +public abstract class AbstractAlleleCaller { + // writer for output + private final BufferedWriter writer; + + // for providing access to reference data + private final ReferenceSequenceFile referenceSequenceFile; + private final SAMFileHeader samHeader; + private ReferenceSequence referenceSequence; + + public AbstractAlleleCaller(final File reference, final SAMFileHeader samHeader, final BufferedWriter writer) { + this.writer = writer; + this.referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(reference); + this.samHeader = samHeader; + } + + + /** + * emit allele calls to the writer specified in the constructor + * + * @param li Locus to call + */ + public void callAlleles(final SamLocusIterator.LocusInfo li) throws IOException { + + + cacheReferenceSequence(li.getSequenceIndex()); + + final char ref = Character.toUpperCase((char)(referenceSequence.getBases()[li.getPosition() - 1] & 0xff)); + + + // delegate to the specific implementation + final SortedSet likelihoods = call(ref, li.getBasesAsString(), li.getQualities()); + + + final GenotypeTheory bestTheory = likelihoods.first(); + GenotypeTheory nextBestTheory = null; + GenotypeTheory refTheory = null; + final String refString = new String(new char[]{ref,ref}); + final DiploidGenotype refGenotype = DiploidGenotype.valueOf(refString); + + + final StringBuilder theoryString = new StringBuilder(); + int k=0; + for(final GenotypeTheory t : likelihoods) { + if (k == 1) { nextBestTheory = t; } + if (t.getGenotype() == refGenotype) { refTheory = t; } + + theoryString.append(t.getGenotype()) + .append(":") + .append(String.format("%.2f",t.getLikelihood())) + .append(" "); + k++; + } + + final double btnb = bestTheory.getLikelihood() - nextBestTheory.getLikelihood(); + final double btr = bestTheory.getLikelihood() - refTheory.getLikelihood(); + + final DiploidGenotype gt = 
likelihoods.first().getGenotype(); + + final String type; + if (!gt.isHet() && gt.getAllele1() == ref) { + type = "homozygous"; + } else if (!gt.isHet() && gt.getAllele1() != ref) { + type = "homozygous-SNP"; + } else { + type = "heterozygous-SNP"; + } + + final String bases = li.getBasesAsString(); + int a = 0,c = 0,g = 0,t = 0; + for(int i=0; i= the arg in the previous + * call to this method. + */ + private void cacheReferenceSequence(int sequenceIndex) { + if (referenceSequence != null && referenceSequence.getContigIndex() == sequenceIndex) { + return; + } + referenceSequence = null; + for(referenceSequence = referenceSequenceFile.nextSequence(); + referenceSequence != null; + referenceSequence = referenceSequenceFile.nextSequence()) { + // Sanity check the sequence names against the sequence dictionary while scanning through. + if (!referenceSequence.getName().equals(samHeader.getSequence(referenceSequence.getContigIndex()).getSequenceName())) { + throw new PicardException("Sequence name mismatch at sequence index " + referenceSequence.getContigIndex() + + ": " + referenceSequence.getName() + " != " + + samHeader.getSequence(referenceSequence.getContigIndex()).getSequenceName()); + } + if (referenceSequence.getContigIndex() == sequenceIndex) { + break; + } + if (referenceSequence.getContigIndex() > sequenceIndex) { + throw new PicardException("Never found reference sequence with index " + sequenceIndex); + } + } + if (referenceSequence == null) { + throw new PicardException("Reference sequence with index " + sequenceIndex + " was not found"); + } + } + + /** + * Override this to implement a concrete genotype caller + * @param ref the reference base + * @param bases each element in the String is the base at current locus for a given read + * @param quals same length as bases. the ith element corresponds to the ith element of bases. 
+ * @return + */ + abstract protected SortedSet call(char ref, String bases, List quals); + + + /** + * Apply a general population-based prior to the likelihood: + *
+ * <ul>
+ * <li>ref is .999</li>
+ * <li>het is 10^-3</li>
+ * <li>homozygous, non-reference is 10^-5</li>
+ * </ul>
  • + * + * @param ref reference allele + * @return prior, given the reference and genotype alleles + */ + protected double getPrior(final char ref, final DiploidGenotype gt) { + final double prior; + if (gt.isHom() && gt.getAllele1() == ref) { + prior = 0.999; // reference + } else { + if (gt.getAllele1() != ref && gt.getAllele2() != ref) { + prior = 0.00001; // neither base is reference + } else { + prior = 0.001; // het, one base is reference + } + } + return prior; + } + + // -------------------------------------------------------------------------------------------- + // Helper methods below this point... + // -------------------------------------------------------------------------------------------- + + + public boolean isHet(final String alleles) { + return (alleles.charAt(0) != (alleles.charAt(1))); + } + + +} diff --git a/lib/edu/mit/broad/picard/genotype/caller/CallGenotypes.java b/lib/edu/mit/broad/picard/genotype/caller/CallGenotypes.java new file mode 100644 index 0000000000..06b5a42007 --- /dev/null +++ b/lib/edu/mit/broad/picard/genotype/caller/CallGenotypes.java @@ -0,0 +1,93 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2008 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ +package edu.mit.broad.picard.genotype.caller; + +import edu.mit.broad.picard.cmdline.CommandLineProgram; +import edu.mit.broad.picard.cmdline.Option; +import edu.mit.broad.picard.cmdline.Usage; +import edu.mit.broad.picard.directed.GenomeMaskFactory; +import edu.mit.broad.sam.SAMFileHeader; +import edu.mit.broad.sam.SAMFileReader; +import edu.mit.broad.picard.sam.SamLocusIterator; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; + +/** + * Call genotypes given a SAM file of aligned reads, reference sequences, and optionally a target map. + */ +public class CallGenotypes extends CommandLineProgram { + // Usage and parameters + @Usage(programVersion="1.0") public String USAGE = "Basic Allele Caller\n"; + @Option(shortName="I", doc="SAM or BAM file for calling") public File INPUT_FILE; + @Option(shortName="O", doc="Allele Call output GELI file") public File OUTPUT_FILE; + @Option(shortName="R", doc="Reference fasta or fasta.gz file") public File REF_FILE; + @Option(shortName="T", doc="IntervalList-format target map file", optional = true) public File TARGET_FILE; + @Option(shortName="Q", doc="Minimum quality score threshold to use in allele calling", optional = true) public Integer QUAL_SCORE_THRESHOLD; + + + /** Required main method implementation. */ + public static void main(final String[] argv) { + System.exit(new CallGenotypes().instanceMain(argv)); + } + + + protected int doWork() { + try { + final BufferedWriter writer = new BufferedWriter(new FileWriter(OUTPUT_FILE)); + + final SAMFileReader samReader = getSamReader(INPUT_FILE); + + // TODO -- parameterize, or create separate executables... 
+ // AbstractAlleleCaller caller = new FlatQualityAlleleCaller(reference, writer); + final AbstractAlleleCaller caller = new QualityScoreAlleleCaller(REF_FILE, samReader.getFileHeader(), writer); + final long startTime = System.currentTimeMillis(); + + final SamLocusIterator sli = new SamLocusIterator(samReader.iterator()); + + if (TARGET_FILE != null) { + sli.setGenomeMask(new GenomeMaskFactory().makeGenomeMaskFromIntervalList(TARGET_FILE)); + } + + if (QUAL_SCORE_THRESHOLD != null) { + System.out.println("Masking out bases with < Q"+QUAL_SCORE_THRESHOLD); + sli.setQualityScoreCutoff(QUAL_SCORE_THRESHOLD); + } + + for (final SamLocusIterator.LocusInfo li : sli) { + if (li != null) caller.callAlleles(li); + } + + final long elapsed = System.currentTimeMillis() - startTime; + System.out.println("Completed in " + elapsed + "ms"); + + writer.flush(); + writer.close(); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + return 0; + } + + private SAMFileReader getSamReader(final File samFile) { + final SAMFileReader samReader = new SAMFileReader(samFile); + + // ensure the file is sorted + if (samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) { + System.out.println("SAM Files must be coordinate-sorted, this is " + samReader.getFileHeader().getSortOrder()); + System.exit(1); + } + + return samReader; + } + +} \ No newline at end of file diff --git a/lib/edu/mit/broad/picard/genotype/caller/DiploidGenotype.java b/lib/edu/mit/broad/picard/genotype/caller/DiploidGenotype.java new file mode 100644 index 0000000000..1d9e01f583 --- /dev/null +++ b/lib/edu/mit/broad/picard/genotype/caller/DiploidGenotype.java @@ -0,0 +1,27 @@ +package edu.mit.broad.picard.genotype.caller; + +public enum DiploidGenotype { + AA('A','A'), + AC('A','C'), + AG('A','G'), + AT('A','T'), + CC('C','C'), + CG('C','G'), + CT('C','T'), + GG('G','G'), + GT('G','T'), + TT('T','T'); + + private final char allele1; + private final char allele2; + + private 
DiploidGenotype(final char allele1, final char allele2) { + this.allele1 = allele1; + this.allele2 = allele2; + } + + public char getAllele1() { return allele1; } + public char getAllele2() { return allele2; } + public boolean isHet() { return this.allele1 != this.allele2; } + public boolean isHom() { return this.allele1 == this.allele2; } +} diff --git a/lib/edu/mit/broad/picard/genotype/caller/FlatQualityAlleleCaller.java b/lib/edu/mit/broad/picard/genotype/caller/FlatQualityAlleleCaller.java new file mode 100644 index 0000000000..c437a911ee --- /dev/null +++ b/lib/edu/mit/broad/picard/genotype/caller/FlatQualityAlleleCaller.java @@ -0,0 +1,76 @@ +package edu.mit.broad.picard.genotype.caller; + +import edu.mit.broad.sam.SAMFileHeader; + +import java.io.IOException; +import java.io.BufferedWriter; +import java.io.File; +import java.util.*; +import static java.lang.Math.*; + + +/** + * Bayesian-based allele caller using flat qualities and a 1e-3 error rate, based on CRD algorithm + */ +public class FlatQualityAlleleCaller extends AbstractAlleleCaller { + + public FlatQualityAlleleCaller(final File fastbReference, SAMFileHeader samHeader, final BufferedWriter writer) { + super(fastbReference, samHeader, writer); + } + + + protected SortedSet call(final char ref, final String bases, final List quals) { + final float eps = 1e-3f; + + // count up the base by nucleotide and put them into a map + final int depth = bases.length(); + int a = 0,c = 0,g = 0,t = 0; + for(int i=0; i< bases.length(); i++) { + if (bases.charAt(i) == 'A') { a++; } + else if (bases.charAt(i) == 'C') { c++; } + else if (bases.charAt(i) == 'G') { g++; } + else if (bases.charAt(i) == 'T') { t++; } + else { throw new RuntimeException("Unknown Base " + bases.charAt(i)); } + } + + final Map counts = new HashMap(); + counts.put('A', a); + counts.put('C', c); + counts.put('G', g); + counts.put('T', t); + + + // for each of the 10 theories, calculate the likelihood + final SortedSet results = new 
TreeSet(); + for(final DiploidGenotype theory : DiploidGenotype.values()) { + final double likelihood; + final char allele1 = theory.getAllele1(); + final char allele2 = theory.getAllele2(); + + if (!theory.isHet()) { + likelihood = log10(1-eps)*counts.get(allele1) + log10(eps)*(depth - counts.get(allele1)); + } else { + final int major_allele_counts; + final int minor_allele_counts; + if (counts.get(allele1) > counts.get(allele2)) { + major_allele_counts = counts.get(allele1); + minor_allele_counts = counts.get(allele2); + } else { + major_allele_counts = counts.get(allele2); + minor_allele_counts = counts.get(allele1); + } + + likelihood = log10(0.5 - (eps/2.0) )*major_allele_counts + + log10(0.5 - (eps/2.0) )*minor_allele_counts + + log10(eps)*(depth - major_allele_counts - minor_allele_counts); + } + + final double prior = getPrior(ref, theory); + results.add(new GenotypeTheory(theory, likelihood + log10(prior))); + } + + + return results; + + } +} diff --git a/lib/edu/mit/broad/picard/genotype/caller/GenotypeTheory.java b/lib/edu/mit/broad/picard/genotype/caller/GenotypeTheory.java new file mode 100644 index 0000000000..a97e83a972 --- /dev/null +++ b/lib/edu/mit/broad/picard/genotype/caller/GenotypeTheory.java @@ -0,0 +1,46 @@ +package edu.mit.broad.picard.genotype.caller; + +/** + * Datastructure to hold a single genotype along with a likelihood. 
+ */ +public class GenotypeTheory implements Comparable { + private DiploidGenotype genotype; + private double likelihood; + + public GenotypeTheory(final DiploidGenotype genotype, final double likelihood) { + this.genotype = genotype; + this.likelihood = likelihood; + } + + public DiploidGenotype getGenotype() { + return genotype; + } + + public void setGenotype(final DiploidGenotype genotype) { + this.genotype = genotype; + } + + public double getLikelihood() { + return likelihood; + } + + public void setLikelihood(final double likelihood) { + this.likelihood = likelihood; + } + + /** + * Genotype Theories are sorted first by descending likelihood (ie + * the GenotypeTheory with biggest likelihood comes first). Ties are + * broken by lexical sorting of the genotypes themselves + * + */ + public int compareTo(final GenotypeTheory other) { + if (this.getLikelihood() == other.getLikelihood()) { + return this.getGenotype().compareTo(other.getGenotype()); + } else if (this.getLikelihood() > other.getLikelihood()) { + return -1; + } else { + return 1; + } + } +} diff --git a/lib/edu/mit/broad/picard/genotype/caller/QualityScoreAlleleCaller.java b/lib/edu/mit/broad/picard/genotype/caller/QualityScoreAlleleCaller.java new file mode 100644 index 0000000000..f9863546d2 --- /dev/null +++ b/lib/edu/mit/broad/picard/genotype/caller/QualityScoreAlleleCaller.java @@ -0,0 +1,82 @@ +package edu.mit.broad.picard.genotype.caller; + +import edu.mit.broad.sam.SAMFileHeader; + +import java.util.*; +import static java.lang.Math.log10; +import static java.lang.Math.pow; +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.File; + +/** + * Bayesian-based allele caller using quality scores, based on CRD algorithm + */ +public class QualityScoreAlleleCaller extends AbstractAlleleCaller { + + public QualityScoreAlleleCaller(final File fastbReference, SAMFileHeader samHeader, final BufferedWriter writer) { + super(fastbReference, samHeader, writer); + } + + protected 
SortedSet call(final char ref, final String bases, final List quals) { + + // for each of the 10 theories, calculate the likelihood using quality scores + final SortedSet results = new TreeSet(); + for(final DiploidGenotype theory : DiploidGenotype.values()) { + double likelihood = 0; + + for(int i=0; i, Iterable, Closeable { + + private final File bustardDirectory; + private final int lane; + private final boolean pairedEnd; + private PasteParser parser; + private BustardReadData next = null; + private final FormatUtil formatter = new FormatUtil(); + private boolean iterating = false; + + /** + * Constructor + * + * @param bustardDirectory directory where the Bustard files can be located + * @param lane the lane to parse + * @param pairedEnd whether this is a paired-end run + */ + public BustardFileParser(File bustardDirectory, int lane, boolean pairedEnd) { + this.bustardDirectory = bustardDirectory; + this.lane = lane; + this.pairedEnd = pairedEnd; + initialize(); + } + + /** + * Finds the relevant files in the bustardDirectory, sorts them, and puts them into the + * sortedFiles iterator. Does some basic sanity checking to ensure that some files + * are found and that they are the expected multiple for paired-end or not. 
+ * + */ + private void initialize() + { + final String qseq1Regex = "s_" + lane + "_1_\\d{4}_qseq.txt(.gz)?"; + final String qseq2Regex = "s_" + lane + "_2_\\d{4}_qseq.txt(.gz)?"; + final String intensityRegex = "s_" + lane + "_\\d{4}_sig2.txt(.gz)?"; + + File read1files[] = bustardDirectory.listFiles( new FilenameFilter() { + public boolean accept(File dir, String name) { + return name.matches(qseq1Regex); + } + }); + + File read2files[] = bustardDirectory.listFiles( new FilenameFilter() { + public boolean accept(File dir, String name) { + return name.matches(qseq2Regex); + } + }); + + File intensityFiles[] = bustardDirectory.listFiles( new FilenameFilter() { + public boolean accept(File dir, String name) { + return name.matches(intensityRegex); + } + }); + + // Some basic sanity checking on file counts + if (read1files.length == 0 && read2files.length == 0 && intensityFiles.length == 0) { + throw new PicardException("No Bustard files found in " + + bustardDirectory.getAbsolutePath() + " for lane " + lane); + } + if (pairedEnd) { + if (read1files.length != read2files.length || read2files.length != intensityFiles.length) { + throw new PicardException("Incorrect number of Bustard files found in " + + bustardDirectory.getAbsolutePath() + " for lane " + lane + ". Found " + + read1files.length + " read 1 qseq files, " + read2files.length + " read 2 " + + "qseq files, and " + intensityFiles.length + " sig2 files. There should be " + + "the same number of each type of file"); + } + } + else { + if (read1files.length != intensityFiles.length) { + throw new PicardException("Incorrect number of Bustard files found in " + + bustardDirectory.getAbsolutePath() + " for lane " + lane + ". Found " + + read1files.length + " qseq files and " + intensityFiles.length + " sig2 files, " + + "which should be equal."); + } + if (read2files.length > 0) { + throw new PicardException("Read 2 Bustard files found in " + + bustardDirectory.getAbsolutePath() + " for lane " + lane + ". 
Lane " + + " was specified as a non-PE run, and so should not have any read 2 data."); + } + } + + // Sort each set of reads and create a text parser for it + SortedSet sortedRead1 = new TreeSet(new BustardFilenameComparator()); + sortedRead1.addAll(Arrays.asList(read1files)); + read1files = sortedRead1.toArray(read1files); + BasicTextFileParser read1Parser = new BasicTextFileParser(true, read1files); + + SortedSet sortedIntensity = new TreeSet(new BustardFilenameComparator()); + sortedIntensity.addAll(Arrays.asList(intensityFiles)); + intensityFiles = sortedIntensity.toArray(intensityFiles); + BasicTextFileParser intensityParser = new BasicTextFileParser(true, intensityFiles); + + // And create a paste parser for all of them + if (pairedEnd) { + SortedSet sortedRead2 = new TreeSet(new BustardFilenameComparator()); + sortedRead2.addAll(Arrays.asList(read2files)); + read2files = sortedRead2.toArray(read2files); + BasicTextFileParser read2Parser = new BasicTextFileParser(true, read2files); + + parser = new PasteParser(read1Parser, read2Parser, intensityParser); + } + else { + parser = new PasteParser(read1Parser, intensityParser); + } + } + + /** + * Parses the next line from the parser and constructs a BustardReadData object from it + * The first 11 fields are the read1 data, the second 11 are the read2 data, and the remaining + * values are the intensities data. Note that the first four values in the intensity file + * are not intensities but rather lane, tiles, x, and y for the given cluster. 
+ * + * @param validate whether to check that the expected number of intensity values are returned + * @return a fully populated BustardReadData object + */ + private BustardReadData readNext(boolean validate) { + if (!parser.hasNext()) { + return null; + } + String data[][] = parser.next(); + String machine = data[0][0]; + int run = formatter.parseInt(data[0][1]); + int lane = formatter.parseInt(data[0][2]); + int tile = formatter.parseInt(data[0][3]); + int x = formatter.parseInt(data[0][4]); + int y = formatter.parseInt(data[0][5]); + String firstSeq = data[0][8]; + String firstQual = data[0][9]; + boolean pf = formatter.parseInt(data[0][10]) == 1; + String secondSeq = null; + String secondQual = null; + + int intensityIndex = 1; + if (pairedEnd) { + secondSeq = data[1][8]; + secondQual = data[1][9]; + intensityIndex = 2; + } + + int numIntensities = firstSeq.length() * (pairedEnd ? 2 : 1); + + // Sanity check since some of those files look a little weird + if (validate) { + int remaining = data[intensityIndex].length - 4; + if ((remaining % 4 != 0) || (remaining/4) != numIntensities) { + throw new PicardException("Unexpected number of intensity fields for " + machine + "/" + run + + "/" + lane + "/" + tile + ": " + remaining); + } + } + + double intensities[][] = new double[numIntensities][4]; + int intensityArrayIndex = 4; + for (int i = 0; i < numIntensities; i++) { + for (int j = 0; j < 4; j++) { + intensities[i][j] = formatter.parseDouble(data[intensityIndex][intensityArrayIndex++]); + } + } + + return new BustardReadData( + machine, run, lane, tile, firstSeq, firstQual, secondSeq, secondQual, pf, intensities, x, y); + + } + + /** + * Returns an iterator over a set of elements of type BustardReadData. 
+ * + * @return an iterator over a set of elements of type BustardReadData + */ + public Iterator iterator() { + if (iterating) { + throw new IllegalStateException("iterator() method can only be called once, before the" + + "first call to hasNext()"); + } + next = readNext(true); + iterating = true; + return this; + } + + /** + * Returns true if the iteration has more elements. + * + * @return true if the iteration has more elements. Otherwise returns false. + */ + public boolean hasNext() { + if (!iterating) { + next = readNext(true); + iterating = true; + } + return next != null; + } + + /** + * Returns the next element in the iteration. + * + * @return the next element in the iteration + * @throws java.util.NoSuchElementException + */ + public BustardReadData next() { + + if (!hasNext()) { + throw new NoSuchElementException("Iteration has no more elements."); + } + + BustardReadData result = next; + next = readNext(false); + return result; + } + + /** + * Required method for Iterator API. + * + * @throws UnsupportedOperationException + */ + public void remove() { + throw new UnsupportedOperationException("Remove() not supported."); + } + + /** + * Closes the underlying PasteParser + */ + public void close() { + if (parser != null) { + parser.close(); + } + } + + public int getLane() { return this.lane; } + public boolean isPairedEnd() { return this.pairedEnd; } +} diff --git a/lib/edu/mit/broad/picard/illumina/BustardFilenameComparator.java b/lib/edu/mit/broad/picard/illumina/BustardFilenameComparator.java new file mode 100644 index 0000000000..ad92377f11 --- /dev/null +++ b/lib/edu/mit/broad/picard/illumina/BustardFilenameComparator.java @@ -0,0 +1,78 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. 
/**
 * Comparator for getting Bustard files in "sorted" order for use by the BustardFileParser.
 * Expected order is by lane in ascending order, then by tile in ascending order, then:
 *   the read 1 qseq file
 *   the read 2 qseq file
 *   the sig2 file
 *
 * IMPORTANT: Currently this class expects to receive ONLY qseq and sig2 files.
 *
 * @author Kathleen Tibbetts
 */
public class BustardFilenameComparator implements Comparator<File> {

    /**
     * Compares its two arguments for order. Returns a negative integer, zero, or a positive
     * integer as the first argument is less than, equal to, or greater than the second.
     *
     * @param file1 the first file to compare
     * @param file2 the second file to compare
     * @return a negative integer, zero, or a positive integer as
     * the first argument is less than, equal to, or greater than the second.
     */
    public int compare(File file1, File file2)
    {
        Integer parts1[] = parseFileNameParts(file1.getName());
        Integer parts2[] = parseFileNameParts(file2.getName());

        // BUG FIX: start at index 0 so lane is compared too (the documented primary key);
        // the previous loop started at 1 and ignored the lane entirely.
        for (int i = 0; i < parts1.length; i++)
        {
            final int cmp = compareNullable(parts1[i], parts2[i]);
            if (cmp != 0) {
                return cmp;
            }
        }
        return 0;
    }

    /**
     * Null-safe Integer comparison.  The read-number part is null for sig2 files, which
     * previously caused a NullPointerException when two sig2 names were compared.
     * A missing part sorts before a present one.
     */
    private static int compareNullable(final Integer a, final Integer b) {
        if (a == null && b == null) {
            return 0;
        }
        if (a == null) {
            return -1;
        }
        if (b == null) {
            return 1;
        }
        return a.compareTo(b);
    }

    /**
     * Utility method that returns an array of integers that represent, in order,
     * the lane, tile, type (0 for qseq files, 1 for sig2 files), and read (if any)
     * represented by the given file name.
     *
     * @param name the file name to dissect (underscore-separated Bustard naming)
     * @return an array of integers that represent, in order,
     * the lane, tile, type (0 for qseq files, 1 for sig2 files), and read (if any)
     * represented by the given file name
     */
    private Integer[] parseFileNameParts(String name)
    {
        Integer parts[] = new Integer[4]; // Lane, tile, type, read
        String src[] = name.split("_");
        parts[0] = Integer.valueOf(src[1]);      // Lane is always the second part
        if (src[2].length() == 4) {              // Tile is 3rd or 4th
            parts[1] = Integer.valueOf(src[2]);
        }
        else {
            parts[1] = Integer.valueOf(src[3]);
        }
        parts[2] = (src[src.length-1].equals("qseq.txt")) ? 0 : 1; // qseq sorts lower than sig2
        if (src[2].length() == 1) {              // read number, present only in qseq names
            parts[3] = Integer.valueOf(src[2]);
        }
        return parts;
    }
}
+ * + * @author Kathleen Tibbetts + */ +public class BustardReadData { + + private static final String PADDING ="00000"; + + final private String machineName; + final private int runNumber; + final private int laneNumber; + final private int tileNumber; + final private String firstReadSequence; + final private String firstReadQualities; + final private String secondReadSequence; + final private String secondReadQualities; + final private boolean pf; + final private double intensities[][]; + final private int xCoordinate; + final private int yCoordinate; + private final SolexaQualityConverter converter = new SolexaQualityConverter(); + + + /** + * Constructor that takes everything to populate this object + * + * @param machineName + * @param runNumber + * @param laneNumber + * @param tileNumber + * @param firstReadSequence + * @param firstReadQualities + * @param secondReadSequence + * @param secondReadQualities + * @param pf + * @param intensities + * @param xCoordinate + * @param yCoordinate + */ + public BustardReadData(String machineName, int runNumber, int laneNumber, int tileNumber, + String firstReadSequence, String firstReadQualities, + String secondReadSequence, String secondReadQualities, + boolean pf, double[][] intensities, int xCoordinate, int yCoordinate ) { + + this.machineName = machineName; + this.runNumber = runNumber; + this.laneNumber = laneNumber; + this.tileNumber = tileNumber; + this.firstReadSequence = firstReadSequence; + this.firstReadQualities = firstReadQualities; + this.secondReadSequence = secondReadSequence; + this.secondReadQualities = secondReadQualities; + this.pf = pf; + this.intensities = intensities; + this.xCoordinate = xCoordinate; + this.yCoordinate = yCoordinate; + } + + // TODO: Finalize read name -- ask Tim + /** + * Composes a name for this read from its values + * + * @return the read name + */ + public String getReadName() { + return this.machineName + ":" + this.laneNumber + ":" + this.tileNumber + + ":" + 
this.xCoordinate + ":" + this.yCoordinate; + } + + /** + * Gets Phred-style qualitites for the first read + * + * @return the String of qualities + */ + public String getFirstReadPhredQualities() { + return decodeSolexaQualitiesToPhred(getFirstReadQualities()); + } + + /** + * Gets Phred-style qualitites for the second read + * + * @return the String of qualities + */ + public String getSecondReadPhredQualities() { + return decodeSolexaQualitiesToPhred(getSecondReadQualities()); + } + + /** + * Converts a string of Solexa qualities to a Phred-style quality String + * + * @param qualities the Solexa qualities to decode + * @return the String of Phred qualities + */ + private String decodeSolexaQualitiesToPhred(String qualities) { + StringBuilder sb = new StringBuilder(); + for (char c : qualities.toCharArray()) { + // Quality char is phred score + 33 + sb.append((char)(converter.solexaToPhred((byte)c)+33)); + } + return sb.toString(); + } + + public String getMachineName() { return machineName; } + public int getRunNumber() { return runNumber; } + public int getLaneNumber() { return laneNumber; } + public int getTileNumber() { return tileNumber; } + public String getFirstReadSequence() { return firstReadSequence; } + public String getFirstReadQualities() { return firstReadQualities; } + public String getSecondReadSequence() { return secondReadSequence; } + public String getSecondReadQualities() { return secondReadQualities; } + public double[][] getIntensities() { return intensities; } + public boolean isPf() { return pf; } + public int getXCoordinate() { return xCoordinate; } + public int getYCoordinate() { return yCoordinate; } + +} diff --git a/lib/edu/mit/broad/picard/illumina/BustardToSam.java b/lib/edu/mit/broad/picard/illumina/BustardToSam.java new file mode 100644 index 0000000000..eb88e34651 --- /dev/null +++ b/lib/edu/mit/broad/picard/illumina/BustardToSam.java @@ -0,0 +1,58 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This 
/**
 * Command-line program that invokes BustardToSamWriter to convert the Bustard
 * basecall data for one lane into an unmapped BAM file.
 *
 * @author Kathleen Tibbetts
 */
public class BustardToSam extends CommandLineProgram {
    // The following attributes define the command-line arguments
    @Usage(programVersion="1.0")
    public String USAGE =
        "Usage: " + getClass().getName() + " [options]\n\n" +
        "Generate a BAM binary file from data in an illumina Bustard directory.\n";

    @Option(shortName = "B", doc = "Bustard directory to parse. ")
    public File BUSTARD_DIRECTORY;

    @Option(shortName = "F", doc = "The flowcell. ")
    public String FLOWCELL;

    @Option(shortName = "L", doc = "The lane for which to parse data. ")
    public Integer LANE;

    @Option(shortName = "P", doc = "Whether the lane was a paired-end run. ")
    public Boolean PE;

    @Option(shortName = "O", doc = "The directory for the binary output file. ")
    public File OUTPUT;

    // Streams the requested lane's Bustard data straight into a BAM writer.
    @Override
    protected int doWork() {
        BustardToSamWriter writer = new BustardToSamWriter(
            new BustardFileParser(BUSTARD_DIRECTORY, LANE, PE), OUTPUT, FLOWCELL);
        writer.writeBamFile();
        return 0; // success exit status
    }

    public static void main(String[] argv) {
        System.exit(new BustardToSam().instanceMain(argv));
    }


}
+*/ +package edu.mit.broad.picard.illumina; + +import edu.mit.broad.sam.*; +import edu.mit.broad.picard.io.IoUtil; +import edu.mit.broad.picard.util.Log; +import edu.mit.broad.picard.filter.AggregateFilter; +import edu.mit.broad.picard.filter.SamRecordFilter; +import edu.mit.broad.picard.filter.SolexaNoiseFilter; +import edu.mit.broad.picard.sam.ReservedTagConstants; + +import java.io.File; +import java.util.*; + +/** + * Writes the data from a BustardFileParser to a BAM file + */ +public class BustardToSamWriter { + + private final BustardFileParser parser; + private SAMFileWriter writer; + private final File outputFile; + private AggregateFilter filters; + private int recordsWritten = 0; + private Log log = Log.getInstance(BustardToSamWriter.class); + + /** + * Constructor + * + * @param parser The parser for the Bustard data + * @param outputDirectory The directory in which to write the BAM file + * @param flowcell The flowcell from which the data is drawn + */ + public BustardToSamWriter(BustardFileParser parser, File outputDirectory, String flowcell) { + this.parser = parser; + this.outputFile = getOutputFile(outputDirectory, flowcell); + initializeFilters(); + } + + /** + * Alternate constructor for testing + * + * @param parser The parser for the Bustard data + * @param outputFile The directory in which to write the BAM file + */ + BustardToSamWriter(BustardFileParser parser, File outputFile) { + this.parser = parser; + this.outputFile = outputFile; + initializeFilters(); + } + + private void initializeFilters() { + filters = new AggregateFilter(Arrays.asList( + (SamRecordFilter)new SolexaNoiseFilter() + )); + } + + + /** + * Writes all data from the BustardFileParser to a BAM file + */ + public void writeBamFile() { + SAMFileHeader header = new SAMFileHeader(); + header.setSortOrder(SAMFileHeader.SortOrder.unsorted); + writer = new SAMFileWriterFactory().makeBAMWriter(header, false, outputFile); + + while (parser.hasNext()) { + BustardReadData brd = 
parser.next(); + + SAMRecord sam = createSamRecord(brd, true); + writer.addAlignment(sam); + this.recordsWritten++; + + if (parser.isPairedEnd()) { + SAMRecord sam2 = createSamRecord(brd, false); + writer.addAlignment(sam2); + this.recordsWritten++; + } + + } + writer.close(); + + log.info("Wrote " + this.recordsWritten + " read records to BAM file " + + this.outputFile.getAbsolutePath()); + } + + /** + * Creates a SAMRecord from Bustard data + * + * @param brd The BustardReadData to use in populating the SAMRecord + * @param isFirstRead whether this is the first read of a pair + * @return SAMRecord fully populated SAMRecord + */ + private SAMRecord createSamRecord(BustardReadData brd, boolean isFirstRead) { + SAMRecord sam = new SAMRecord(); + sam.setReadName(brd.getReadName()); + sam.setReadString(isFirstRead ? brd.getFirstReadSequence() : brd.getSecondReadSequence()); + sam.setBaseQualityString(isFirstRead ? brd.getFirstReadPhredQualities() : brd.getSecondReadPhredQualities()); + + // Flag values + sam.setReadPairedFlag(parser.isPairedEnd()); + sam.setReadUmappedFlag(true); + sam.setReadFailsVendorQualityCheckFlag(!brd.isPf()); + sam.setMateUnmappedFlag(true); + if (parser.isPairedEnd()) { + sam.setFirstOfPairFlag(isFirstRead); + sam.setSecondOfPairFlag(!isFirstRead); + } + + if (filters.filterOut(sam)) { + sam.setAttribute(ReservedTagConstants.XN, 1); + } + return sam; + } + + /** + * Constructs the name for the output file, determines whether it is writeable, + * and returns the file + * + * @param outputDirectory the directory in which to write the BAM file + * @param flowcell the flowcell from which the data is drawn + * @return a new File object for the BAM file. + */ + private File getOutputFile(File outputDirectory, String flowcell) { + File result = new File(outputDirectory.getAbsolutePath() + "/" + + flowcell + "." 
+ parser.getLane() + ".unmapped.bam"); + IoUtil.assertFileIsWritable(result); + return result; + } +} diff --git a/lib/edu/mit/broad/picard/illumina/GeraldParser.java b/lib/edu/mit/broad/picard/illumina/GeraldParser.java new file mode 100644 index 0000000000..a72f90dbac --- /dev/null +++ b/lib/edu/mit/broad/picard/illumina/GeraldParser.java @@ -0,0 +1,235 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.picard.illumina; + +import edu.mit.broad.picard.util.PasteParser; +import edu.mit.broad.picard.util.TabbedTextFileParser; +import edu.mit.broad.picard.PicardException; +import edu.mit.broad.sam.util.CloseableIterator; + +import java.io.File; +import java.util.Iterator; +import java.util.Arrays; +import java.util.regex.Pattern; +import java.text.ParsePosition; +import java.text.NumberFormat; + +/** + * Parse the pair of files (eland_extended.txt and export.txt) that correspond to an end of a Gerald run for a lane. 
+ */ +public class GeraldParser implements Iterable, CloseableIterator { + private static final int EXPECTED_ELAND_FIELDS = 4; + // Regex used to split apart multiple alignments in the eland output + private static final Pattern ALIGN_SPLITTER = Pattern.compile("\\,+"); + + // export.txt constants + private static final int PASSING_FILTER_COLUMN = 21; + private static final int QUALITIES_COLUMN = 9; + private static final int REQUIRED_EXPORT_COLUMNS = PASSING_FILTER_COLUMN + 1; + + private final NumberFormat integerFormat = NumberFormat.getIntegerInstance(); + + private final SquashedCoordinateMap geraldToArachne; + private final PasteParser pasteParser; + private final File elandExtended; + private final File export; + private boolean iteratorCalled = false; + private final byte[] solexaToPhredQualityConverter = new SolexaQualityConverter().getSolexaToPhredConversionTable(); + + /** + * @param geraldToArachne for converting btw Gerald coordinate and genomic coordinate + */ + public GeraldParser(final SquashedCoordinateMap geraldToArachne, final File elandExtended, final File export) { + this.geraldToArachne = geraldToArachne; + this.elandExtended = elandExtended; + this.export = export; + final TabbedTextFileParser[] parsers = { + new TabbedTextFileParser(false, elandExtended), + new TabbedTextFileParser(false, export) + }; + pasteParser = new PasteParser(parsers); + } + + public Iterator iterator() { + if (iteratorCalled) { + throw new IllegalStateException("iterator() cannot be called more than once on a GeraldParser instance."); + } + iteratorCalled = true; + return this; + } + + public void close() { + pasteParser.close(); + } + + public boolean hasNext() { + return pasteParser.hasNext(); + } + + public GeraldAlignment next() { + final GeraldAlignment ret = new GeraldAlignment(); + final String[][] fields = pasteParser.next(); + + // Parse eland_extended.txt fields + final String[] elandExtendedFields = fields[0]; + if (elandExtendedFields.length < 
EXPECTED_ELAND_FIELDS) { + throw new PicardException("Not enough fields in file: " + elandExtended); + } + + ret.readName = elandExtendedFields[0].substring(1); + ret.readBases = elandExtendedFields[1]; + ret.readLength = ret.readBases.length(); + final String[] alignCounts = elandExtendedFields[2].split(":"); + if (alignCounts.length == 3) { + ret.zeroMismatchPlacements = Short.parseShort(alignCounts[0]); + ret.oneMismatchPlacements = Short.parseShort(alignCounts[1]); + ret.twoMismatchPlacements = Short.parseShort(alignCounts[2]); + } + + final String[] alignments = ALIGN_SPLITTER.split(elandExtendedFields[3]); + if (alignments.length == 1 && !"-".equals(alignments[0])) { + final int lastDot = alignments[0].lastIndexOf("."); + final int colon = alignments[0].indexOf(':'); + + final String tmp = alignments[0].substring(colon + 1); + final ParsePosition pos = new ParsePosition(0); + final long start = integerFormat.parse(tmp, pos).longValue(); + if (pos.getIndex() == 0) { + throw new RuntimeException("Problem parsing eland extended alignment record: " + Arrays.toString(elandExtendedFields)); + } + + final SimpleMapping m = new SimpleMapping(alignments[0].substring(lastDot+1, colon).trim(), + start, start + ret.readLength - 1, null); + geraldToArachne.convertToArachneCoords(m); + ret.primaryChrom = m.getSequenceName(); + ret.primaryStart = m.getStartPos(); + ret.primaryStop = m.getEndPos(); + ret.orientation = tmp.substring(pos.getIndex(), pos.getIndex() + 1); + ret.mismatchString = tmp.substring(pos.getIndex() + 1); + + // Count the mismatches in the alignment + for (int i=pos.getIndex(); i readGroups = new ArrayList(); + readGroups.add(readGroup); + readGroup.setSample(SAMPLE); + if (LIBRARY != null) { + readGroup.setLibrary(LIBRARY); + } + setRGAttributeIfNotNull(readGroup, DESCRIPTION, "DS"); + setRGAttributeIfNotNull(readGroup, RUN, "PU"); + setRGAttributeIfNotNull(readGroup, PI, SAMReadGroupRecord.PREDICTED_MEDIAN_INSERT_SIZE_TAG); + 
            setRGAttributeIfNotNull(readGroup, CN, "CN");
            setRGAttributeIfNotNull(readGroup, RUN_DATE, SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG);
            setRGAttributeIfNotNull(readGroup, PL, "PL");
            header.setReadGroups(readGroups);
        }
    }

    /**
     * Sets the given attribute on the read group, but only when a value was supplied.
     *
     * @param readGroup the read group record to update
     * @param value     the attribute value; ignored when null
     * @param key       the SAM read-group tag to set
     */
    private void setRGAttributeIfNotNull(final SAMReadGroupRecord readGroup, final Object value, final String key) {
        if (value == null) {
            return;
        }
        readGroup.setAttribute(key, value);
    }

    /**
     * Iterate through the Gerald output and write alignments. eland_extended.txt and export.txt are
     * iterated together using PasteParser. If paired end lane, then two PasteParsers are iterated in tandem,
     * so that mate info is available when a SAMRecord is created.
     */
    private void writeAlignments() {
        final GeraldParserFactory geraldParserFactory = new GeraldParserFactory(GERALD_DIR, LANE, SQUASHED_MAP);
        paired = geraldParserFactory.isPairedRun();
        final GeraldParser firstEndIterator = geraldParserFactory.makeParser(paired ? 1: null);
        GeraldParser secondEndIterator = null;
        if (paired) {
            secondEndIterator = geraldParserFactory.makeParser(2);
        }
        int numAlignmentsOrPairsWritten = 0;
        while (firstEndIterator.hasNext()) {
            final GeraldParser.GeraldAlignment firstEnd = firstEndIterator.next();
            GeraldParser.GeraldAlignment secondEnd = null;
            if (paired) {
                // The two end iterators must stay in lockstep; a short second-end file is an error
                hasNextAssert(secondEndIterator);
                secondEnd = secondEndIterator.next();
            }
            final SAMRecord firstEndAlignment = createSAMRecordFromGerald(firstEnd);
            SAMRecord secondEndAlignment = null;
            if (paired) {
                secondEndAlignment = createSAMRecordFromGerald(secondEnd);
                // Each record carries its MATE's coordinates, hence the crossed arguments
                setMateInfo(secondEndAlignment, firstEnd);
                setMateInfo(firstEndAlignment, secondEnd);
                secondEndAlignment.setSecondOfPairFlag(true);
                firstEndAlignment.setFirstOfPairFlag(true);
                final boolean properPair = SamPairUtil.isProperPair(firstEndAlignment, secondEndAlignment, JUMPING_LIBRARY);
                firstEndAlignment.setProperPairFlag(properPair);
                secondEndAlignment.setProperPairFlag(properPair);
                // By convention the two ends carry insert sizes of opposite sign
                int insertSize = SamPairUtil.computeInsertSize(firstEndAlignment, secondEndAlignment);
                firstEndAlignment.setInferredInsertSize(insertSize);
                secondEndAlignment.setInferredInsertSize(-insertSize);
            }

            writer.addAlignment(firstEndAlignment);
            if (secondEndAlignment != null) {
                writer.addAlignment(secondEndAlignment);
            }
            ++numAlignmentsOrPairsWritten;
            // MAX_ALIGNMENTS is a debugging cap; counts pairs as one when paired
            if (MAX_ALIGNMENTS != null && numAlignmentsOrPairsWritten >= MAX_ALIGNMENTS) {
                break;
            }
            if (numAlignmentsOrPairsWritten % 500000 == 0) {
                log.info("Loaded " + numAlignmentsOrPairsWritten + " reads");
            }
        }
        // Only verify full consumption when we did not stop early via MAX_ALIGNMENTS
        if (MAX_ALIGNMENTS == null) {
            noMoreAssert(firstEndIterator);
            if (paired) {
                noMoreAssert(secondEndIterator);
            }
        }
        log.info("Done loading " + numAlignmentsOrPairsWritten + " reads");
    }

    /**
     * Write into the samRecord the mate info from the mate gerald alignment
     */
    private void setMateInfo(final SAMRecord samRecord, final GeraldParser.GeraldAlignment mateGeraldAlignment) {
        // A null chromosome means the mate did not align
        final boolean isMapped = mateGeraldAlignment.getPrimaryChrom() != null;
        if (isMapped) {
            samRecord.setMateReferenceName(mateGeraldAlignment.getPrimaryChrom());
            samRecord.setMateAlignmentStart((int)mateGeraldAlignment.getPrimaryStart());
            samRecord.setMateNegativeStrandFlag(isNegativeStrand(mateGeraldAlignment));
        } else {
            samRecord.setMateReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
            samRecord.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
            samRecord.setMateUnmappedFlag(true);
        }
    }

    // Translates Gerald's F/R orientation codes into a strand flag;
    // anything else indicates a malformed eland_extended file.
    private boolean isNegativeStrand(final GeraldParser.GeraldAlignment alignment) {
        final String orientation = alignment.getOrientation();
        if (orientation.equals("F")) {
            return false;
        } else if (orientation.equals("R")) {
            return true;
        } else {
            throw new RuntimeException("Strange orientation in eland_extended file");
        }
    }

    /**
     * Builds a SAMRecord from a single Gerald alignment (primary placement only).
     * Mate fields and pair flags are filled in separately by the caller.
     */
    private SAMRecord createSAMRecordFromGerald(final GeraldParser.GeraldAlignment alignment) {
        final SAMRecord samRecord = new SAMRecord();
        // Consider an alignment with a negative start (i.e. that hangs off the beginning of the contig)
        // to be unmapped.
        final boolean isMapped = alignment.getPrimaryChrom() != null && alignment.getPrimaryStart() >= 0;

        // Strip Gerald's /1 and /2 end suffixes so both ends share one read name
        String readName = alignment.getReadName();
        if (readName.endsWith("/1") || readName.endsWith("/2")) {
            readName = readName.substring(0, readName.length() - 2);
        }
        samRecord.setReadName(readName);

        // Set all the flags
        samRecord.setReadPairedFlag(paired);
        samRecord.setReadUmappedFlag(!isMapped); // (sic — method name as spelled in this SAM API)
        if (isMapped) {
            samRecord.setReadNegativeStrandFlag(isNegativeStrand(alignment));
        }
        // For now we are only taking the primary alignment
        samRecord.setNotPrimaryAlignmentFlag(false);
        // Bases and qualities are stored reverse-complemented / reversed for minus-strand reads
        String readBases = alignment.getReadBases();
        if (samRecord.getReadNegativeStrandFlag()) {
            readBases = SequenceUtil.reverseComplement(readBases);
        }
        samRecord.setReadString(readBases);
        final byte[] phredQualities = alignment.getPhredQualities();
        if (isMapped && samRecord.getReadNegativeStrandFlag()) {
            ArrayUtil.reverseArray(phredQualities);
        }
        samRecord.setBaseQualities(phredQualities);
        if (isMapped) {
            /*
            if ("23".equals(geraldReferenceName)) {
                geraldReferenceName = "X";
            } else if ("24".equals(geraldReferenceName)) {
                geraldReferenceName = "Y";
            }
            return REFERENCE_PREFIX + geraldReferenceName;
            */
            samRecord.setReferenceName(alignment.getPrimaryChrom());
            samRecord.setAlignmentStart((int)alignment.getPrimaryStart());
            samRecord.setMappingQuality(SAMRecord.UNKNOWN_MAPPING_QUALITY);
            // CIGAR is trivial because there are no indels or clipping in Gerald
            final String cigar = Integer.toString(alignment.getReadLength()) + "M";
            samRecord.setCigarString(cigar);
            // We've decided not to bother with this, and just load the reference
            // if we want to determine mismatches.
            // samRecord.setAttribute("MD", alignment.getMismatchString());
        } else {
            samRecord.setReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
            samRecord.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
            samRecord.setMappingQuality(SAMRecord.NO_MAPPING_QUALITY);
            samRecord.setCigarString(SAMRecord.NO_ALIGNMENT_CIGAR);
        }

        if (SAMPLE != null) {
            // There is a read group (id = READ_GROUP_ID)
            samRecord.setAttribute("RG", READ_GROUP_ID);
        }

        samRecord.setAttribute("PG", PROGRAM_RECORD_ID);
        return samRecord;
    }

    // Fails fast when a gerald output file is shorter than its sibling
    private void hasNextAssert(final Iterator iterator) {
        if (!iterator.hasNext()) {
            throw new RuntimeException("gerald output file ends unexpectedly.");

        }
    }

    // Fails fast when a gerald output file is longer than expected
    private void noMoreAssert(final Iterator iterator) {
        if (iterator.hasNext()) {
            throw new RuntimeException("gerald output file has more lines than expected.");
        }
    }

}
+*/ +package edu.mit.broad.picard.illumina; + +import edu.mit.broad.sam.util.CoordMath; + +class SimpleMapping implements Comparable { + String arachneIndex; + long startPos; + long endPos; + String sequenceName; + + public SimpleMapping(final String arachneIndex, final long startPos, final long endPos, final String sequenceName) { + this.arachneIndex = arachneIndex; + this.startPos = startPos; + this.endPos = endPos; + this.sequenceName = sequenceName; + + if (this.endPos < this.startPos) throw new IllegalArgumentException("startPos must be less than endPos!"); + } + + public String getArachneIndex() { + return arachneIndex; + } + + public void setArachneIndex(final String arachneIndex) { + this.arachneIndex = arachneIndex; + } + + public long getStartPos() { + return startPos; + } + + public void setStartPos(final long startPos) { + this.startPos = startPos; + } + + public long getEndPos() { + return endPos; + } + + public void setEndPos(final long endPos) { + this.endPos = endPos; + } + + public String getSequenceName() { + return sequenceName; + } + + public void setSequenceName(final String sequenceName) { + this.sequenceName = sequenceName; + } + + public SimpleMapping intersection(final SimpleMapping other) { + if (this.intersects(other)) { + return new SimpleMapping(this.getArachneIndex(), + (this.getStartPos() >= other.getStartPos())?this.getStartPos():other.getStartPos(), + (this.getEndPos() <= other.getEndPos())?this.getEndPos():other.getEndPos(), this.getSequenceName()); + } + + return null; + } + + public boolean intersects(final SimpleMapping other) { + return (this.getArachneIndex().equals(other.getArachneIndex()) && + CoordMath.overlaps(this.getStartPos(), this.getEndPos(), other.getStartPos(), other.getEndPos())); + } + + public long length() { + return CoordMath.getLength(startPos, endPos); + } + + /** + * Sort based on sequence.compareTo, then start pos, then end pos + * with null objects coming lexically last + */ + public int compareTo(final 
SimpleMapping that) { + if (that == null) return -1; // nulls last + + int result = this.getArachneIndex().compareTo(that.getArachneIndex()); + if (result == 0) { + if (this.getStartPos() == that.getStartPos()) { + result = ((int) (this.getEndPos() - that.getEndPos())); + } else { + result = ((int) (this.getStartPos() - that.getStartPos())); + } + } + + // normalize to -1, 0, 1 + if (result > 1) result = 1; + else if (result < -1) result = -1; + return result; + } + + public boolean equals(final SimpleMapping that) { + return (this.compareTo(that) == 0); + } + + public int hashCode() { + int result; + result = arachneIndex.hashCode(); + result = 31 * result + (int) (startPos ^ (startPos >>> 32)); + result = 31 * result + (int) (endPos ^ (endPos >>> 32)); + return result; + } + + public String toString() { + return getArachneIndex() + ":" + getStartPos() + "-" + getEndPos(); + } +} diff --git a/lib/edu/mit/broad/picard/illumina/SolexaQualityConverter.java b/lib/edu/mit/broad/picard/illumina/SolexaQualityConverter.java new file mode 100644 index 0000000000..80633fb724 --- /dev/null +++ b/lib/edu/mit/broad/picard/illumina/SolexaQualityConverter.java @@ -0,0 +1,58 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.picard.illumina; + +/** + * Optimized method for converting Solexa ASCII qualities into Phred scores. + * Pre-computes all values in order to eliminate repeated computation. 
/**
 * Optimized method for converting Solexa ASCII qualities into Phred scores.
 * Pre-computes all values in order to eliminate repeated computation.
 */
public class SolexaQualityConverter {

    /**
     * This value is added to a Solexa quality score to make it printable ASCII
     */
    private static final int SOLEXA_ADDEND = 64;

    /**
     * Mapping from ASCII value in Gerald export file to phred score
     */
    private final byte[] phredScore = new byte[256];

    public SolexaQualityConverter() {
        // Entries below the addend are invalid Solexa qualities and map to 0;
        // Java byte arrays are zero-initialized, so only the valid range needs computing.
        for (int i = SOLEXA_ADDEND; i < phredScore.length; ++i) {
            phredScore[i] = decodeSolexaQualityToPhred(i);
        }
    }


    /** Converts a solexa character quality into a phred numeric quality. */
    private byte decodeSolexaQualityToPhred(final int solexaQuality) {
        return (byte) Math.round(10d * Math.log10(1d+Math.pow(10d, (solexaQuality - SOLEXA_ADDEND)/10d)));
    }

    /**
     * Convert a solexa quality ASCII character into a phred score.
     *
     * @param solexaQuality the Solexa ASCII quality byte (valid values are >= 64)
     * @return the corresponding Phred score
     */
    public byte solexaToPhred(final byte solexaQuality) {
        // BUG FIX: mask to an unsigned index. ASCII values >= 128 are negative as a
        // Java byte and previously threw ArrayIndexOutOfBoundsException instead of
        // reaching table entries 128-255.
        return phredScore[solexaQuality & 0xff];
    }

    /**
     * @return a byte array that can be indexed by Solexa ASCII quality, with value
     * of corresponding Phred score. Elements 0-63 are invalid because Solexa qualities
     * should all be >= 64. Do not modify this array!
     */
    public byte[] getSolexaToPhredConversionTable() {
        return phredScore;
    }
}
+*/ +package edu.mit.broad.picard.illumina; + +import edu.mit.broad.sam.util.CoordMath; +import edu.mit.broad.picard.cmdline.CommandLineUtils; + +import java.util.Map; +import java.util.HashMap; +import java.io.File; +import java.io.BufferedReader; +import java.io.IOException; + +public class SquashedCoordinateMap { + private final Map geraldToArachne = new HashMap(); + private long genomeSize; + + public SquashedCoordinateMap(final File squashedMapFile) { + try { + final BufferedReader in = CommandLineUtils.getReader(squashedMapFile); + String line; + genomeSize = 0; + + while ((line = in.readLine()) != null) { + final String[] fields = CommandLineUtils.SPACE_SPLITTER.split(line); + final String arachneIndex = fields[0].trim().intern(); + final String squashedRefIndex = fields[1].trim().intern(); + final long squashedStart = Long.parseLong(fields[2]); + final long length = Long.parseLong(fields[3]); + final String sequenceName = fields[4]; + + final SimpleMapping mapping = new SimpleMapping(squashedRefIndex, squashedStart, + CoordMath.getEnd(squashedStart, length), sequenceName); + geraldToArachne.put(mapping, arachneIndex); + + genomeSize += length; + } + + in.close(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + /* Converts a read's mapping from Gerald's vretarded space to arachne index + coords. 
*/ + public void convertToArachneCoords(final SimpleMapping read) { + if (this.geraldToArachne == null || this.geraldToArachne.isEmpty()) { + throw new IllegalStateException("Cannot invoke convertToArachneCoords before parseSquashedMapFile"); + } + + for (final Map.Entry entry : this.geraldToArachne.entrySet()) { + final SimpleMapping chunk = entry.getKey(); + if (chunk.intersects(read)) { + read.setArachneIndex(entry.getValue()); + read.setStartPos( read.getStartPos() - chunk.getStartPos() ); + read.setEndPos( read.getEndPos() - chunk.getStartPos() ); + read.setSequenceName(chunk.getSequenceName()); + return; + } + } + + throw new RuntimeException("Could not convert read: " + read); + } + + long getGenomeSize() { + return genomeSize; + } +} diff --git a/lib/edu/mit/broad/picard/importer/genotype/BedFileReader.java b/lib/edu/mit/broad/picard/importer/genotype/BedFileReader.java new file mode 100644 index 0000000000..8bd01c755b --- /dev/null +++ b/lib/edu/mit/broad/picard/importer/genotype/BedFileReader.java @@ -0,0 +1,82 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
/**
 * Reader for PLINK BED genotype files.  Genotypes are packed two bits each,
 * four genotypes per byte, in either individual-major or snp-major order
 * (selected by the mode byte that follows the magic number).
 *
 * @author Doug Voet
 */
public class BedFileReader implements Closeable {
    private static final int LOWEST_2_BIT_MASK = 3; // binary 11
    // 7020 == 0x1B6C; this matches the PLINK magic bytes 0x6C 0x1B only if
    // BinaryCodec reads the short little-endian -- TODO confirm byte order.
    private static final short BED_MAGIC_NUMBER = 7020;
//    private static final short BED_MAGIC_NUMBER = Short.parseShort("0110110000011011", 2);

    /** All genotypes for one individual are stored consecutively. */
    public static final byte MODE_INDIVIDUAL_MAJOR = 0;
    /** All individuals' genotypes for one snp are stored consecutively. */
    public static final byte MODE_SNP_MAJOR = 1;

    // The four possible 2-bit genotype codes returned by nextGenotype().
    public static final byte GENOTYPE_AA = 0; // binary 00
    public static final byte GENOTYPE_NO_CALL = 1; // binary 01
    public static final byte GENOTYPE_AB = 2; // binary 10
    public static final byte GENOTYPE_BB = 3; // binary 11

    private final byte mode;          // MODE_INDIVIDUAL_MAJOR or MODE_SNP_MAJOR
    private final BinaryCodec codec;  // underlying binary reader for the bed file
    private byte currentBlock;        // the byte currently being unpacked
    private int genotypeCount = 0;    // genotypes consumed from currentBlock so far

    /**
     * Opens the given BED file, validates its magic number and reads the mode byte.
     *
     * @param bedFile the bed file to read
     * @throws PicardException if the magic number does not match (codec is closed first)
     */
    public BedFileReader(File bedFile) {
        this.codec = new BinaryCodec(bedFile, false);
        short fileMagicNumber = this.codec.readShort();
        if (fileMagicNumber != BED_MAGIC_NUMBER) {
            this.codec.close();
            throw new PicardException("Given file [" + bedFile.getAbsolutePath() +
                "] is not in bed file format... magic number does not match");
        }
        this.mode = codec.readByte();
    }

    /** @return MODE_INDIVIDUAL_MAJOR or MODE_SNP_MAJOR, as read from the file header. */
    public byte getMode() {
        return mode;
    }

    @Override
    public void close() {
        this.codec.close();
    }

    /**
     * @return the next 2-bit genotype code, one of the GENOTYPE_* constants.
     */
    public byte nextGenotype() {
        // there are 4 genotypes per byte so get a new byte every 4 genotypes read
        if (this.genotypeCount++ % 4 == 0) {
            this.currentBlock = this.codec.readByte();
        }

        // the 2 lowest order bits of currentBlock are the next genotype, pop them off
        byte genotype = (byte) (LOWEST_2_BIT_MASK & this.currentBlock);
        // NOTE: >>>= on a byte promotes to int (sign-extending) before the shift and
        // narrows back, so a negative currentBlock gains stray 1-bits in its top
        // positions.  Those bits never reach the low two bits within the four reads
        // taken from one block, so the extracted genotypes remain correct.
        this.currentBlock >>>= 2;

        return genotype;
    }

    /**
     * Call this method when moving on to the next individual (in indiv-major mode) or next
     * snp (in snp-major mode).  Resetting the count forces nextGenotype() to fetch a fresh
     * byte, discarding any unused padding bits in the current block.
     */
    public void dropRemainingBlock() {
        this.genotypeCount = 0;
    }
}
The former lists all SNPs for the + * first individual then all SNPs for the second individual, etc. The latter list all individuals + * for first SNP then all individuals for second SNP, etc. The order for snps is dictated by + * the bim file and the order for individuals is dictated by the fam file. + *

    + * See this page for details + * of the format. + * + * @author Doug Voet + */ +public class BedToGeli extends CommandLineProgram { + static final float LIKELIHOOD = 500; + private static final Log log = Log.getInstance(BedToGeli.class); + + @Usage(programVersion="1.0") + public final String USAGE = ""; + + @Option(doc="The bed file name.", mutex="BFILE") + public File BED; + + @Option(doc="The bim file name.", mutex="BFILE") + public File BIM; + + @Option(doc="The fam file name.", mutex="BFILE") + public File FAM; + + @Option(doc="The root file name of the bed, bim & fam files.", mutex={"BED", "BIM", "FAM"}) + public String BFILE; + + @Option(doc="The directory to write the output GELI files", shortName="D") + public File OUTPUT_DIR; + + @Option(doc="Set to 'true' if the family name should be included in the output file names, default false", + shortName="F", + optional=true) + public Boolean USE_FAMILY = Boolean.FALSE; + + @Option(doc="Name of file containing sequence dictionary to embed in new GELI files", + shortName="DICT") + public File SEQUENCE_DICTIONARY; + + private List snpCache; + private List geliFileNames; + private List sequenceDictionary; + private Map referenceIndexes; + + public static void main(String[] argv) { + System.exit(new BedToGeli().instanceMain(argv)); + } + + @Override + protected int doWork() { + populateFileNames(); + IoUtil.assertFileIsReadable(this.BED); + IoUtil.assertFileIsReadable(this.BIM); + IoUtil.assertFileIsReadable(this.FAM); + IoUtil.assertFileIsReadable(this.SEQUENCE_DICTIONARY); + IoUtil.assertDirectoryIsWritable(this.OUTPUT_DIR); + + populateSequenceDictionary(); + + BedFileReader bedReader = new BedFileReader(this.BED); + if (bedReader.getMode() == BedFileReader.MODE_INDIVIDUAL_MAJOR) { + log.debug("Detected BED file in individual-major mode"); + parseIndividualMajor(bedReader); + } else { + log.debug("Detected BED file in snp-major mode"); + parseSnpMajor(bedReader); + } + + return 0; + } + + /** + * loads the 
SEQUENCE_DICTIONARY file + */ + private void populateSequenceDictionary() { + try { + final SAMFileHeader header = new SAMTextHeaderCodec().decode(new AsciiLineReader(new FileInputStream(this.SEQUENCE_DICTIONARY)), null); + this.sequenceDictionary = header.getSequences(); + + this.referenceIndexes = new HashMap(); + for (byte i = 0; i < sequenceDictionary.size(); i++) { + this.referenceIndexes.put(sequenceDictionary.get(i).getSequenceName().intern(), i); + } + } catch (FileNotFoundException e) { + throw new PicardException("Unexpected exception", e); + } + } + + private void parseIndividualMajor(BedFileReader bedReader) { + cacheSnps(); + BasicTextFileParser famReader = new BasicTextFileParser(true, this.FAM); + for (String[] famFields : famReader) { + GeliFileWriter geliWriter = getGeliFileWriter(getGeliFileName(famFields[0], famFields[1]), false); + for (SNP snp : this.snpCache) { + GenotypeLikelihoods genotypeLikelihoods = constructGenotypeLikelihoods( + bedReader, snp); + if (genotypeLikelihoods != null) { + geliWriter.addGenotypeLikelihoods(genotypeLikelihoods); + } + } + bedReader.dropRemainingBlock(); + geliWriter.close(); + } + famReader.close(); + } + + /** + * @return null if for a no-call or the snp has no position on the genome + */ + private char[] getNextGenotype(BedFileReader bedReader, SNP snp) { + char[] genotype = new char[2]; + byte genotypeCode = bedReader.nextGenotype(); + if (snp == null) { + // unplaced marker... we need to read the genotype off the reader so we don't lose + // our place, but we cannot put the marker in the geli file. 
+ return null; + } + switch (genotypeCode) { + case BedFileReader.GENOTYPE_AA: + genotype[0] = (char) snp.getAllele1(); + genotype[1] = (char) snp.getAllele1(); + break; + case BedFileReader.GENOTYPE_AB: + genotype[0] = (char) snp.getAllele1(); + genotype[1] = (char) snp.getAllele2(); + break; + case BedFileReader.GENOTYPE_BB: + genotype[0] = (char) snp.getAllele2(); + genotype[1] = (char) snp.getAllele2(); + break; + case BedFileReader.GENOTYPE_NO_CALL: + // don't record a genotype likelihood for a no call + return null; + default: + throw new PicardException("Unknown genotype code: " + Integer.toBinaryString(genotypeCode)); + } + return genotype; + } + + private void cacheSnps() { + BasicTextFileParser bimReader = null; + try { + bimReader = new BasicTextFileParser(true, this.BIM); + this.snpCache = new LinkedList(); + for (String[] bimFields : bimReader) { + SNP snp = constructSnp(bimFields); + snpCache.add(snp); + } + } finally { + try { + bimReader.close(); + } catch (Exception e) { + } + } + } + + private SNP constructSnp(String[] bimFields) { + byte referenceIndex = getReferenceIndex(bimFields[0]); + if (referenceIndex == -1) { + return null; + } + SNP snp = new SNP( + referenceIndex, + Integer.parseInt(bimFields[3]), + bimFields[4].toUpperCase().getBytes()[0], + bimFields[5].toUpperCase().getBytes()[0]); + return snp; + } + + /** + * determines the index in the sequence dictionary for the given chromosome + */ + private byte getReferenceIndex(String chromosome) { + final String referenceName; + int chromosomeNumber; + try { + chromosomeNumber = Integer.parseInt(chromosome); + } catch (NumberFormatException e) { + chromosomeNumber = -1; + } + + if (chromosomeNumber >= 1 && chromosomeNumber <= 22) { + referenceName = ("chr" + chromosome).intern(); + } else if (chromosomeNumber == 26 || chromosome.equalsIgnoreCase("MT")) { + referenceName = "chrM"; + } else if (chromosomeNumber == 23 || chromosomeNumber == 25 || + chromosome.equalsIgnoreCase("XY") || 
chromosome.equalsIgnoreCase("X")) { + referenceName = "chrX"; + } else if (chromosomeNumber == 24 || chromosome.equalsIgnoreCase("Y")) { + referenceName = "chrY"; + } else { + // unplaced marker + return -1; + } + + Byte referenceIndex = this.referenceIndexes.get(referenceName); + if (referenceIndex == null) { + throw new PicardException("Reference sequence [" + referenceName + "] not found in sequence dictionary"); + } + return referenceIndex; + } + + private void cacheGELIFileNames() { + BasicTextFileParser famReader = null; + try { + famReader = new BasicTextFileParser(true, this.FAM); + this.geliFileNames = new LinkedList(); + for (String[] fields : famReader) { + this.geliFileNames.add(getGeliFileName(fields[0], fields[1])); + } + } finally { + try { + famReader.close(); + } catch (Exception e) { + } + } + } + + private void parseSnpMajor(BedFileReader bedReader) { + cacheGELIFileNames(); + BasicTextFileParser bimReader = new BasicTextFileParser(true, this.BIM); + Map> likelihoodsByFile = + new HashMap>( + (int) Math.ceil(this.geliFileNames.size() * 1.34)); + + int maxRecordsInRam = calculateMaxRecordsInRam(); + for (String geliFileName : this.geliFileNames) { + likelihoodsByFile.put(geliFileName, SortingCollection.newInstance( + GenotypeLikelihoods.class, + new GenotypeLikelihoodsCodec(), + new GenotypeLikelihoodsComparator(), + maxRecordsInRam)); + } + + for (String[] bimFields : bimReader) { + for (String fileName : this.geliFileNames) { + SNP snp = constructSnp(bimFields); + GenotypeLikelihoods genotypeLikelihoods = constructGenotypeLikelihoods( + bedReader, snp); + if (genotypeLikelihoods != null) { + likelihoodsByFile.get(fileName).add(genotypeLikelihoods); + } + } + bedReader.dropRemainingBlock(); + } + bimReader.close(); + + writeGeliFiles(likelihoodsByFile); + } + + /** + * @return + */ + private int calculateMaxRecordsInRam() { + Runtime.getRuntime().gc(); + double memoryToUse = Runtime.getRuntime().maxMemory() * .8; // use up to 80% + int 
objectCountLimit = (int) (memoryToUse / GenotypeLikelihoods.OBJECT_SIZE_BYTES); + return objectCountLimit / this.geliFileNames.size(); + } + + /** + * @param likelihoodsByFile + */ + private void writeGeliFiles( + Map> likelihoodsByFile) { + + for (Map.Entry> entry : likelihoodsByFile.entrySet()) { + GeliFileWriter fileWriter = getGeliFileWriter(entry.getKey(), true); + for (GenotypeLikelihoods likelihoods : entry.getValue()) { + fileWriter.addGenotypeLikelihoods(likelihoods); + } + fileWriter.close(); + } + } + + private GeliFileWriter getGeliFileWriter( + String fileName, boolean presorted) { + File geliFile = new File(this.OUTPUT_DIR, fileName); + GeliFileWriter fileWriter = new GeliFileWriter(geliFile, presorted); + SAMFileHeader header = new SAMFileHeader(); + header.setAttribute(SAMFileHeader.VERSION_TAG, "1.0"); + header.setSequences(this.sequenceDictionary); + fileWriter.setHeader(header); + return fileWriter; + } + + /** + * @param bedReader + * @param snp + * @return + */ + private GenotypeLikelihoods constructGenotypeLikelihoods( + BedFileReader bedReader, SNP snp) { + char[] genotype = getNextGenotype(bedReader, snp); + if (genotype == null) { + // no call or unplaced marker + return null; + } + + GenotypeLikelihoods genotypeLikelihoods = new GenotypeLikelihoods(); + genotypeLikelihoods.setLikelihood( + GenotypeLikelihoods.getLikelihoodIndex(genotype), + LIKELIHOOD); + genotypeLikelihoods.setReferenceIndex(snp.getReferenceIndex()); + genotypeLikelihoods.setPosition(snp.getPosition()); + return genotypeLikelihoods; + } + + /** + * populates bed/bim/fam if bfile option is used + */ + private void populateFileNames() { + if (this.BFILE != null) { + this.BED = new File(this.BFILE + ".bed"); + this.BIM = new File(this.BFILE + ".bim"); + this.FAM = new File(this.BFILE + ".fam"); + } + } + + /** + * @return the appropriate name taking into account this.USE_FAMILY + */ + private String getGeliFileName(String family, String individual) { + StringBuilder fileName 
/**
 * Immutable holder for a single SNP: its position in the sequence dictionary
 * and its two alleles as ASCII bytes.
 *
 * @author Doug Voet
 */
public class SNP {
    private final byte refIndex;
    private final int pos;
    private final byte alleleA;
    private final byte alleleB;

    public SNP(byte chromosome, int position, byte allele1, byte allele2) {
        this.refIndex = chromosome;
        this.pos = position;
        this.alleleA = allele1;
        this.alleleB = allele2;
    }

    /** @return index of this SNP's chromosome in the sequence dictionary */
    public byte getReferenceIndex() {
        return this.refIndex;
    }

    /** @return 1-based position of this SNP on its chromosome */
    public int getPosition() {
        return this.pos;
    }

    /** @return ASCII byte of the first allele */
    public byte getAllele1() {
        return this.alleleA;
    }

    /** @return ASCII byte of the second allele */
    public byte getAllele2() {
        return this.alleleB;
    }
}
+* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.picard.io; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + +import edu.mit.broad.picard.PicardException; + +/** + * A class for utility methods that wrap or aggregate functionality in Java IO. + * + * @author Tim Fennell + */ +public class IoUtil { + /** + * Checks that a file is non-null, exists, is not a directory and is readable. If any + * condition is false then a runtime exception is thrown. + * + * @param file the file to check for readability + */ + public static void assertFileIsReadable(File file) { + if (file == null) { + throw new IllegalArgumentException("Cannot check readability of null file."); + } else if (!file.exists()) { + throw new PicardException("Cannot read non-existent file: " + file.getAbsolutePath()); + } + else if (file.isDirectory()) { + throw new PicardException("Cannot read file because it is a directory: " + file.getAbsolutePath()); + } + else if (!file.canRead()) { + throw new PicardException("File exists but is not readable: " + file.getAbsolutePath()); + } + } + + /** + * Checks that a file is non-null, and is either extent and writable, or non-existent but + * that the parent directory exists and is writable. If any + * condition is false then a runtime exception is thrown. 
+ * + * @param file the file to check for writability + */ + public static void assertFileIsWritable(File file) { + if (file == null) { + throw new IllegalArgumentException("Cannot check readability of null file."); + } else if (!file.exists()) { + // If the file doesn't exist, check that it's parent directory does and is writable + File parent = file.getAbsoluteFile().getParentFile(); + if (!parent.exists()) { + throw new PicardException("Cannot write file: " + file.getAbsolutePath() + ". " + + "Neither file nor parent directory exist."); + } + else if (!parent.isDirectory()) { + throw new PicardException("Cannot write file: " + file.getAbsolutePath() + ". " + + "File does not exist and parent is not a directory."); + } + else if (!parent.canWrite()) { + throw new PicardException("Cannot write file: " + file.getAbsolutePath() + ". " + + "File does not exist and parent directory is not writable.."); + } + } + else if (file.isDirectory()) { + throw new PicardException("Cannot write file because it is a directory: " + file.getAbsolutePath()); + } + else if (!file.canWrite()) { + throw new PicardException("File exists but is not writable: " + file.getAbsolutePath()); + } + } + + /** + * Checks that a directory is non-null, extent, writable and a directory + * otherwise a runtime exception is thrown. 
+ * + * @param dir the dir to check for writability + */ + public static void assertDirectoryIsWritable(File dir) { + if (dir == null) { + throw new IllegalArgumentException("Cannot check readability of null file."); + } + else if (!dir.exists()) { + throw new PicardException("Directory does not exist: " + dir.getAbsolutePath()); + } + else if (!dir.isDirectory()) { + throw new PicardException("Cannot write to directory because it is not a directory: " + dir.getAbsolutePath()); + } + else if (!dir.canWrite()) { + throw new PicardException("Directory exists but is not writable: " + dir.getAbsolutePath()); + } + } + + /** + * Opens a file for reading, decompressing it if necessary + * + * @param file The file to open + * @return the input stream to read from + */ + public static InputStream openFileForReading(File file) { + + try { + if (file.getName().endsWith(".gz") || + file.getName().endsWith(".bfq") || + file.getName().endsWith(".map")) { + return new GZIPInputStream(new FileInputStream(file)); + } + //TODO: Other compression formats + else { + return new FileInputStream(file); + } + } + catch (IOException ioe) { + throw new PicardException("File not found: " + file.getName(), ioe); + } + + } + + /** + * Opens a file for writing, overwriting the file if it already exists + * + * @param file the file to write to + * @return the output stream to write to + */ + public static OutputStream openFileForWriting(File file) { + return openFileForWriting(file, false); + } + + /** + * Opens a file for writing + * + * @param file the file to write to + * @param append whether to append to the file if it already exists (we overwrite it if false) + * @return the output stream to write to + */ + public static OutputStream openFileForWriting(File file, boolean append) { + + try { + if (file.getName().endsWith(".gz") || + file.getName().endsWith(".bfq") || + file.getName().endsWith(".map")) { + return new GZIPOutputStream(new FileOutputStream(file, append)); + } + //TODO: Other 
compression formats + else { + return new FileOutputStream(file, append); + } + } + catch (IOException ioe) { + throw new PicardException("Error opening file for writing: " + file.getName(), ioe); + } + } + + /** + * Utility method to copy the contents of input to output. The caller is responsible for + * opening and closing both streams. + * + * @param input contents to be copied + * @param output destination + */ + public static void copyStream(InputStream input, OutputStream output) { + try { + byte[] buffer = new byte[1024]; + int bytesRead = 0; + while((bytesRead = input.read(buffer)) > 0) { + output.write(buffer, 0, bytesRead); + } + } catch (IOException e) { + throw new PicardException("Exception copying stream", e); + } + } + +} diff --git a/lib/edu/mit/broad/picard/metrics/AggregateMetricCollector.java b/lib/edu/mit/broad/picard/metrics/AggregateMetricCollector.java new file mode 100644 index 0000000000..fa611ff091 --- /dev/null +++ b/lib/edu/mit/broad/picard/metrics/AggregateMetricCollector.java @@ -0,0 +1,50 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ + +package edu.mit.broad.picard.metrics; + +import edu.mit.broad.sam.SAMRecord; + +public class AggregateMetricCollector implements MetricCollector { + private final MetricCollector[] collectors; + + public AggregateMetricCollector(MetricCollector... 
collectors) { + if (collectors.length == 0) { + throw new IllegalArgumentException("Must supply at least one collector."); + } + this.collectors = collectors; + } + + @Override + public void addRecord(SAMRecord record) { + for (MetricCollector collector : this.collectors) { + collector.addRecord(record); + } + } + + @Override + public void onComplete() { + for (MetricCollector collector : this.collectors) { + collector.onComplete(); + } + } + + @Override + public void setMetrics(T metrics) { + for (MetricCollector collector : this.collectors) { + collector.setMetrics(metrics); + } + } + + @Override + public T getMetrics() { + return this.collectors[0].getMetrics(); + } +} \ No newline at end of file diff --git a/lib/edu/mit/broad/picard/metrics/Header.java b/lib/edu/mit/broad/picard/metrics/Header.java new file mode 100644 index 0000000000..3ae8f21794 --- /dev/null +++ b/lib/edu/mit/broad/picard/metrics/Header.java @@ -0,0 +1,17 @@ +package edu.mit.broad.picard.metrics; + +/** + * A header for a metrics file. A header simply consists of a type and some arbitrary + * data, but must be able to turn itself into a String and parse it's data back out + * of that String at a later date. + * + * @author Tim Fennell + */ +public interface Header { + /** Converts the header to a String for persisting to a file. */ + public String toString(); + + /** Parses the data contained in the String version of the header. */ + public void parse(String in); + +} diff --git a/lib/edu/mit/broad/picard/metrics/MetricBase.java b/lib/edu/mit/broad/picard/metrics/MetricBase.java new file mode 100644 index 0000000000..21c1226cd7 --- /dev/null +++ b/lib/edu/mit/broad/picard/metrics/MetricBase.java @@ -0,0 +1,77 @@ +package edu.mit.broad.picard.metrics; + +import edu.mit.broad.picard.PicardException; +import edu.mit.broad.picard.util.FormatUtil; + +import java.lang.reflect.Field; + +/** + * A base class from which all Metric classes should inherit. 
+ * + * @author Tim Fennell + */ +public class MetricBase { + /** + * An equals method that checks equality by asserting that the classes are of the exact + * same type and that all public fields are equal. + * + * @param o an instance to compare to + * @return true if they are equal, false otherwise + */ + public boolean equals(Object o) { + if (o == null) return false; + if (o.getClass() != getClass()) return false; + + // Loop through all the fields and check that they are either + // null in both objects or equal in both objects + for (Field f : getClass().getFields()) { + try { + Object lhs = f.get(this); + Object rhs = f.get(o); + + if (lhs == null) { + if (rhs == null) { + // keep going + } + else if (rhs != null) { + return false; + } + } + else { + if (lhs.equals(rhs)) { + // keep going + } + else { + return false; + } + } + } + catch (IllegalAccessException iae) { + throw new PicardException("Could not read field " + f.getName() + " from a " + getClass().getSimpleName()); + } + } + + // If we got this far all the fields are equal + return true; + } + + /** Converts the metric class to a human readable string. 
*/ + public String toString() { + StringBuilder buffer = new StringBuilder(); + FormatUtil formatter = new FormatUtil(); + + for (Field f : getClass().getFields()) { + try { + buffer.append(f.getName()); + buffer.append("\t"); + buffer.append(formatter.format(f.get(this))); + buffer.append("\n"); + } + catch (IllegalAccessException iae) { + throw new PicardException("Could not read field " + f.getName() + " from a " + getClass().getSimpleName()); + } + } + + return buffer.toString(); + } +} diff --git a/lib/edu/mit/broad/picard/metrics/MetricCollector.java b/lib/edu/mit/broad/picard/metrics/MetricCollector.java new file mode 100644 index 0000000000..e84fed450e --- /dev/null +++ b/lib/edu/mit/broad/picard/metrics/MetricCollector.java @@ -0,0 +1,24 @@ +package edu.mit.broad.picard.metrics; + +import edu.mit.broad.sam.SAMRecord; + +/** + * Interface for objects that collect metrics about SAMRecords. + */ +public interface MetricCollector { + T getMetrics(); + + /** Called after collector is constructed to populate the metrics object. */ + void setMetrics(T metrics); + + /** + * Called when collection is complete. Implementations can do any calculations + * that must wait until all records are visited at this time. + */ + void onComplete(); + + /** + * Visitor method called to have the record considered by the collector. 
+ */ + void addRecord(SAMRecord record); +} \ No newline at end of file diff --git a/lib/edu/mit/broad/picard/metrics/MetricsFile.java b/lib/edu/mit/broad/picard/metrics/MetricsFile.java new file mode 100644 index 0000000000..72c6da8423 --- /dev/null +++ b/lib/edu/mit/broad/picard/metrics/MetricsFile.java @@ -0,0 +1,370 @@ +package edu.mit.broad.picard.metrics; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.Reader; +import java.io.Writer; +import java.lang.reflect.Field; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import edu.mit.broad.picard.PicardException; +import edu.mit.broad.picard.util.FormatUtil; +import edu.mit.broad.picard.util.Histogram; +import edu.mit.broad.picard.util.StringUtil; + +/** + * Contains a set of metrics that can be written to a file and parsed back + * again. The set of metrics is composed of zero or more instances of a class, + * BEAN, that extends {@link MetricBase} (all instances must be of the same type) + * and may optionally include a histogram of data. + * + * @author Tim Fennell + */ +public class MetricsFile { + public static final String MAJOR_HEADER_PREFIX = "## "; + public static final String MINOR_HEADER_PREFIX = "# "; + public static final String SEPARATOR = "\t"; + public static final String HISTO_HEADER = "## HISTOGRAM\t"; + public static final String METRIC_HEADER = "## METRICS CLASS\t"; + + private List

    headers = new ArrayList
    (); + private List metrics = new ArrayList(); + private Histogram histogram; + + /** Adds a header to the collection of metrics. */ + public void addHeader(Header h) { this.headers.add(h); } + + /** Returns the list of headers. */ + public List
    getHeaders() { return Collections.unmodifiableList(this.headers); } + + /** Adds a bean to the collection of metrics. */ + public void addMetric(BEAN bean) { this.metrics.add(bean); } + + /** Returns the list of headers. */ + public List getMetrics() { return Collections.unmodifiableList(this.metrics); } + + /** Returns the histogram contained in the metrics file if any. */ + public Histogram getHistogram() { return histogram; } + + /** Sets the histogram contained in the metrics file. */ + public void setHistogram(Histogram histogram) { this.histogram = histogram; } + + /** Returns the list of headers with the specified type. */ + public List
    getHeaders(Class type) { + List
    tmp = new ArrayList
    (); + for (Header h : this.headers) { + if (h.getClass().equals(type)) { + tmp.add(h); + } + } + + return tmp; + } + + /** + * Writes out the metrics file to the supplied file. The file is written out + * headers first, metrics second and histogram third. + * + * @param f a File into which to write the metrics + */ + public void write(File f) { + FileWriter w = null; + try { + w = new FileWriter(f); + write(w); + } + catch (IOException ioe) { + throw new PicardException("Could not write metrics to file: " + f.getAbsolutePath(), ioe); + } + finally { + if (w != null) { + try { + w.close(); + } catch (IOException e) { + } + } + } + } + + /** + * Writes out the metrics file to the supplied writer. The file is written out + * headers first, metrics second and histogram third. + * + * @param w a Writer into which to write the metrics + */ + public void write(Writer w) { + try { + FormatUtil formatter = new FormatUtil(); + BufferedWriter out = new BufferedWriter(w); + printHeaders(out); + out.newLine(); + + printBeanMetrics(out, formatter); + out.newLine(); + + printHistogram(out, formatter); + out.newLine(); + out.flush(); + } + catch (IOException ioe) { + throw new PicardException("Could not write metrics file.", ioe); + } + } + + /** Prints the headers into the provided PrintWriter. */ + private void printHeaders(BufferedWriter out) throws IOException { + for (Header h : this.headers) { + out.append(MAJOR_HEADER_PREFIX); + out.append(h.getClass().getName()); + out.newLine(); + out.append(MINOR_HEADER_PREFIX); + out.append(h.toString()); + out.newLine(); + } + } + + /** Prints each of the metrics entries into the provided PrintWriter. 
*/ + private void printBeanMetrics(BufferedWriter out, FormatUtil formatter) throws IOException { + if (this.metrics.isEmpty()) { + return; + } + + // Write out a header row with the type of the metric class + out.append(METRIC_HEADER + getBeanType().getName()); + out.newLine(); + + // Write out the column headers + Field[] fields = getBeanType().getFields(); + final int fieldCount = fields.length; + + for (int i=0; i.Bin bin : this.histogram.values()) { + out.append(StringUtil.assertCharactersNotInString(formatter.format(bin.getId()), '\t', '\n')); + out.append(MetricsFile.SEPARATOR); + out.append(formatter.format(bin.getValue())); + out.newLine(); + } + } + } + + /** Gets the type of the metrics bean being used. */ + private Class getBeanType() { + if (this.metrics == null || this.metrics.isEmpty()) { + return null; + } else { + return this.metrics.get(0).getClass(); + } + } + + /** Reads the Metrics in from the given reader. */ + public void read(Reader r) { + BufferedReader in = new BufferedReader(r); + FormatUtil formatter = new FormatUtil(); + String line = null; + + try { + // First read the headers + Header header = null; + boolean inHeader = true; + while ((line = in.readLine()) != null && inHeader) { + line = line.trim(); + // A blank line signals the end of the headers, otherwise parse out + // the header types and values and build the headers. 
+ if ("".equals(line)) { + inHeader = false; + } + else if (line.startsWith(MAJOR_HEADER_PREFIX)) { + if (header != null) { + throw new IllegalStateException("Consecutive header class lines encountered."); + } + + String className = line.substring(MAJOR_HEADER_PREFIX.length()).trim(); + try { + header = (Header) Class.forName(className).newInstance(); + } + catch (Exception e) { + throw new PicardException("Error load and/or instantiating an instance of " + className, e); + } + } + else if (line.startsWith(MINOR_HEADER_PREFIX)) { + if (header == null) { + throw new IllegalStateException("Header class must precede header value:" + line); + } + header.parse(line.substring(MINOR_HEADER_PREFIX.length())); + this.headers.add(header); + header = null; + } + else { + throw new PicardException("Illegal state. Found following string in metrics file header: " + line); + } + } + + // Then read the metrics if there are any + while (!line.startsWith(MAJOR_HEADER_PREFIX)) { + line = in.readLine().trim(); + } + if (line.startsWith(METRIC_HEADER)) { + // Get the metric class from the header + String className = line.split(SEPARATOR)[1]; + Class type = null; + try { + type = Class.forName(className); + } + catch (ClassNotFoundException cnfe) { + throw new PicardException("Could not locate class with name " + className, cnfe); + } + + // Read the next line with the column headers + String[] fieldNames = in.readLine().split(SEPARATOR); + Field[] fields = new Field[fieldNames.length]; + for (int i=0; i 0) { + value = formatter.parseObject(values[i], fields[i].getType()); + } + + try { fields[i].set(bean, value); } + catch (Exception e) { + throw new PicardException("Error setting field " + fields[i].getName() + + " on class of type " + type.getName(), e); + } + } + + this.metrics.add(bean); + } + } + } + + // Then read the histogram if it is present + while (line != null && !line.startsWith(MAJOR_HEADER_PREFIX)) { + line = in.readLine(); + } + if (line != null && 
line.startsWith(HISTO_HEADER)) { + // Get the key type of the histogram + String keyClassName = line.split(SEPARATOR)[1].trim(); + Class keyClass = null; + + try { keyClass = Class.forName(keyClassName); } + catch (ClassNotFoundException cnfe) { throw new PicardException("Could not load class with name " + keyClassName); } + + // Read the next line with the bin and value labels + String[] labels = in.readLine().split(SEPARATOR); + this.histogram = new Histogram(labels[0], labels[1]); + + // Read the entries in the histogram + while ((line = in.readLine()) != null && !"".equals(line)) { + String[] fields = line.trim().split(SEPARATOR); + HKEY key = (HKEY) formatter.parseObject(fields[0], keyClass); + double value = formatter.parseDouble(fields[1]); + this.histogram.increment(key, value); + } + } + } + catch (IOException ioe) { + throw new PicardException("Could not read metrics from reader.", ioe); + } + } + + /** Checks that the headers, metrics and histogram are all equal. */ + @Override + public boolean equals(Object o) { + if (getClass() != o.getClass()) { + return false; + } + MetricsFile that = (MetricsFile) o; + + if (!this.headers.equals(that.headers)) { + return false; + } + if (!this.metrics.equals(that.metrics)) { + return false; + } + if (this.histogram == null && that.histogram == null) { + return true; + } else if (this.histogram != null) { + return this.histogram.equals(that.histogram); + } else if (that.histogram != null) { + return that.histogram.equals(this.histogram); + } + + return true; + } +} diff --git a/lib/edu/mit/broad/picard/metrics/StringHeader.java b/lib/edu/mit/broad/picard/metrics/StringHeader.java new file mode 100644 index 0000000000..6798def882 --- /dev/null +++ b/lib/edu/mit/broad/picard/metrics/StringHeader.java @@ -0,0 +1,43 @@ +package edu.mit.broad.picard.metrics; + +import edu.mit.broad.picard.util.StringUtil; + +/** + * A simple header who's data type is a single String. 
Should not be used for anything other + * than comments or descriptive text. + * + * @author Tim Fennell + */ +public class StringHeader implements Header { + private String value; + + /** Default constructor. */ + public StringHeader() {} + + /** Constructor that uses the supplied value as the value of the header. */ + public StringHeader(String value) { + setValue(value); + } + + public void parse(String in) { value = in.trim(); } + public String toString() { return value; } + + public String getValue() { return value; } + public void setValue(String value) { this.value = StringUtil.assertCharactersNotInString(value, '\n'); } + + /** Checks equality on the value of the header. */ + public boolean equals(Object o) { + if (o != null && o instanceof StringHeader) { + StringHeader that = (StringHeader) o; + if (this.value == null) { + return that.value == null; + } + else { + return this.value.equals(that.value); + } + } + else { + return false; + } + } +} diff --git a/lib/edu/mit/broad/picard/metrics/VersionHeader.java b/lib/edu/mit/broad/picard/metrics/VersionHeader.java new file mode 100644 index 0000000000..665f39ecf9 --- /dev/null +++ b/lib/edu/mit/broad/picard/metrics/VersionHeader.java @@ -0,0 +1,50 @@ +package edu.mit.broad.picard.metrics; + +import edu.mit.broad.picard.util.StringUtil; + +/** + * Header that stores information about the version of some piece of software or + * data used to create the metrics file. Payload consists of a name or description + * of the versioned item and a version string. 
+ * + * @author Tim Fennell + */ +public class VersionHeader implements Header { + private String versionedItem; + private String versionString; + + public void parse(String in) { + String[] fields = in.split("\t"); + this.versionedItem = fields[0]; + this.versionString = fields[1]; + } + + public String toString() { + return this.versionedItem + "\t" + this.versionString; + } + + public String getVersionedItem() { return versionedItem; } + public void setVersionedItem(String versionedItem) { + this.versionedItem = StringUtil.assertCharactersNotInString(versionedItem, '\t', '\n'); + } + + public String getVersionString() { return versionString; } + public void setVersionString(String versionString) { + this.versionString = StringUtil.assertCharactersNotInString(versionString, '\t', '\n'); + } + + /** Equals method that checks that both the item and version string are equal. */ + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + VersionHeader that = (VersionHeader) o; + + if (versionString != null ? !versionString.equals(that.versionString) : that.versionString != null) + return false; + if (versionedItem != null ? 
!versionedItem.equals(that.versionedItem) : that.versionedItem != null) + return false; + + return true; + } +} diff --git a/lib/edu/mit/broad/picard/quality/CalibrateQualityScores.java b/lib/edu/mit/broad/picard/quality/CalibrateQualityScores.java new file mode 100644 index 0000000000..9aa59618f6 --- /dev/null +++ b/lib/edu/mit/broad/picard/quality/CalibrateQualityScores.java @@ -0,0 +1,148 @@ +package edu.mit.broad.picard.quality; + +import edu.mit.broad.picard.cmdline.CommandLineProgram; +import edu.mit.broad.picard.cmdline.Option; +import edu.mit.broad.picard.io.IoUtil; +import edu.mit.broad.picard.reference.ReferenceSequenceFile; +import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory; +import edu.mit.broad.picard.variation.DbSnpFileReader; +import edu.mit.broad.picard.util.Log; +import edu.mit.broad.sam.SAMFileReader; +import edu.mit.broad.sam.SAMFileWriter; +import edu.mit.broad.sam.SAMFileWriterFactory; +import edu.mit.broad.sam.SAMRecord; + +import java.io.File; +import java.io.PrintStream; + +/** + * Command line program to calibrate quality scores using alignment and dbsnp data. Calibrates + * qualities cycle by cycle and separately for reads one and two in a pair. Bases that fall + * within dbSNP loci are ignored otherwise the empircal mismatch rate is calculated for + * each quality at each cycle and used to calculate the calibrated quality value. + * + * @author Tim Fennell + */ +public class CalibrateQualityScores extends CommandLineProgram { + @Option(shortName="A", doc="A file of aligned reads in SAM or BAM format") + public File ALIGNED_SAM; + + @Option(shortName="I", doc="A SAM or BAM file to rewrite with calibrated qualities. 
If omitted ALIGNED_SAM is used.", optional=true) + public File INPUT; + + @Option(shortName="O", doc="The SAM or BAM file to write with updated qualities.") + public File OUTPUT; + + @Option(shortName="R", doc="Reference sequence file") + public File REFERENCE; + + @Option(shortName="SNP", doc="Binary file of dbSNP information", optional=true) + public File DBSNP_FILE; + + @Option(shortName="TABLE", doc="A file to output the calibration table(s) to.") + public File CALIBRATION_TABLE_OUT; + + @Option(doc="Optional limit to the number of aligned reads that should be procesed", optional=true) + public Integer READ_LIMIT = -1; + + /** Stock main method for a command line program. */ + public static void main(String[] argv) { + System.exit(new CalibrateQualityScores().instanceMain(argv)); + } + + /** + * Main method for the program. Checks that all input files are present and + * readable and that the output file can be written to. Then loads up all the + * data and calibrates the quality scores and proceeds to write an output file + * with calibrated quality scores instead of the input quality scores. 
+ */ + protected int doWork() { + final Log log = Log.getInstance(getClass()); + + // Some quick parameter checking + if (INPUT == null) INPUT = ALIGNED_SAM; + + IoUtil.assertFileIsReadable(ALIGNED_SAM); + IoUtil.assertFileIsReadable(REFERENCE); + IoUtil.assertFileIsReadable(INPUT); + IoUtil.assertFileIsWritable(OUTPUT); + IoUtil.assertFileIsWritable(CALIBRATION_TABLE_OUT); + + log.info("Reading input files and calculating calibration matrices."); + + // Load things up and calculate the quality score calibrations + SAMFileReader sam = new SAMFileReader(ALIGNED_SAM); + ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE); + DbSnpFileReader dbsnp = null; + + if (DBSNP_FILE != null) { + IoUtil.assertFileIsReadable(DBSNP_FILE); + dbsnp = new DbSnpFileReader(DBSNP_FILE); + } + + QualityScoreCalibrator calibrator = new QualityScoreCalibrator(sam, ref, dbsnp); + calibrator.calibrate(READ_LIMIT); + + // Dump the calibration tables + log.info("Writing out calibration table."); + PrintStream stream = new PrintStream(IoUtil.openFileForWriting(CALIBRATION_TABLE_OUT)); + stream.println("Read 1 Calibration Table:"); + print(stream, calibrator.getRead1Matrix().getCalibratedQualities()); + + if (!calibrator.getRead2Matrix().isEmpty()) { + stream.println(); + stream.println("Read 2 Calibration Table:"); + print(stream, calibrator.getRead2Matrix().getCalibratedQualities()); + } + + // And then load up the input and rewrite with calibrated qualities + log.info("Writing file with calibrated qualities."); + SAMFileReader in = new SAMFileReader(INPUT); + SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(in.getFileHeader(), true, OUTPUT); + + for (SAMRecord rec : in) { + byte[] quals = rec.getBaseQualities(); + byte[] calibrated = new byte[quals.length]; + QualityScoreMatrix matrix = rec.getFirstOfPairFlag() ? 
calibrator.getRead1Matrix() : calibrator.getRead2Matrix(); + + for (int i=0; i samIterator = this.sam.iterator(); + SAMRecord read = samIterator.next(); + int readsProcessed = 0; + + // Quality score matrixes for reads 1 and 2 separately + this.read1Matrix = new QualityScoreMatrix(); + this.read2Matrix = new QualityScoreMatrix(); + + + refloop: while ((reference = this.ref.nextSequence()) != null) { + final byte[] refBases = reference.getBases(); + final BitSet snps = getDbSnpMask(reference); + + while (read != null && read.getReferenceIndex(header) == reference.getContigIndex()) { + if (!read.getReadUnmappedFlag() && !read.getNotPrimaryAlignmentFlag()) { + final QualityScoreMatrix matrix = read.getFirstOfPairFlag() ? this.read1Matrix : this.read2Matrix; + final byte[] readBases = read.getReadBases(); + final byte[] qualities = read.getBaseQualities(); + + for (AlignmentBlock block : read.getAlignmentBlocks()) { + final int readIndex = block.getReadStart() - 1; + final int refIndex = block.getReferenceStart() - 1; + final int length = block.getLength(); + + for (int i=0; i 0 && ++readsProcessed >= readLimit) { + break refloop; + } + } + + // Advance the sam iterator + if (samIterator.hasNext()) { + read = samIterator.next(); + } + else { + read = null; + } + } + } + + this.read1Matrix.computeCalibratedQualities(); + if (!this.read2Matrix.isEmpty()) this.read2Matrix.computeCalibratedQualities(); + } + + /** Gets the calibration matrix for the first read. */ + public QualityScoreMatrix getRead1Matrix() { return read1Matrix; } + + /** Gets the calibration matrix for the second read. May be empty if there was no second read data. */ + public QualityScoreMatrix getRead2Matrix() { return read2Matrix; } + + /** + * Returns a BitSet that denotes whether a dbSNP entry is present at each + * base in the reference sequence. The set is reference.length() + 1 so that + * it can be indexed by 1-based reference base. True means dbSNP present, + * false means no dbSNP present. 
+ */ + private BitSet getDbSnpMask(ReferenceSequence reference) { + int index = reference.getContigIndex(); + BitSet bits = new BitSet(reference.length() + 1); + + /* Just return an all false bit set if we don't have dbsnp data. */ + if (this.dbsnp == null) { + return bits; + } + + /* Read off the next contig's worth of data. */ + while (this.dbsnp.hasNext()) { + KnownVariant variant = this.dbsnp.peek(); + + if (variant.getSequenceIndex() < index) { + this.dbsnp.next(); + } + else if (variant.getSequenceIndex() == index) { + variant = this.dbsnp.next(); + + for (int i=variant.getStartPos(); i<=variant.getEndPos(); ++i) { + bits.set(i, true); + } + } + else { + break; + } + } + + return bits; + } +} diff --git a/lib/edu/mit/broad/picard/quality/QualityScoreMatrix.java b/lib/edu/mit/broad/picard/quality/QualityScoreMatrix.java new file mode 100644 index 0000000000..c5c1674c68 --- /dev/null +++ b/lib/edu/mit/broad/picard/quality/QualityScoreMatrix.java @@ -0,0 +1,133 @@ +package edu.mit.broad.picard.quality; + +import edu.mit.broad.picard.util.Histogram; + +import java.util.TreeMap; +import java.util.Map; +import java.util.SortedMap; + +/** + *

    Holds all the information necessary to perform quality score calibration for a single + * end/read for a lane or run of sequencing. General usage is to construct an instance + * an call {@link #addObservation(int, int, boolean)} repeatedly and when all input data + * is consumed call {@link #computeCalibratedQualities()}.

    + * + *

    Once this is done then {@link #getCalibratedQualities()} can be called to get a matrix + * of quality score calibrations by cycle and input quality. However it is preferred to call + * {@link #getCalibratedQuality(int, int)} which will attempt to infer the correct value in the + * case that the input quality was not observed in the training data.

    + * + * @author Tim Fennell + */ +public class QualityScoreMatrix { + // Maps by cycle, histograms by quality + private SortedMap> observations = new TreeMap>(); + private SortedMap> errors = new TreeMap>(); + + private int[][] calibratedQualities = null; + + /** + * Adds an observation to the matrix. + * @param cycle the cycle in the read (1-based) + * @param quality the uncalibrated quality + * @param error true if the base did not match the reference, false otherwise + */ + public void addObservation(int cycle, int quality, boolean error) { + Histogram obs = this.observations.get(cycle); + if (obs == null) { + obs = new Histogram(); + this.observations.put(cycle, obs); + } + obs.increment(quality); + + if (error) { + Histogram errs = this.errors.get(cycle); + if (errs == null) { + errs = new Histogram(); + this.errors.put(cycle, errs); + } + errs.increment(quality); + } + } + + /** + * Takes the input observations so far and builds a matrix of input cycle and + * uncalibrated quality to calibrated quality value. + */ + public void computeCalibratedQualities() { + this.calibratedQualities = new int[this.observations.lastKey() + 1][]; + + for (int cycle=1; cycle obs = this.observations.get(cycle); + Histogram err = this.errors.get(cycle); + + this.calibratedQualities[cycle] = new int[obs.lastKey() + 1]; + + for (Integer qual : obs.keySet()) { + double o = obs.get(qual).getValue(); + Histogram.Bin errBin = err.get(qual); + double e = (errBin == null) ? 1 : errBin.getValue(); + + this.calibratedQualities[cycle][qual] = computePhredScore(e, o); + } + } + } + + /** + * Returns the set of calibrated quality scores from the training data. The array is + * indexed first by the cycle (1-based, index 0 is empty) and then by input quality + * (again, the actualy quality, not shifted). 
+ * + * @return an array of calibrated qualities for the read + */ + public int[][] getCalibratedQualities() { + return calibratedQualities; + } + + /** + * Accesses the calibrated quality for the given input cycle and quality. If the quality + * is outside the range given in the training data then the upper or lower bound of + * the calibrated qualities is used instead. + * + * @param cycle the input cycle (1-based) + * @param quality the uncalibrated quality + * @return the calibrated quality for the cycle and uncalibrated quality + */ + public final int getCalibratedQuality(int cycle, int quality) { + final int[] quals = this.calibratedQualities[cycle]; + + // TODO: proper iterpolation where we don't have the right quality + try { + int retval = quals[quality]; + + // If we didn't calibrate this quality value, search up and down for non-zero + for (int i=quality; i>0 && retval == 0; --i) { + if (quals[i] != 0) retval = quals[i]; + } + + for (int i=quality; i sequenceDictionary; + private String cachedLine = null; + private int index = -1; + + /** Constructs a FastaSequenceFile that reads from the specified file. 
*/ + FastaSequenceFile(File file) { + this.file = file; + this.in = new BufferedReader(new InputStreamReader(IoUtil.openFileForReading(file))); + + // Try and locate the dictionary + String dictionaryName = file.getAbsolutePath(); + dictionaryName = dictionaryName.substring(0, dictionaryName.lastIndexOf(".fasta")); + dictionaryName += ".dict"; + File dictionary = new File(dictionaryName); + if (dictionary.exists()) { + IoUtil.assertFileIsReadable(dictionary); + + try { + SAMTextHeaderCodec codec = new SAMTextHeaderCodec(); + SAMFileHeader header = codec.decode(new AsciiLineReader(new FileInputStream(dictionary)), dictionary); + if (header.getSequences() != null && header.getSequences().size() > 0) { + this.sequenceDictionary = header.getSequences(); + } + } + catch (Exception e) { + throw new PicardException("Could not open sequence dictionary file: " + dictionaryName, e); + } + } + } + + /** + * Returns the list of sequence records associated with the reference sequence if found + * otherwise null. + */ + public List getSequenceDictionary() { + return this.sequenceDictionary; + } + + public ReferenceSequence nextSequence() { + String line = null; + String name = null; + + // Scan forward to a header line + while ((line = readNextLine()) != null) { + if (line.startsWith(">")) { + name = line.substring(1).trim(); + this.index += 1; + break; + } + } + + // No more! + if (name == null) return null; + + // Read the sequence + int basesRead = 0; + byte[] bases = new byte[250000000]; // big enough to hold human chr1! 
+ while ((line = readNextLine()) != null) { + if (line.startsWith(">")) { + pushBackLine(line); + break; + } + else { + final byte[] nextBases = line.getBytes(ASCII); + final int lineLength = nextBases.length; + + // If the array isn't big enough to hold the next chunk, resize it + if (basesRead + lineLength > bases.length) { + byte[] tmp = new byte[bases.length * 2]; + System.arraycopy(bases, 0, tmp, 0, basesRead); + bases = tmp; + } + + // Now shunt the most recent bases onto the end of the array + System.arraycopy(nextBases, 0, bases, basesRead, lineLength); + basesRead += lineLength; + } + } + + // And lastly resize the array down to the right size + if (basesRead != bases.length) { + byte[] tmp = new byte[basesRead]; + System.arraycopy(bases, 0, tmp, 0, basesRead); + bases = tmp; + } + + return new ReferenceSequence(name, this.index, bases); + } + + /** + * Reads the next line from the file, or if we've saved a line earlier, returns that + * instead. + */ + private String readNextLine() { + // If we have a cached line use it + if (this.cachedLine != null) { + String tmp = this.cachedLine; + this.cachedLine = null; + return tmp; + } + else { + try { return this.in.readLine(); } + catch (IOException ioe) { + throw new PicardException("Error reading line from file: " + this.file.getAbsolutePath(), ioe); + } + } + } + + /** Pushed a line back so that the next call to readNextLine() will return it. */ + private void pushBackLine(String line) { + this.cachedLine = line; + } +} + diff --git a/lib/edu/mit/broad/picard/reference/ReferenceSequence.java b/lib/edu/mit/broad/picard/reference/ReferenceSequence.java new file mode 100644 index 0000000000..24aebc7b54 --- /dev/null +++ b/lib/edu/mit/broad/picard/reference/ReferenceSequence.java @@ -0,0 +1,48 @@ +package edu.mit.broad.picard.reference; + +/** + * Wrapper around a reference sequence that has been read from a reference file. 
/**
 * Wrapper around a reference sequence that has been read from a reference file.
 *
 * @author Tim Fennell
 */
public class ReferenceSequence {
    /** Name of the sequence as given in the source file. */
    private final String name;
    /** The sequence bases, one byte per character; case is preserved from the source. */
    private final byte[] bases;
    /** Zero-based index of this contig within the source file. */
    private final int contigIndex;
    /** Cached length of the bases array. */
    private final int length;

    /**
     * Package level constructor that creates a fully formed ReferenceSequence.
     *
     * @param name  the name of the sequence from the source file
     * @param index the zero based index of this contig in the source file
     * @param bases the bases themselves stored as one-byte characters
     */
    ReferenceSequence(String name, int index, byte[] bases) {
        this.name = name;
        this.contigIndex = index;
        this.bases = bases;
        this.length = bases.length;
    }

    /** Gets the name given to this sequence in the source file. */
    public String getName() { return name; }

    /**
     * Gets the array of bases that define this sequence. The bases can include any
     * letter and possibly include masking information in the form of lower case
     * letters. The returned array is the internal one, NOT a copy — do not modify it!
     */
    public byte[] getBases() { return bases; }

    /** Gets the 0-based index of this contig in the source file from which it came. */
    public int getContigIndex() { return contigIndex; }

    /** Gets the length of this reference sequence in bases. */
    public int length() { return length; }

    @Override
    public String toString() {
        return "ReferenceSequence " + getName();
    }
}
+ * + * @author Tim Fennell + */ +public interface ReferenceSequenceFile { + + /** + * Must return a sequence dictionary with at least the following fields completed + * for each sequence: name, length. + * + * @return a list of sequence records representing the sequences in this reference file + */ + public List getSequenceDictionary(); + + /** + * Retrieves the next whole sequences from the file. + * @return a ReferenceSequence or null if at the end of the file + */ + public ReferenceSequence nextSequence(); + +} diff --git a/lib/edu/mit/broad/picard/reference/ReferenceSequenceFileFactory.java b/lib/edu/mit/broad/picard/reference/ReferenceSequenceFileFactory.java new file mode 100644 index 0000000000..57b5907d1d --- /dev/null +++ b/lib/edu/mit/broad/picard/reference/ReferenceSequenceFileFactory.java @@ -0,0 +1,28 @@ +package edu.mit.broad.picard.reference; + +import java.io.File; + +/** + * Factory class for creating ReferenceSequenceFile instances for reading reference + * sequences store in various formats. + * + * @author Tim Fennell + */ +public class ReferenceSequenceFileFactory { + + /** + * Attempts to determine the type of the reference file and return an instance + * of ReferenceSequenceFile that is appropriate to read it. 
+ * + * @param file the reference sequence file on disk + */ + public static ReferenceSequenceFile getReferenceSequenceFile(File file) { + String name = file.getName(); + if (name.endsWith(".fasta") || name.endsWith("fasta.gz") || name.endsWith(".txt") || name.endsWith(".txt.gz")) { + return new FastaSequenceFile(file); + } + else { + throw new IllegalArgumentException("File is not a supported reference file type: " + file.getAbsolutePath()); + } + } +} diff --git a/lib/edu/mit/broad/picard/sam/CollectAlignmentSummaryMetrics.java b/lib/edu/mit/broad/picard/sam/CollectAlignmentSummaryMetrics.java new file mode 100644 index 0000000000..a3bc8fed8e --- /dev/null +++ b/lib/edu/mit/broad/picard/sam/CollectAlignmentSummaryMetrics.java @@ -0,0 +1,352 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+ */ + +package edu.mit.broad.picard.sam; + +import java.io.File; + +import edu.mit.broad.picard.PicardException; +import edu.mit.broad.picard.cmdline.CommandLineProgram; +import edu.mit.broad.picard.cmdline.Option; +import edu.mit.broad.picard.cmdline.Usage; +import edu.mit.broad.picard.io.IoUtil; +import edu.mit.broad.picard.metrics.AggregateMetricCollector; +import edu.mit.broad.picard.metrics.MetricBase; +import edu.mit.broad.picard.metrics.MetricCollector; +import edu.mit.broad.picard.metrics.MetricsFile; +import edu.mit.broad.picard.metrics.StringHeader; +import edu.mit.broad.picard.reference.ReferenceSequence; +import edu.mit.broad.picard.reference.ReferenceSequenceFile; +import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory; +import edu.mit.broad.picard.sam.CollectAlignmentSummaryMetrics.AlignmentSummaryMetrics.Type; +import edu.mit.broad.picard.util.CoordMath; +import edu.mit.broad.picard.util.Histogram; +import edu.mit.broad.picard.util.SequenceUtil; +import edu.mit.broad.sam.AlignmentBlock; +import edu.mit.broad.sam.SAMFileHeader; +import edu.mit.broad.sam.SAMFileReader; +import edu.mit.broad.sam.SAMRecord; +import edu.mit.broad.sam.util.CloseableIterator; + +/** + * A command line tool to read a BAM file and produce standard alignment metrics that would be applicable to any alignment. + * Metrics to include, but not limited to: + *
      + *
    • Total number of reads (total, period, no exclusions)
    • + *
    • Total number of PF reads (PF == does not fail vendor check flag)
    • + *
    • Number of PF noise reads (does not fail vendor check and has noise attr set)
    • + *
    • Total aligned PF reads (any PF read that has a sequence and position)
    • + *
    • High quality aligned PF reads (high quality == mapping quality >= 20)
    • + *
    • High quality aligned PF bases (actual aligned bases, calculate off alignment blocks)
    • + *
    • High quality aligned PF Q20 bases (subset of above where base quality >= 20)
    • + *
    • Median mismatches in HQ aligned PF reads (how many aligned bases != ref on average)
    • + *
    • Reads aligned in pairs (vs. reads aligned with mate unaligned/not present)
    • + *
    • Read length (how to handle mixed lengths?)
    • + *
    • Bad Cycles - how many machine cycles yielded combined no-call and mismatch rates of >= 80%
    • + *
    • Strand balance - reads mapped to positive strand / total mapped reads
    • + *
    + * Metrics are written for the first read of a pair, the second read, and combined for the pair. + * + * @author Doug Voet + */ +public class CollectAlignmentSummaryMetrics extends CommandLineProgram { + private static final int MAPPING_QUALITY_THRESHOLD = 20; + private static final int BASE_QUALITY_THRESHOLD = 20; + + // Usage and parameters + @Usage(programVersion="1.0") + public String USAGE = "Reads a SAM or BAM file and writes a file containing summary metrics.\n"; + @Option(shortName="I", doc="SAM or BAM file") public File INPUT; + @Option(shortName="O", doc="File to write insert size metrics to") public File OUTPUT; + @Option(shortName="R", doc="Reference sequence file") public File REFERENCE; + @Option(doc="If true (default), \"unsorted\" SAM/BAM files will be considerd coordinate sorted") + public Boolean ASSUME_COODINATE_SORTED = Boolean.TRUE; + + private ReferenceSequenceFile ref; + private ReferenceSequence refSequence; + private SAMFileHeader samFileHeader; + + /** Required main method implementation. 
*/ + public static void main(String[] argv) { + System.exit(new CollectAlignmentSummaryMetrics().instanceMain(argv)); + } + + @Override + protected int doWork() { + IoUtil.assertFileIsReadable(INPUT); + IoUtil.assertFileIsReadable(REFERENCE); + IoUtil.assertFileIsWritable(OUTPUT); + SAMFileReader in = new SAMFileReader(INPUT); + assertCoordinateSortOrder(in); + + this.ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE); + this.samFileHeader = in.getFileHeader(); + + MetricsFile> file = collectMetrics(in.iterator()); + in.close(); + + file.write(OUTPUT); + + return 0; + } + + private void assertCoordinateSortOrder(SAMFileReader in) { + switch (in.getFileHeader().getSortOrder()) { + case coordinate: + break; + case unsorted: + if (this.ASSUME_COODINATE_SORTED) { + break; + } + default: + throw new PicardException("Cannot collect summary statistics in file " + INPUT.getAbsoluteFile() + + " because it is not sorted in coordinate order."); + } + } + + private ReferenceSequence getReference(SAMRecord record) { + while (refSequence == null || + record.getReferenceIndex(samFileHeader) > refSequence.getContigIndex()) { + + refSequence = ref.nextSequence(); + } + + if (refSequence == null || record.getReferenceIndex() != refSequence.getContigIndex()) { + throw new PicardException("Cannot find reference sequence [" + + record.getReferenceIndex() + "] in reference file"); + } + + return refSequence; + } + + /** + * Does all the work of iterating through the sam file and collecting summary alignment metrics. 
+ */ + private MetricsFile> collectMetrics( + CloseableIterator samIterator) { + + final MetricCollector unpairedCollector = + constructCollector(Type.UNPAIRED); + final MetricCollector firstOfPairCollector = + constructCollector(Type.FIRST_OF_PAIR); + final MetricCollector secondOfPairCollector = + constructCollector(Type.SECOND_OF_PAIR); + final MetricCollector pairCollector = + constructCollector(Type.PAIR); + + while (samIterator.hasNext()) { + SAMRecord record = samIterator.next(); + + if (record.getReadPairedFlag()) { + if (record.getFirstOfPairFlag()) { + firstOfPairCollector.addRecord(record); + } else { + secondOfPairCollector.addRecord(record); + } + pairCollector.addRecord(record); + } else { + unpairedCollector.addRecord(record); + } + } + + firstOfPairCollector.onComplete(); + secondOfPairCollector.onComplete(); + pairCollector.onComplete(); + unpairedCollector.onComplete(); + + MetricsFile> file = getMetricsFile(); + file.addHeader(new StringHeader("Input file: " + INPUT.getAbsolutePath())); + file.addHeader(new StringHeader("Output file: " + OUTPUT.getAbsolutePath())); + file.addHeader(new StringHeader("Reference file: " + REFERENCE.getAbsolutePath())); + + if (firstOfPairCollector.getMetrics().TOTAL_READS > 0) { + file.addMetric(firstOfPairCollector.getMetrics()); + // override how bad cycle is determined for paired reads, it should be + // the sum of first and second reads + pairCollector.getMetrics().BAD_CYCLES = + firstOfPairCollector.getMetrics().BAD_CYCLES + + secondOfPairCollector.getMetrics().BAD_CYCLES; + file.addMetric(secondOfPairCollector.getMetrics()); + file.addMetric(pairCollector.getMetrics()); + } + if (unpairedCollector.getMetrics().TOTAL_READS > 0) { + file.addMetric(unpairedCollector.getMetrics()); + } + + return file; + } + + private MetricCollector constructCollector(Type type) { + MetricCollector collector = + new AggregateMetricCollector(new ReadCounter(), new QualityMappingCounter()); + collector.setMetrics(new 
AlignmentSummaryMetrics()); + collector.getMetrics().TYPE = type; + return collector; + } + + public static class AlignmentSummaryMetrics extends MetricBase { + public enum Type { UNPAIRED, FIRST_OF_PAIR, SECOND_OF_PAIR, PAIR } + public Type TYPE; + public long TOTAL_READS; + public long PF_READS; + public long PF_NOISE_READS; + public long PF_READS_ALIGNED; + public long PF_HQ_ALIGNED_READS; + public long PF_HQ_ALIGNED_BASES; + public long PF_HQ_ALIGNED_Q20_BASES; + public double PF_HQ_MEDIAN_MISMATCHES; + public double MEAN_READ_LENGTH; + public long READS_ALIGNED_IN_PAIRS; + public long BAD_CYCLES; + public double STRAND_BALANCE; + } + + /** counts reads that match various conditions */ + private class ReadCounter implements MetricCollector { + private long numPositiveStrand = 0; + private Histogram readLengthHistogram = new Histogram(); + private AlignmentSummaryMetrics metrics; + + @Override + public void addRecord(SAMRecord record) { + if (record.getNotPrimaryAlignmentFlag()) { + // only want 1 count per read so skip non primary alignments + return; + } + + metrics.TOTAL_READS++; + readLengthHistogram.increment(record.getReadBases().length); + + if (!record.getReadFailsVendorQualityCheckFlag()) { + metrics.PF_READS++; + + if (isNoiseRead(record)) { + metrics.PF_NOISE_READS++; + } + if (!record.getReadUnmappedFlag()) { + metrics.PF_READS_ALIGNED++; + } + } + + if (!record.getReadUnmappedFlag() && + record.getReadPairedFlag() && + !record.getMateUnmappedFlag()) { + metrics.READS_ALIGNED_IN_PAIRS++; + } + + if (!record.getReadNegativeStrandFlag()) { + numPositiveStrand++; + } + } + + @Override + public void onComplete() { + metrics.MEAN_READ_LENGTH = readLengthHistogram.getMean(); + metrics.STRAND_BALANCE = numPositiveStrand / (double) metrics.TOTAL_READS; + } + + private boolean isNoiseRead(SAMRecord record) { + final Object noiseAttribute = record.getAttribute(ReservedTagConstants.XN); + return (noiseAttribute != null && noiseAttribute.equals(1)); + } + + 
@Override + public void setMetrics(AlignmentSummaryMetrics metrics) { + this.metrics = metrics; + } + + @Override + public AlignmentSummaryMetrics getMetrics() { + return this.metrics; + } + } + + /** counts quality mappings & base calls that match various conditions */ + private class QualityMappingCounter implements MetricCollector { + private Histogram mismatchHistogram = new Histogram(); + private Histogram badCycleHistogram = new Histogram(); + private AlignmentSummaryMetrics metrics; + + @Override + public void addRecord(SAMRecord record) { + if (record.getNotPrimaryAlignmentFlag()) { + return; + } + if (record.getReadUnmappedFlag()) { + final byte[] readBases = record.getReadBases(); + for (int i = 0; i < readBases.length; i++) { + if (SequenceUtil.isNoCall(readBases[i])) { + badCycleHistogram.increment(CoordMath.getCycle(record.getReadNegativeStrandFlag(), readBases.length, i)); + } + } + } else { + boolean highQualityMapping = isHighQualityMapping(record); + if (highQualityMapping) metrics.PF_HQ_ALIGNED_READS++; + + final byte[] readBases = record.getReadBases(); + final byte[] refBases = getReference(record).getBases(); + final byte[] qualities = record.getBaseQualities(); + long mismatchCount = 0; + + for (AlignmentBlock alignmentBlock : record.getAlignmentBlocks()) { + final int readIndex = alignmentBlock.getReadStart() - 1; + final int refIndex = alignmentBlock.getReferenceStart() - 1; + final int length = alignmentBlock.getLength(); + if (highQualityMapping) metrics.PF_HQ_ALIGNED_BASES += alignmentBlock.getLength(); + + for (int i=0; i= BASE_QUALITY_THRESHOLD) { + metrics.PF_HQ_ALIGNED_Q20_BASES++; + } + if (mismatch) { + mismatchCount++; + } + } + if (mismatch || SequenceUtil.isNoCall(readBases[readBaseIndex])) { + badCycleHistogram.increment(CoordMath.getCycle(record.getReadNegativeStrandFlag(), readBases.length, i)); + } + } + } + mismatchHistogram.increment(mismatchCount); + } + } + + private boolean isHighQualityMapping(SAMRecord record) { + 
return !record.getReadFailsVendorQualityCheckFlag() && + record.getMappingQuality() >= MAPPING_QUALITY_THRESHOLD; + } + + @Override + public void onComplete() { + metrics.PF_HQ_MEDIAN_MISMATCHES = mismatchHistogram.getMedian(); + metrics.BAD_CYCLES = 0; + + for (Histogram.Bin cycleBin : badCycleHistogram.values()) { + double badCyclePercentage = cycleBin.getValue() / metrics.TOTAL_READS; + if (badCyclePercentage >= .8) { + metrics.BAD_CYCLES++; + } + } + } + + @Override + public void setMetrics(AlignmentSummaryMetrics metrics) { + this.metrics = metrics; + } + + @Override + public AlignmentSummaryMetrics getMetrics() { + return this.metrics; + } + } +} diff --git a/lib/edu/mit/broad/picard/sam/CollectInsertSizeMetrics.java b/lib/edu/mit/broad/picard/sam/CollectInsertSizeMetrics.java new file mode 100644 index 0000000000..c25d88cc93 --- /dev/null +++ b/lib/edu/mit/broad/picard/sam/CollectInsertSizeMetrics.java @@ -0,0 +1,154 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ + +package edu.mit.broad.picard.sam; + +import java.io.File; + +import edu.mit.broad.picard.PicardException; +import edu.mit.broad.picard.cmdline.CommandLineProgram; +import edu.mit.broad.picard.cmdline.Option; +import edu.mit.broad.picard.cmdline.Usage; +import edu.mit.broad.picard.io.IoUtil; +import edu.mit.broad.picard.metrics.MetricsFile; +import edu.mit.broad.picard.util.Histogram; +import edu.mit.broad.picard.util.Log; +import edu.mit.broad.picard.util.RExecutor; +import edu.mit.broad.sam.SAMFileReader; +import edu.mit.broad.sam.SAMRecord; +import edu.mit.broad.sam.util.CloseableIterator; + +/** + * Command line program to read non-duplicate insert sizes, create a histogram + * and report distribution statistics. + * + * @author Doug Voet + */ +public class CollectInsertSizeMetrics extends CommandLineProgram { + private static Log log = Log.getInstance(CollectInsertSizeMetrics.class); + private static final String HISTOGRAM_R_SCRIPT = "edu/mit/broad/picard/sam/insertSizeHistogram.R"; + // Usage and parameters + @Usage(programVersion="1.0") + public String USAGE = "Reads a SAM or BAM file and writes a file containing metrics about " + + "the statistical distribution of insert size (excluding duplicates) " + + "and generates a histogram plot.\n"; + @Option(shortName="I", doc="SAM or BAM file") public File INPUT; + @Option(shortName="O", doc="File to write insert size metrics to") public File OUTPUT; + @Option(shortName="H", doc="File to write insert size histogram chart to") public File HISTOGRAM_FILE; + + /** Required main method implementation. 
*/ + public static void main(String[] argv) { + System.exit(new CollectInsertSizeMetrics().instanceMain(argv)); + } + + @Override + protected int doWork() { + IoUtil.assertFileIsReadable(INPUT); + IoUtil.assertFileIsWritable(OUTPUT); + IoUtil.assertFileIsWritable(HISTOGRAM_FILE); + + SAMFileReader in = new SAMFileReader(INPUT); + MetricsFile file = collectMetrics(in.iterator()); + in.close(); + + file.write(OUTPUT); + + if (file.getMetrics().get(0).READ_PAIRS == 0) { + log.warn("Input file did not contain any records with insert size information."); + } else { + int rResult = RExecutor.executeFromClasspath( + HISTOGRAM_R_SCRIPT, + OUTPUT.getAbsolutePath(), + HISTOGRAM_FILE.getAbsolutePath(), + INPUT.getName()); + + if (rResult != 0) { + throw new PicardException("R script " + HISTOGRAM_R_SCRIPT + " failed with return code " + rResult); + } + } + + return 0; + } + + /** + * Does all the work of iterating through the sam file and collecting insert size metrics. + */ + MetricsFile collectMetrics(CloseableIterator samIterator) { + Histogram insertSizeHistogram = new Histogram("insert_size", "count"); + while (samIterator.hasNext()) { + SAMRecord record = samIterator.next(); + if (skipRecord(record)) { + continue; + } + + int insertSize = Math.abs(record.getInferredInsertSize()); + insertSizeHistogram.increment(insertSize); + } + + MetricsFile file = new MetricsFile(); + file.setHistogram(insertSizeHistogram); + InsertSizeMetrics metrics = new InsertSizeMetrics(); + metrics.READ_PAIRS = (long) insertSizeHistogram.getCount(); + metrics.MAX_INSERT_SIZE = (int) insertSizeHistogram.getMax(); + metrics.MIN_INSERT_SIZE = (int) insertSizeHistogram.getMin(); + metrics.MEAN_INSERT_SIZE = insertSizeHistogram.getMean(); + metrics.STANDARD_DEVIATION = insertSizeHistogram.getStandardDeviation(); + metrics.MEDIAN_INSERT_SIZE = insertSizeHistogram.getMedian(); + + final double total = insertSizeHistogram.getCount(); + final double median = insertSizeHistogram.getMedian(); + double 
covered = 0; + double low = median; + double high = median; + + while (low >= insertSizeHistogram.getMin() || high <= insertSizeHistogram.getMax()) { + Histogram.Bin lowBin = insertSizeHistogram.get((int) low); + if (lowBin != null) covered += lowBin.getValue(); + + if (low != high) { + Histogram.Bin highBin = insertSizeHistogram.get((int) high); + if (highBin != null) covered += highBin.getValue(); + } + + double percentCovered = covered / total; + int distance = (int) (high - low) + 1; + if (percentCovered >= 0.1 && metrics.WIDTH_OF_10_PERCENT == 0) metrics.WIDTH_OF_10_PERCENT = distance; + if (percentCovered >= 0.2 && metrics.WIDTH_OF_20_PERCENT == 0) metrics.WIDTH_OF_20_PERCENT = distance; + if (percentCovered >= 0.3 && metrics.WIDTH_OF_30_PERCENT == 0) metrics.WIDTH_OF_30_PERCENT = distance; + if (percentCovered >= 0.4 && metrics.WIDTH_OF_40_PERCENT == 0) metrics.WIDTH_OF_40_PERCENT = distance; + if (percentCovered >= 0.5 && metrics.WIDTH_OF_50_PERCENT == 0) metrics.WIDTH_OF_50_PERCENT = distance; + if (percentCovered >= 0.6 && metrics.WIDTH_OF_60_PERCENT == 0) metrics.WIDTH_OF_60_PERCENT = distance; + if (percentCovered >= 0.7 && metrics.WIDTH_OF_70_PERCENT == 0) metrics.WIDTH_OF_70_PERCENT = distance; + if (percentCovered >= 0.8 && metrics.WIDTH_OF_80_PERCENT == 0) metrics.WIDTH_OF_80_PERCENT = distance; + if (percentCovered >= 0.9 && metrics.WIDTH_OF_90_PERCENT == 0) metrics.WIDTH_OF_90_PERCENT = distance; + if (percentCovered >= 0.99 && metrics.WIDTH_OF_99_PERCENT == 0) metrics.WIDTH_OF_99_PERCENT = distance; + + --low; + ++high; + } + + file.addMetric(metrics); + + return file; + } + + /** + * Figures out whether or not the record should be included in the counting of insert sizes + */ + private boolean skipRecord(SAMRecord record) { + return !record.getReadPairedFlag() || + record.getMateUnmappedFlag() || + record.getFirstOfPairFlag() || + record.getNotPrimaryAlignmentFlag() || + record.getDuplicateReadFlag() || + record.getInferredInsertSize() == 0; + } 
+ +} diff --git a/lib/edu/mit/broad/picard/sam/ComparableSamRecordIterator.java b/lib/edu/mit/broad/picard/sam/ComparableSamRecordIterator.java new file mode 100644 index 0000000000..819811720c --- /dev/null +++ b/lib/edu/mit/broad/picard/sam/ComparableSamRecordIterator.java @@ -0,0 +1,64 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright Jan 22, 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + */ +package edu.mit.broad.picard.sam; + +import edu.mit.broad.picard.util.PeekableIterator; +import edu.mit.broad.sam.SAMFileReader; +import edu.mit.broad.sam.SAMRecord; + +import java.util.Comparator; + +/** + * Iterator for SAM records that implements comparable to enable sorting of iterators. + * The comparison is performed by comparing the next record in the iterator to the next + * record in another iterator and returning the ordering between those SAM records. + */ +class ComparableSamRecordIterator extends PeekableIterator implements Comparable { + private Comparator comparator; + private SAMFileReader reader; + + /** + * Constructs an iterator for iteration over the supplied SAM file that will be + * able to compare itself to other ComparableSAMRecordIterator instances using + * the supplied comparator for ordering SAMRecords. + * + * @param sam the SAM file to read records from + * @param comparator the Comparator to use to provide ordering fo SAMRecords + */ + public ComparableSamRecordIterator(SAMFileReader sam, Comparator comparator) { + super(sam.iterator()); + this.reader = sam; + this.comparator = comparator; + } + + /** Returns the reader from which this iterator was constructed. 
*/ + public SAMFileReader getReader() { + return reader; + } + + /** + * Compares this iterator to another comparable iterator based on the next record + * available in each iterator. If the two comparable iterators have different + * comparator types internally an exception is thrown. + * + * @param that another iterator to compare to + * @return a negative, 0 or positive number as described in the Comparator interface + */ + public int compareTo(ComparableSamRecordIterator that) { + if (this.comparator.getClass() != that.comparator.getClass()) { + throw new IllegalStateException("Attempt to compare two ComparableSAMRecordIterators that " + + "have different orderings internally"); + } + + SAMRecord record = this.peek(); + SAMRecord record2 = that.peek(); + return comparator.compare(record, record2); + } +} diff --git a/lib/edu/mit/broad/picard/sam/CreateSequenceDictionary.java b/lib/edu/mit/broad/picard/sam/CreateSequenceDictionary.java new file mode 100644 index 0000000000..01a71fd856 --- /dev/null +++ b/lib/edu/mit/broad/picard/sam/CreateSequenceDictionary.java @@ -0,0 +1,145 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ +package edu.mit.broad.picard.sam; + +import edu.mit.broad.sam.SAMSequenceRecord; +import edu.mit.broad.sam.SAMFileWriter; +import edu.mit.broad.sam.SAMFileWriterFactory; +import edu.mit.broad.sam.SAMFileHeader; +import edu.mit.broad.picard.reference.ReferenceSequenceFile; +import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory; +import edu.mit.broad.picard.reference.ReferenceSequence; +import edu.mit.broad.picard.cmdline.CommandLineProgram; +import edu.mit.broad.picard.cmdline.Option; +import edu.mit.broad.picard.cmdline.Usage; +import edu.mit.broad.picard.PicardException; + +import java.util.List; +import java.util.ArrayList; +import java.io.File; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.math.BigInteger; + +/** + * Create a SAM/BAM file from a fasta containing reference sequence. The output SAM file contains a header but no + * SAMRecords, and the header contains only sequence records. + */ +public class CreateSequenceDictionary extends CommandLineProgram { + + private static final String PROGRAM_VERSION = "1.0"; + + // The following attributes define the command-line arguments + @Usage(programVersion=PROGRAM_VERSION) + public String USAGE = + "Usage: " + getClass().getName() + " [options]\n\n" + + "Read fasta or fasta.gz containing reference sequences, and write as a SAM or BAM file with only sequence dictionary.\n"; + + @Option(doc = "Input reference fasta or fasta.gz") + public File REFERENCE; + + @Option(doc = "Output SAM or BAM file containing only the sequence dictionary") + public File OUTPUT; + + @Option(doc = "Put into AS field of sequence dictionary entry if supplied", optional = true) + public String GENOME_ASSEMBLY; + + @Option(doc = "Put into UIR field of sequence dictionary entry. 
If not supplied, input reference file is used", + optional = true) + public String URI; + + @Option(doc = "Put into SP field of sequence dictionary entry", optional = true) + public String SPECIES; + + private final MessageDigest md5; + + public CreateSequenceDictionary() { + try { + md5 = MessageDigest.getInstance("MD5"); + } catch (NoSuchAlgorithmException e) { + throw new PicardException("MD5 algorithm not found", e); + } + } + + public static void main(final String[] argv) { + System.exit(new CreateSequenceDictionary().instanceMain(argv)); + } + + /** + * Use reference filename to create URI to go into header if URI was not passed on cmd line. + */ + protected boolean customCommandLineValidation() { + if (URI == null) { + URI = "file:" + REFERENCE.getAbsolutePath(); + } + return true; + } + + /** + * Do the work after command line has been parsed. + * RuntimeException may be thrown by this method, and are reported appropriately. + * + * @return program exit status. + */ + protected int doWork() { + final List sequences = makeSequenceDictionary(REFERENCE); + final SAMFileHeader samHeader = new SAMFileHeader(); + samHeader.setSequences(sequences); + final SAMFileWriter samWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(samHeader, false, OUTPUT); + samWriter.close(); + return 0; + } + + + /** + * Read all the sequences from the given reference file, and convert into SAMSequenceRecords + * @param referenceFile fasta or fasta.gz + * @return SAMSequenceRecords containing info from the fasta, plus from cmd-line arguments. 
+ */ + List makeSequenceDictionary(final File referenceFile) { + final ReferenceSequenceFile refSeqFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile); + ReferenceSequence refSeq; + final List ret = new ArrayList(); + while ((refSeq = refSeqFile.nextSequence()) != null) { + ret.add(makeSequenceRecord(refSeq)); + } + return ret; + } + + /** + * Create one SAMSequenceRecord from a single fasta sequence + */ + private SAMSequenceRecord makeSequenceRecord(final ReferenceSequence refSeq) { + final SAMSequenceRecord ret = new SAMSequenceRecord(refSeq.getName()); + ret.setSequenceLength(refSeq.length()); + + // Compute MD5 of upcased bases + final byte[] bases = refSeq.getBases(); + for (int i = 0; i < bases.length; ++i) { + bases[i] = (byte) (Character.toUpperCase(bases[i]) & 0xff); + } + + ret.setAttribute(SAMSequenceRecord.MD5_TAG, md5Hash(bases)); + if (GENOME_ASSEMBLY != null) { + ret.setAttribute(SAMSequenceRecord.ASSEMBLY_TAG, GENOME_ASSEMBLY); + } + ret.setAttribute(SAMSequenceRecord.URI_TAG, URI); + if (SPECIES != null) { + ret.setAttribute(SAMSequenceRecord.SPECIES_TAG, SPECIES); + } + return ret; + } + + private String md5Hash(final byte[] bytes) { + md5.reset(); + md5.update(bytes); + return new BigInteger(1, md5.digest()).toString(16); + } +} diff --git a/lib/edu/mit/broad/picard/sam/DuplicationMetrics.java b/lib/edu/mit/broad/picard/sam/DuplicationMetrics.java new file mode 100644 index 0000000000..689e2b806f --- /dev/null +++ b/lib/edu/mit/broad/picard/sam/DuplicationMetrics.java @@ -0,0 +1,116 @@ +package edu.mit.broad.picard.sam; + +import edu.mit.broad.picard.metrics.MetricBase; +import edu.mit.broad.picard.util.Histogram; + +/** + * Metrics that are calculated during the process of marking duplicates + * within a stream of SAMRecords. + */ +public class DuplicationMetrics extends MetricBase { + /** The number of mapped reads examined which did not have a mapped mate pair. 
*/ + public long UNPAIRED_READS_EXAMINED; + + /** The number of mapped read pairs examined. */ + public long READ_PAIRS_EXAMINED; + + /** The total number of unmapped reads examined. */ + public long UNMAPPED_READS; + + /** The number of fragments that were marked as duplicates. */ + public long UNPAIRED_READ_DUPLICATES; + + /** The number of read pairs that were marked as duplicates. */ + public long READ_PAIR_DUPLICATES; + + /** The percentage of mapped sequence that is marked as duplicate. */ + public Double PERCENT_DUPLICATION; + + /** The estimated number of unique molecules in the library based on PE duplication. */ + public Long ESTIMATED_LIBRARY_SIZE; + + /** + * Fills in the ESTIMATED_LIBRARY_SIZE based on the paired read data examined where + * possible and the PERCENT_DUPLICATION. + */ + public void calculateDerivedMetrics() { + if (READ_PAIRS_EXAMINED > 0) { + // Following code "borrowed" from CRD codebase + long n = READ_PAIRS_EXAMINED; + long c = READ_PAIRS_EXAMINED - READ_PAIR_DUPLICATES; + + double m = 1.0, M = 100.0; + + if (c >= n || f(m*c, c, n) <= 0) { + throw new IllegalStateException("Invalid values for pairs and unique pairs: " + + n + ", " + c); + + } + + while( f(M*c, c, n) >= 0 ) M *= 10.0; + + for (int i=0; i<40; i++ ) { + double r = (m+M)/2.0; + double u = f( r * c, c, n ); + if ( u == 0 ) break; + else if ( u > 0 ) m = r; + else if ( u < 0 ) M = r; + } + + this.ESTIMATED_LIBRARY_SIZE = (long) (c * (m+M)/2.0); + } + + PERCENT_DUPLICATION = (UNPAIRED_READ_DUPLICATES + READ_PAIR_DUPLICATES *2) /(double) (UNPAIRED_READS_EXAMINED + READ_PAIRS_EXAMINED *2); + } + + /** Method that is used in the computation of estimated library size. */ + private double f(double x, double c, double n) { + return c/x - 1 + Math.exp(-n/x); + } + + /** + * Estimates the ROI (return on investment) that one would see if a library was sequenced to + * x higher coverage than the observed coverage. 
+ * + * @param estimatedLibrarySize the estimated number of molecules in the library + * @param x the multiple of sequencing to be simulated (i.e. how many X sequencing) + * @param pairs the number of pairs observed in the actual sequencing + * @param uniquePairs the number of unique pairs observed in the actual sequencing + * @return a number z <= x that estimates if you had pairs*x as your sequencing then you + * would observe uniquePairs*z unique pairs. + */ + private double estimateRoi(long estimatedLibrarySize, double x, long pairs, long uniquePairs) { + return estimatedLibrarySize * ( 1 - Math.exp(-(x*pairs)/estimatedLibrarySize) ) / uniquePairs; + } + + /** + * Calculates a histogram using the estimateRoi method to estimate the effective yield + * doing x sequencing for x=1..10. + */ + public Histogram calculateRoiHistogram() { + if (ESTIMATED_LIBRARY_SIZE == null) { + try { calculateDerivedMetrics(); } + catch (IllegalStateException ise) { return null; } + } + + long uniquePairs = READ_PAIRS_EXAMINED - READ_PAIR_DUPLICATES; + Histogram histo = new Histogram(); + + for (double x=1; x<=10; x+=1) { + histo.increment(x, estimateRoi(ESTIMATED_LIBRARY_SIZE, x, READ_PAIRS_EXAMINED, uniquePairs)); + } + + return histo; + } + + // Main method used for debugging the derived metrics +// public static void main(String[] args) { +// DuplicationMetrics m = new DuplicationMetrics(); +// m.PAIRS_EXAMINED = Integer.parseInt(args[0]); +// m.DUPLICATE_PAIRS = m.PAIRS_EXAMINED - Integer.parseInt(args[1]); +// m.calculateDerivedMetrics(); +// System.out.println("Percent Duplication: " + m.PERCENT_DUPLICATION); +// System.out.println("Est. 
Library Size : " + m.ESTIMATED_LIBRARY_SIZE); +// System.out.println(m.calculateRoiHistogram()); +// } +} diff --git a/lib/edu/mit/broad/picard/sam/InsertSizeMetrics.java b/lib/edu/mit/broad/picard/sam/InsertSizeMetrics.java new file mode 100644 index 0000000000..fdc9c47075 --- /dev/null +++ b/lib/edu/mit/broad/picard/sam/InsertSizeMetrics.java @@ -0,0 +1,38 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ + +package edu.mit.broad.picard.sam; + +import edu.mit.broad.picard.metrics.MetricBase; + +/** + * Metrics class for insert size statistics + * + * @author Doug Voet + */ +public class InsertSizeMetrics extends MetricBase { + public double MEDIAN_INSERT_SIZE; + public int MIN_INSERT_SIZE; + public int MAX_INSERT_SIZE; + public double MEAN_INSERT_SIZE; + public double STANDARD_DEVIATION; + public long READ_PAIRS; + + public int WIDTH_OF_10_PERCENT; + public int WIDTH_OF_20_PERCENT; + public int WIDTH_OF_30_PERCENT; + public int WIDTH_OF_40_PERCENT; + public int WIDTH_OF_50_PERCENT; + public int WIDTH_OF_60_PERCENT; + public int WIDTH_OF_70_PERCENT; + public int WIDTH_OF_80_PERCENT; + public int WIDTH_OF_90_PERCENT; + public int WIDTH_OF_99_PERCENT; +} diff --git a/lib/edu/mit/broad/picard/sam/MarkDuplicates.java b/lib/edu/mit/broad/picard/sam/MarkDuplicates.java new file mode 100644 index 0000000000..75321bf82d --- /dev/null +++ b/lib/edu/mit/broad/picard/sam/MarkDuplicates.java @@ -0,0 +1,461 @@ +package edu.mit.broad.picard.sam; + +import edu.mit.broad.picard.cmdline.CommandLineProgram; +import edu.mit.broad.picard.cmdline.Option; +import edu.mit.broad.picard.cmdline.Usage; +import 
edu.mit.broad.picard.PicardException; +import edu.mit.broad.picard.metrics.MetricsFile; +import edu.mit.broad.picard.util.Log; +import edu.mit.broad.sam.util.SortingCollection; +import edu.mit.broad.sam.*; + +import java.io.*; +import java.util.*; + +/** + * A better duplication marking algorithm that handles all cases including clipped + * and gapped alignments. + * + * @author Tim Fennell + */ +public class MarkDuplicates extends CommandLineProgram { + private static final Log log = Log.getInstance(MarkDuplicates.class); + + @Usage public final String USAGE = + "Examines aligned records in the supplied SAM or BAM file to locate duplicate molecules. " + + "All records are then written to the output file with the duplicate records flagged."; + @Option(shortName="I", doc="The input SAM or BAM file to analyze") public File INPUT; + @Option(shortName="O", doc="The output file to right marked records to") public File OUTPUT; + @Option(shortName="M", doc="File to write duplication metrics to") public File METRICS_FILE; + + private SortingCollection pairSort; + private SortingCollection fragSort; + private long[] duplicateIndexes = new long[1000000]; + private int nextIndex = 0; // The next offset into duplicateIndexes to use + + + /** Stock main method. */ + public static void main(String[] args) { + new MarkDuplicates().instanceMain(args); + } + + /** Little struct-like class to hold read pair (and fragment) end data. 
*/ + private static class ReadEnds { + public static final int SIZE_OF = (1*1) + (2*1) + (4*4) + (8*2) + 8; // last 8 == reference overhead + public static final byte F=0, R=1, FF=2, FR=3, RR=4, RF=5; + + short score = 0; + byte orientation; + int read1Sequence = -1; + int read1Coordinate = -1; + long read1IndexInFile = -1; + int read2Sequence = -1; + int read2Coordinate = -1; + long read2IndexInFile = -1; + + boolean isPaired() { return this.read2Sequence != -1; } + } + + /** Comparator for ReadEnds that orders by read1 position then pair orientation then read2 position. */ + private static class ReadEndsComparator implements Comparator { + public int compare(ReadEnds lhs, ReadEnds rhs) { + int retval = lhs.read1Sequence - rhs.read1Sequence; + if (retval == 0) retval = lhs.read1Coordinate - rhs.read1Coordinate; + if (retval == 0) retval = lhs.orientation - rhs.orientation; + if (retval == 0) retval = lhs.read2Sequence - rhs.read2Sequence; + if (retval == 0) retval = lhs.read2Coordinate - rhs.read2Coordinate; + if (retval == 0) retval = (int) (lhs.read1IndexInFile - rhs.read1IndexInFile); + if (retval == 0) retval = (int) (lhs.read2IndexInFile - rhs.read2IndexInFile); + + return retval; + } + } + + /** Coded for ReadEnds that just outputs the primitive fields and reads them back. 
*/ + private static class ReadEndsCodec implements SortingCollection.Codec { + private DataInputStream in; + private DataOutputStream out; + + public SortingCollection.Codec clone() { + return new ReadEndsCodec(); + } + + public void setOutputStream(OutputStream os) { this.out = new DataOutputStream(os); } + public void setInputStream(InputStream is) { this.in = new DataInputStream(is); } + + public void encode(ReadEnds read) { + try { + this.out.writeShort(read.score); + this.out.writeByte(read.orientation); + this.out.writeInt(read.read1Sequence); + this.out.writeInt(read.read1Coordinate); + this.out.writeLong(read.read1IndexInFile); + this.out.writeInt(read.read2Sequence); + + if (read.orientation > ReadEnds.R) { + this.out.writeInt(read.read2Coordinate); + this.out.writeLong(read.read2IndexInFile); + } + this.out.flush(); + } + catch (IOException ioe) { + throw new PicardException("Exception writing ReadEnds to file.", ioe); + } + } + + public ReadEnds decode() { + ReadEnds read = new ReadEnds(); + try { + // If the first read results in an EOF we've exhausted the stream + try { read.score = this.in.readShort(); } + catch (EOFException eof) { return null; } + + read.orientation = this.in.readByte(); + read.read1Sequence = this.in.readInt(); + read.read1Coordinate = this.in.readInt(); + read.read1IndexInFile = this.in.readLong(); + read.read2Sequence = this.in.readInt(); + + if (read.orientation > ReadEnds.R) { + read.read2Coordinate = this.in.readInt(); + read.read2IndexInFile = this.in.readLong(); + } + return read; + } + catch (IOException ioe) { + throw new PicardException("Exception writing ReadEnds to file.", ioe); + } + } + } + + /** + * Main work method. Reads the BAM file once and collects sorted information about + * the 5' ends of both ends of each read (or just one end in the case of pairs). + * Then makes a pass through those determining duplicates before re-reading the + * input file and writing it out with duplication flags set correctly. 
+ */ + protected int doWork() { + log.info("Reading input file and constructing read end information."); + buildSortedReadEndLists(); + generateDuplicateIndexes(); + log.info("Marking " + this.duplicateIndexes.length + " records as duplicates."); + DuplicationMetrics metrics = new DuplicationMetrics(); + SAMFileReader in = new SAMFileReader(INPUT); + SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(in.getFileHeader(), + true, + OUTPUT); + + // Now copy over the file while marking all the necessary indexes as duplicates + long recordInFileIndex = 0; + long nextDuplicateIndex = (this.duplicateIndexes.length == 0 ? -1 : this.duplicateIndexes[0]); + int arrayIndex = 1; + + for (SAMRecord rec : in) { + // First bring the simple metrics up to date + if (rec.getReadUnmappedFlag()) { + ++metrics.UNMAPPED_READS; + } + else if (!rec.getReadPairedFlag() || rec.getMateUnmappedFlag()) { + ++metrics.UNPAIRED_READS_EXAMINED; + } + else if (rec.getFirstOfPairFlag()){ + ++metrics.READ_PAIRS_EXAMINED; + } + + + if (recordInFileIndex++ == nextDuplicateIndex) { + rec.setDuplicateReadFlag(true); + + // Update the duplication metrics + if (!rec.getReadPairedFlag() || rec.getMateUnmappedFlag()) { + ++metrics.UNPAIRED_READ_DUPLICATES; + } + else if (rec.getFirstOfPairFlag()) { + ++metrics.READ_PAIR_DUPLICATES; + } + + // Now try and figure out the next duplicate index + try { + nextDuplicateIndex = this.duplicateIndexes[arrayIndex++]; + } + catch (ArrayIndexOutOfBoundsException e) { + // Only happens once we've marked all the duplicates + nextDuplicateIndex = -1; + arrayIndex = -1; + } + } + + out.addAlignment(rec); + } + + out.close(); + + + // Write out the metrics + metrics.calculateDerivedMetrics(); + MetricsFile file = getMetricsFile(); + file.addMetric(metrics); + file.setHistogram(metrics.calculateRoiHistogram()); + file.write(METRICS_FILE); + + return 0; + } + + /** + * Goes through all the records in a file and generates a set of ReadEnds objects that + * hold 
the necessary information (reference sequence, 5' read coordinate) to do + * duplication, caching to disk as necssary to sort them. + */ + private void buildSortedReadEndLists() { + // TODO: take into account clipping/padding? + int maxInMemory = (int) ((Runtime.getRuntime().maxMemory() * 0.25) / ReadEnds.SIZE_OF); + this.pairSort = SortingCollection.newInstance(ReadEnds.class, + new ReadEndsCodec(), + new ReadEndsComparator(), + maxInMemory); + + this.fragSort = SortingCollection.newInstance(ReadEnds.class, + new ReadEndsCodec(), + new ReadEndsComparator(), + maxInMemory); + + Map tmp = new HashMap(); + SAMFileReader sam = new SAMFileReader(INPUT); + SAMFileHeader header = sam.getFileHeader(); + long index = 0; + + for (SAMRecord rec : sam) { + if (rec.getReadUnmappedFlag()) { + continue; + } + + ReadEnds fragmentEnd = buildReadEnds(header, index, rec); + this.fragSort.add(fragmentEnd); + + if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) { + String key = rec.getAttribute(ReservedTagConstants.READ_GROUP_ID) + ":" + rec.getReadName(); + ReadEnds pairedEnds = tmp.remove(key); + + // See if we've already seen the first end or not + if (pairedEnds == null) { + pairedEnds = buildReadEnds(header, index, rec); + tmp.put(key, pairedEnds); + } + else { + int sequence = fragmentEnd.read1Sequence; + int coordinate = fragmentEnd.read1Coordinate; + + // If the second read is actually later, just add the second read data, else flip the reads + if (sequence > pairedEnds.read1Sequence || (sequence == pairedEnds.read1Sequence && coordinate >= pairedEnds.read1Coordinate)) { + pairedEnds.read2Sequence = sequence; + pairedEnds.read2Coordinate = coordinate; + pairedEnds.read2IndexInFile = index; + pairedEnds.orientation = getOrientationByte(pairedEnds.orientation == ReadEnds.R, rec.getReadNegativeStrandFlag()); + } + else { + pairedEnds.read2Sequence = pairedEnds.read1Sequence; + pairedEnds.read2Coordinate = pairedEnds.read1Coordinate; + pairedEnds.read2IndexInFile = 
pairedEnds.read1IndexInFile; + pairedEnds.read1Sequence = sequence; + pairedEnds.read1Coordinate = coordinate; + pairedEnds.read1IndexInFile = index; + pairedEnds.orientation = getOrientationByte(rec.getReadNegativeStrandFlag(), pairedEnds.orientation == ReadEnds.R); + } + + pairedEnds.score += getScore(rec); + this.pairSort.add(pairedEnds); + } + } + + ++index; + } + } + + /** Builds a read ends object that represents a single read. */ + private ReadEnds buildReadEnds(SAMFileHeader header, long index, SAMRecord rec) { + ReadEnds ends = new ReadEnds(); + ends.read1Sequence = rec.getReferenceIndex(header); + ends.read1Coordinate = rec.getReadNegativeStrandFlag() ? rec.getUnclippedEnd() : rec.getUnclippedStart(); + ends.orientation = rec.getReadNegativeStrandFlag() ? ReadEnds.R : ReadEnds.F; + ends.read1IndexInFile = index; + ends.score = getScore(rec); + + // Doing this lets the ends object know that it's part of a pair + if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) { + ends.read2Sequence = rec.getMateReferenceIndex(header); + } + + return ends; + } + + /** + * Returns a single byte that encodes the orientation of the two reads in a pair. + */ + private byte getOrientationByte(boolean read1NegativeStrand, boolean read2NegativeStrand) { + if (read1NegativeStrand) { + if (read2NegativeStrand) return ReadEnds.RR; + else return ReadEnds.RF; + } + else { + if (read2NegativeStrand) return ReadEnds.FR; + else return ReadEnds.FF; + } + } + + + + /** Calculates a score for the read which is the sum of scores over Q20. */ + private short getScore(SAMRecord rec) { + short score = 0; + for (byte b : rec.getBaseQualities()) { + if (b >= 15) score += b; + } + + return score; + } + + /** + * Goes through the accumulated ReadEnds objects and determines which of them are + * to be marked as duplicates. 
+ * + * @return an array with an ordered list of indexes into the source file + */ + private void generateDuplicateIndexes() { + ReadEnds firstOfNextChunk = null; + List nextChunk = new ArrayList(200); + + // First just do the pairs + log.info("Traversing read pair information and detecting duplicates."); + for (ReadEnds next : this.pairSort) { + if (firstOfNextChunk == null) { + firstOfNextChunk = next; + nextChunk.add(firstOfNextChunk); + } + else if (areComparableForDuplicates(firstOfNextChunk, next, true)) { + nextChunk.add(next); + } + else { + if (nextChunk.size() > 1) { + markDuplicatePairs(nextChunk); + } + + nextChunk.clear(); + nextChunk.add(next); + firstOfNextChunk = next; + } + } + markDuplicatePairs(nextChunk); + this.pairSort = null; + + // Now deal with the fragments + log.info("Traversing fragment information and detecting duplicates."); + boolean containsPairs = false; + boolean containsFrags = false; + + for (ReadEnds next : this.fragSort) { + if (firstOfNextChunk != null && areComparableForDuplicates(firstOfNextChunk, next, false)) { + nextChunk.add(next); + containsPairs = containsPairs || next.isPaired(); + containsFrags = containsFrags || !next.isPaired(); + } + else { + if (nextChunk.size() > 1 && containsFrags) { + markDuplicateFragments(nextChunk, containsPairs); + } + + nextChunk.clear(); + nextChunk.add(next); + firstOfNextChunk = next; + containsPairs = next.isPaired(); + containsFrags = !next.isPaired(); + } + } + markDuplicateFragments(nextChunk, containsPairs); + this.fragSort = null; + + // Now shrink down the array and sort it + log.info("Sorting list of duplicate records."); + long[] tmp = new long[this.nextIndex]; + System.arraycopy(this.duplicateIndexes, 0, tmp, 0, this.nextIndex); + this.duplicateIndexes = tmp; + Arrays.sort(this.duplicateIndexes); + } + + private boolean areComparableForDuplicates(final ReadEnds lhs, final ReadEnds rhs, final boolean compareRead2) { + boolean retval = lhs.read1Sequence == rhs.read1Sequence && 
+ lhs.read1Coordinate == rhs.read1Coordinate && + lhs.orientation == rhs.orientation; + + if (compareRead2) { + retval = lhs.read2Sequence == rhs.read2Sequence && + lhs.read2Coordinate == rhs.read2Coordinate; + } + + return retval; + } + + private void addIndexAsDuplicate(final long bamIndex) { + if (this.nextIndex > this.duplicateIndexes.length - 1) { + long[] tmp = new long[this.duplicateIndexes.length * 2]; + System.arraycopy(this.duplicateIndexes, 0, tmp, 0, this.nextIndex); + this.duplicateIndexes = tmp; + } + + this.duplicateIndexes[this.nextIndex++] = bamIndex; + } + + /** + * Takes a list of ReadEnds objects and removes from it all objects that should + * not be marked as duplicates. + * + * @param list + */ + private void markDuplicatePairs(final List list) { + short maxScore = 0; + ReadEnds best = null; + + for (final ReadEnds end : list) { + if (end.score > maxScore || best == null) { + maxScore = end.score; + best = end; + } + } + + for (final ReadEnds end : list) { + if (end != best) { + addIndexAsDuplicate(end.read1IndexInFile); + addIndexAsDuplicate(end.read2IndexInFile); + } + } + } + + /** + * Takes a list of ReadEnds objects and removes from it all objects that should + * not be marked as duplicates. 
+ * + * @param list + */ + private void markDuplicateFragments(final List list, final boolean containsPairs) { + if (containsPairs) { + for (final ReadEnds end : list) { + if (!end.isPaired()) addIndexAsDuplicate(end.read1IndexInFile); + } + } + else { + short maxScore = 0; + ReadEnds best = null; + for (final ReadEnds end : list) { + if (end.score > maxScore || best == null) { + maxScore = end.score; + best = end; + } + } + + for (final ReadEnds end : list) { + if (end != best) { + addIndexAsDuplicate(end.read1IndexInFile); + } + } + } + } +} diff --git a/lib/edu/mit/broad/picard/sam/MarkDuplicates2.java b/lib/edu/mit/broad/picard/sam/MarkDuplicates2.java new file mode 100644 index 0000000000..908f27f7d9 --- /dev/null +++ b/lib/edu/mit/broad/picard/sam/MarkDuplicates2.java @@ -0,0 +1,461 @@ +package edu.mit.broad.picard.sam; + +import edu.mit.broad.picard.cmdline.CommandLineProgram; +import edu.mit.broad.picard.cmdline.Option; +import edu.mit.broad.picard.cmdline.Usage; +import edu.mit.broad.picard.PicardException; +import edu.mit.broad.picard.metrics.MetricsFile; +import edu.mit.broad.picard.util.Log; +import edu.mit.broad.sam.util.SortingCollection; +import edu.mit.broad.sam.*; + +import java.io.*; +import java.util.*; + +/** + * A better duplication marking algorithm that handles all cases including clipped + * and gapped alignments. + * + * @author Tim Fennell + */ +public class MarkDuplicates2 extends CommandLineProgram { + private static final Log log = Log.getInstance(MarkDuplicates2.class); + + @Usage public final String USAGE = + "Examines aligned records in the supplied SAM or BAM file to locate duplicate molecules. 
" + + "All records are then written to the output file with the duplicate records flagged."; + @Option(shortName="I", doc="The input SAM or BAM file to analyze") public File INPUT; + @Option(shortName="O", doc="The output file to right marked records to") public File OUTPUT; + @Option(shortName="M", doc="File to write duplication metrics to") public File METRICS_FILE; + + private SortingCollection pairSort; + private SortingCollection fragSort; + private long[] duplicateIndexes = new long[1000000]; + private int nextIndex = 0; // The next offset into duplicateIndexes to use + + + /** Stock main method. */ + public static void main(String[] args) { + new MarkDuplicates2().instanceMain(args); + } + + /** Little struct-like class to hold read pair (and fragment) end data. */ + private static class ReadEnds { + public static final int SIZE_OF = (1*1) + (2*1) + (4*4) + (8*2) + 8; // last 8 == reference overhead + public static final byte F=0, R=1, FF=2, FR=3, RR=4, RF=5; + + short score = 0; + byte orientation; + int read1Sequence = -1; + int read1Coordinate = -1; + long read1IndexInFile = -1; + int read2Sequence = -1; + int read2Coordinate = -1; + long read2IndexInFile = -1; + + boolean isPaired() { return this.read2Sequence != -1; } + } + + /** Comparator for ReadEnds that orders by read1 position then pair orientation then read2 position. 
*/ + private static class ReadEndsComparator implements Comparator { + public int compare(ReadEnds lhs, ReadEnds rhs) { + int retval = lhs.read1Sequence - rhs.read1Sequence; + if (retval == 0) retval = lhs.read1Coordinate - rhs.read1Coordinate; + if (retval == 0) retval = lhs.orientation - rhs.orientation; + if (retval == 0) retval = lhs.read2Sequence - rhs.read2Sequence; + if (retval == 0) retval = lhs.read2Coordinate - rhs.read2Coordinate; + if (retval == 0) retval = (int) (lhs.read1IndexInFile - rhs.read1IndexInFile); + if (retval == 0) retval = (int) (lhs.read2IndexInFile - rhs.read2IndexInFile); + + return retval; + } + } + + /** Coded for ReadEnds that just outputs the primitive fields and reads them back. */ + private static class ReadEndsCodec implements SortingCollection.Codec { + private DataInputStream in; + private DataOutputStream out; + + public SortingCollection.Codec clone() { + return new ReadEndsCodec(); + } + + public void setOutputStream(OutputStream os) { this.out = new DataOutputStream(os); } + public void setInputStream(InputStream is) { this.in = new DataInputStream(is); } + + public void encode(ReadEnds read) { + try { + this.out.writeShort(read.score); + this.out.writeByte(read.orientation); + this.out.writeInt(read.read1Sequence); + this.out.writeInt(read.read1Coordinate); + this.out.writeLong(read.read1IndexInFile); + this.out.writeInt(read.read2Sequence); + + if (read.orientation > ReadEnds.R) { + this.out.writeInt(read.read2Coordinate); + this.out.writeLong(read.read2IndexInFile); + } + this.out.flush(); + } + catch (IOException ioe) { + throw new PicardException("Exception writing ReadEnds to file.", ioe); + } + } + + public ReadEnds decode() { + ReadEnds read = new ReadEnds(); + try { + // If the first read results in an EOF we've exhausted the stream + try { read.score = this.in.readShort(); } + catch (EOFException eof) { return null; } + + read.orientation = this.in.readByte(); + read.read1Sequence = this.in.readInt(); + 
read.read1Coordinate = this.in.readInt(); + read.read1IndexInFile = this.in.readLong(); + read.read2Sequence = this.in.readInt(); + + if (read.orientation > ReadEnds.R) { + read.read2Coordinate = this.in.readInt(); + read.read2IndexInFile = this.in.readLong(); + } + return read; + } + catch (IOException ioe) { + throw new PicardException("Exception writing ReadEnds to file.", ioe); + } + } + } + + /** + * Main work method. Reads the BAM file once and collects sorted information about + * the 5' ends of both ends of each read (or just one end in the case of pairs). + * Then makes a pass through those determining duplicates before re-reading the + * input file and writing it out with duplication flags set correctly. + */ + protected int doWork() { + log.info("Reading input file and constructing read end information."); + buildSortedReadEndLists(); + generateDuplicateIndexes(); + log.info("Marking " + this.duplicateIndexes.length + " records as duplicates."); + DuplicationMetrics metrics = new DuplicationMetrics(); + SAMFileReader in = new SAMFileReader(INPUT); + SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(in.getFileHeader(), + true, + OUTPUT); + + // Now copy over the file while marking all the necessary indexes as duplicates + long recordInFileIndex = 0; + long nextDuplicateIndex = (this.duplicateIndexes.length == 0 ? 
-1 : this.duplicateIndexes[0]); + int arrayIndex = 1; + + for (SAMRecord rec : in) { + // First bring the simple metrics up to date + if (rec.getReadUnmappedFlag()) { + ++metrics.UNMAPPED_READS; + } + else if (!rec.getReadPairedFlag() || rec.getMateUnmappedFlag()) { + ++metrics.UNPAIRED_READS_EXAMINED; + } + else if (rec.getFirstOfPairFlag()){ + ++metrics.READ_PAIRS_EXAMINED; + } + + + if (recordInFileIndex++ == nextDuplicateIndex) { + rec.setDuplicateReadFlag(true); + + // Update the duplication metrics + if (!rec.getReadPairedFlag() || rec.getMateUnmappedFlag()) { + ++metrics.UNPAIRED_READ_DUPLICATES; + } + else if (rec.getFirstOfPairFlag()) { + ++metrics.READ_PAIR_DUPLICATES; + } + + // Now try and figure out the next duplicate index + try { + nextDuplicateIndex = this.duplicateIndexes[arrayIndex++]; + } + catch (ArrayIndexOutOfBoundsException e) { + // Only happens once we've marked all the duplicates + nextDuplicateIndex = -1; + arrayIndex = -1; + } + } + + out.addAlignment(rec); + } + + out.close(); + + + // Write out the metrics + metrics.calculateDerivedMetrics(); + MetricsFile file = getMetricsFile(); + file.addMetric(metrics); + file.setHistogram(metrics.calculateRoiHistogram()); + file.write(METRICS_FILE); + + return 0; + } + + /** + * Goes through all the records in a file and generates a set of ReadEnds objects that + * hold the necessary information (reference sequence, 5' read coordinate) to do + * duplication, caching to disk as necssary to sort them. + */ + private void buildSortedReadEndLists() { + // TODO: take into account clipping/padding? 
+ int maxInMemory = (int) ((Runtime.getRuntime().maxMemory() * 0.25) / ReadEnds.SIZE_OF); + this.pairSort = SortingCollection.newInstance(ReadEnds.class, + new ReadEndsCodec(), + new ReadEndsComparator(), + maxInMemory); + + this.fragSort = SortingCollection.newInstance(ReadEnds.class, + new ReadEndsCodec(), + new ReadEndsComparator(), + maxInMemory); + + Map tmp = new HashMap(); + SAMFileReader sam = new SAMFileReader(INPUT); + SAMFileHeader header = sam.getFileHeader(); + long index = 0; + + for (SAMRecord rec : sam) { + if (rec.getReadUnmappedFlag()) { + continue; + } + + ReadEnds fragmentEnd = buildReadEnds(header, index, rec); + this.fragSort.add(fragmentEnd); + + if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) { + String key = rec.getAttribute(ReservedTagConstants.READ_GROUP_ID) + ":" + rec.getReadName(); + ReadEnds pairedEnds = tmp.remove(key); + + // See if we've already seen the first end or not + if (pairedEnds == null) { + pairedEnds = buildReadEnds(header, index, rec); + tmp.put(key, pairedEnds); + } + else { + int sequence = fragmentEnd.read1Sequence; + int coordinate = fragmentEnd.read1Coordinate; + + // If the second read is actually later, just add the second read data, else flip the reads + if (sequence > pairedEnds.read1Sequence || (sequence == pairedEnds.read1Sequence && coordinate >= pairedEnds.read1Coordinate)) { + pairedEnds.read2Sequence = sequence; + pairedEnds.read2Coordinate = coordinate; + pairedEnds.read2IndexInFile = index; + pairedEnds.orientation = getOrientationByte(pairedEnds.orientation == ReadEnds.R, rec.getReadNegativeStrandFlag()); + } + else { + pairedEnds.read2Sequence = pairedEnds.read1Sequence; + pairedEnds.read2Coordinate = pairedEnds.read1Coordinate; + pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile; + pairedEnds.read1Sequence = sequence; + pairedEnds.read1Coordinate = coordinate; + pairedEnds.read1IndexInFile = index; + pairedEnds.orientation = getOrientationByte(rec.getReadNegativeStrandFlag(), 
pairedEnds.orientation == ReadEnds.R); + } + + pairedEnds.score += getScore(rec); + this.pairSort.add(pairedEnds); + } + } + + ++index; + } + } + + /** Builds a read ends object that represents a single read. */ + private ReadEnds buildReadEnds(SAMFileHeader header, long index, SAMRecord rec) { + ReadEnds ends = new ReadEnds(); + ends.read1Sequence = rec.getReferenceIndex(header); + ends.read1Coordinate = rec.getReadNegativeStrandFlag() ? rec.getUnclippedEnd() : rec.getUnclippedStart(); + ends.orientation = rec.getReadNegativeStrandFlag() ? ReadEnds.R : ReadEnds.F; + ends.read1IndexInFile = index; + ends.score = getScore(rec); + + // Doing this lets the ends object know that it's part of a pair + if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) { + ends.read2Sequence = rec.getMateReferenceIndex(header); + } + + return ends; + } + + /** + * Returns a single byte that encodes the orientation of the two reads in a pair. + */ + private byte getOrientationByte(boolean read1NegativeStrand, boolean read2NegativeStrand) { + if (read1NegativeStrand) { + if (read2NegativeStrand) return ReadEnds.RR; + else return ReadEnds.RF; + } + else { + if (read2NegativeStrand) return ReadEnds.FR; + else return ReadEnds.FF; + } + } + + + + /** Calculates a score for the read which is the sum of scores over Q20. */ + private short getScore(SAMRecord rec) { + short score = 0; + for (byte b : rec.getBaseQualities()) { + if (b >= 15) score += b; + } + + return score; + } + + /** + * Goes through the accumulated ReadEnds objects and determines which of them are + * to be marked as duplicates. 
+ * + * @return an array with an ordered list of indexes into the source file + */ + private void generateDuplicateIndexes() { + ReadEnds firstOfNextChunk = null; + List nextChunk = new ArrayList(200); + + // First just do the pairs + log.info("Traversing read pair information and detecting duplicates."); + for (ReadEnds next : this.pairSort) { + if (firstOfNextChunk == null) { + firstOfNextChunk = next; + nextChunk.add(firstOfNextChunk); + } + else if (areComparableForDuplicates(firstOfNextChunk, next, true)) { + nextChunk.add(next); + } + else { + if (nextChunk.size() > 1) { + markDuplicatePairs(nextChunk); + } + + nextChunk.clear(); + nextChunk.add(next); + firstOfNextChunk = next; + } + } + markDuplicatePairs(nextChunk); + this.pairSort = null; + + // Now deal with the fragments + log.info("Traversing fragment information and detecting duplicates."); + boolean containsPairs = false; + boolean containsFrags = false; + + for (ReadEnds next : this.fragSort) { + if (firstOfNextChunk != null && areComparableForDuplicates(firstOfNextChunk, next, false)) { + nextChunk.add(next); + containsPairs = containsPairs || next.isPaired(); + containsFrags = containsFrags || !next.isPaired(); + } + else { + if (nextChunk.size() > 1 && containsFrags) { + markDuplicateFragments(nextChunk, containsPairs); + } + + nextChunk.clear(); + nextChunk.add(next); + firstOfNextChunk = next; + containsPairs = next.isPaired(); + containsFrags = !next.isPaired(); + } + } + markDuplicateFragments(nextChunk, containsPairs); + this.fragSort = null; + + // Now shrink down the array and sort it + log.info("Sorting list of duplicate records."); + long[] tmp = new long[this.nextIndex]; + System.arraycopy(this.duplicateIndexes, 0, tmp, 0, this.nextIndex); + this.duplicateIndexes = tmp; + Arrays.sort(this.duplicateIndexes); + } + + private boolean areComparableForDuplicates(final ReadEnds lhs, final ReadEnds rhs, final boolean compareRead2) { + boolean retval = lhs.read1Sequence == rhs.read1Sequence && 
+ lhs.read1Coordinate == rhs.read1Coordinate && + lhs.orientation == rhs.orientation; + + if (compareRead2) { + retval = lhs.read2Sequence == rhs.read2Sequence && + lhs.read2Coordinate == rhs.read2Coordinate; + } + + return retval; + } + + private void addIndexAsDuplicate(final long bamIndex) { + if (this.nextIndex > this.duplicateIndexes.length - 1) { + long[] tmp = new long[this.duplicateIndexes.length * 2]; + System.arraycopy(this.duplicateIndexes, 0, tmp, 0, this.nextIndex); + this.duplicateIndexes = tmp; + } + + this.duplicateIndexes[this.nextIndex++] = bamIndex; + } + + /** + * Takes a list of ReadEnds objects and removes from it all objects that should + * not be marked as duplicates. + * + * @param list + */ + private void markDuplicatePairs(final List list) { + short maxScore = 0; + ReadEnds best = null; + + for (final ReadEnds end : list) { + if (end.score > maxScore || best == null) { + maxScore = end.score; + best = end; + } + } + + for (final ReadEnds end : list) { + if (end != best) { + addIndexAsDuplicate(end.read1IndexInFile); + addIndexAsDuplicate(end.read2IndexInFile); + } + } + } + + /** + * Takes a list of ReadEnds objects and removes from it all objects that should + * not be marked as duplicates. 
+ * + * @param list + */ + private void markDuplicateFragments(final List list, final boolean containsPairs) { + if (containsPairs) { + for (final ReadEnds end : list) { + if (!end.isPaired()) addIndexAsDuplicate(end.read1IndexInFile); + } + } + else { + short maxScore = 0; + ReadEnds best = null; + for (final ReadEnds end : list) { + if (end.score > maxScore || best == null) { + maxScore = end.score; + best = end; + } + } + + for (final ReadEnds end : list) { + if (end != best) { + addIndexAsDuplicate(end.read1IndexInFile); + } + } + } + } +} diff --git a/lib/edu/mit/broad/picard/sam/MergeSamFiles.java b/lib/edu/mit/broad/picard/sam/MergeSamFiles.java new file mode 100644 index 0000000000..cae476956b --- /dev/null +++ b/lib/edu/mit/broad/picard/sam/MergeSamFiles.java @@ -0,0 +1,95 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. +* Neither the Broad Institute nor MIT can be responsible for its use, misuse, or +* functionality. 
+*/ +package edu.mit.broad.picard.sam; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +import edu.mit.broad.picard.cmdline.CommandLineProgram; +import edu.mit.broad.picard.cmdline.Option; +import edu.mit.broad.picard.cmdline.Usage; +import edu.mit.broad.picard.io.IoUtil; +import edu.mit.broad.sam.SAMFileHeader; +import static edu.mit.broad.sam.SAMFileHeader.SortOrder; +import edu.mit.broad.sam.SAMFileReader; +import edu.mit.broad.sam.SAMFileWriter; +import edu.mit.broad.sam.SAMFileWriterFactory; +import edu.mit.broad.sam.SAMRecord; + +/** + * Reads a SAM or BAM file and combines the output to one file + * + * @author Dave Tefft + */ +public class MergeSamFiles extends CommandLineProgram { + // Usage and parameters + @Usage(programVersion="1.0") + public String USAGE = "Merges multiple SAM/BAM files into one file.\n"; + + @Option(shortName="I", doc="SAM or BAM input file", minElements=1) + public List INPUT = new ArrayList(); + + @Option(shortName="O", doc="SAM or BAM file to write merged result to") + public File OUTPUT; + + @Option(shortName="SO", doc="Sort order of output file", optional=true) + public SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate; + + /** Required main method implementation. */ + public static void main(String[] argv) { + System.exit(new MergeSamFiles().instanceMain(argv)); + } + + /** Combines multiple SAM/BAM files into one. 
*/ + @Override + protected int doWork() { + boolean matchedSortOrders = true; + + // Open the files for reading and writing + List readers = new ArrayList(); + for (File inFile : INPUT) { + IoUtil.assertFileIsReadable(inFile); + SAMFileReader in = new SAMFileReader(inFile); + readers.add(in); + matchedSortOrders = matchedSortOrders && in.getFileHeader().getSortOrder() == SORT_ORDER; + } + + // If all the input sort orders match the output sort order then just merge them and + // write on the fly, otherwise setup to merge and sort before writing out the final file + IoUtil.assertFileIsWritable(OUTPUT); + MergingSamRecordIterator iterator = null; + SAMFileWriter out = null; + + if (matchedSortOrders || SORT_ORDER == SortOrder.unsorted) { + SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER); + iterator = new MergingSamRecordIterator(headerMerger); + out = new SAMFileWriterFactory().makeSAMOrBAMWriter(headerMerger.getMergedHeader(), true, OUTPUT); + } + else { + SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SortOrder.unsorted); + iterator = new MergingSamRecordIterator(headerMerger); + SAMFileHeader header = headerMerger.getMergedHeader(); + header.setSortOrder(SORT_ORDER); + out = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, OUTPUT); + } + + // Lastly loop through and write out the records + while (iterator.hasNext()) { + SAMRecord record = iterator.next(); + out.addAlignment(record); + } + + out.close(); + return 0; + } + +} \ No newline at end of file diff --git a/lib/edu/mit/broad/picard/sam/MergingSamRecordIterator.java b/lib/edu/mit/broad/picard/sam/MergingSamRecordIterator.java new file mode 100644 index 0000000000..5641512af1 --- /dev/null +++ b/lib/edu/mit/broad/picard/sam/MergingSamRecordIterator.java @@ -0,0 +1,136 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of 
Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. +* Neither the Broad Institute nor MIT can be responsible for its use, misuse, or +* functionality. +*/ +package edu.mit.broad.picard.sam; + +import edu.mit.broad.sam.*; +import static edu.mit.broad.sam.SAMFileHeader.SortOrder; +import edu.mit.broad.picard.PicardException; + +import java.util.*; +import java.lang.reflect.Constructor; + +/** + * Provides an iterator interface for merging multiple underlying iterators into a single + * iterable stream. The underlying iterators/files must all have the same sort order unless + * the requested output format is unsorted, in which case any combination is valid. + */ +public class MergingSamRecordIterator implements Iterator { + private final PriorityQueue pq; + private final SamFileHeaderMerger samHeaderMerger; + private final SAMFileHeader.SortOrder sortOrder; + + /** + * Constructs a new merging iterator with the same set of readers and sort order as + * provided by the header merger parameter. + */ + public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger) { + this.samHeaderMerger = headerMerger; + this.sortOrder = headerMerger.getMergedHeader().getSortOrder(); + final SAMRecordComparator comparator = getComparator(); + + final Collection readers = headerMerger.getReaders(); + this.pq = new PriorityQueue(readers.size()); + + for (final SAMFileReader reader : readers) { + if (this.sortOrder != SortOrder.unsorted && reader.getFileHeader().getSortOrder() != this.sortOrder){ + throw new PicardException("Files are not compatible with sort order"); + } + + final ComparableSamRecordIterator iterator = new ComparableSamRecordIterator(reader, comparator); + addIfNotEmpty(iterator); + } + } + + /** Returns true if any of the underlying iterators has more records, otherwise false. 
*/ + public boolean hasNext() { + return !this.pq.isEmpty(); + } + + /** Returns the next record from the top most iterator during merging. */ + public SAMRecord next() { + final ComparableSamRecordIterator iterator = this.pq.poll(); + final SAMRecord record = iterator.next(); + addIfNotEmpty(iterator); + + if (this.samHeaderMerger.hasGroupIdDuplicates()) { + final String id = (String) record.getAttribute(ReservedTagConstants.READ_GROUP_ID); + final String newId = this.samHeaderMerger.getReadGroupId(iterator.getReader(), id); + record.setAttribute(ReservedTagConstants.READ_GROUP_ID, newId); + } + final String oldProgramGroupId = (String) record.getAttribute(SAMTag.PG.toString()); + if (oldProgramGroupId != null) { + final String newProgramGroupId = this.samHeaderMerger.getProgramGroupId(iterator.getReader(), oldProgramGroupId); + record.setAttribute(SAMTag.PG.toString(), newProgramGroupId); + } + + return record; + } + + /** + * Adds iterator to priority queue. If the iterator has more records it is added + * otherwise it is closed and not added. + */ + private void addIfNotEmpty(final ComparableSamRecordIterator iterator) { + if (iterator.hasNext()) { + pq.offer(iterator); + } + else { + iterator.close(); + } + } + + /** Unsupported operation. */ + public void remove() { + throw new UnsupportedOperationException("MergingSAMRecorderIterator.remove()"); + } + + /** + * Get the right comparator for a given sort order (coordinate, alphabetic). In the + * case of "unsorted" it will return a comparator that gives an arbitrary but reflexive + * ordering. 
+ */ + private SAMRecordComparator getComparator() { + // For unsorted build a fake comparator that compares based on object ID + if (this.sortOrder == SAMFileHeader.SortOrder.unsorted) { + return new SAMRecordComparator() { + public int fileOrderCompare(final SAMRecord lhs, final SAMRecord rhs) { + return System.identityHashCode(lhs) - System.identityHashCode(rhs); + } + + public int compare(final SAMRecord lhs, final SAMRecord rhs) { + return fileOrderCompare(lhs, rhs); + } + }; + } + + // Otherwise try and figure out what kind of comparator to return and build it + final Class type = this.sortOrder.getComparator(); + + try { + final Constructor ctor = type.getConstructor(SAMFileHeader.class); + return ctor.newInstance(this.samHeaderMerger.getMergedHeader()); + } + catch (Exception e) { + try { + final Constructor ctor = type.getConstructor(); + return ctor.newInstance(); + } + catch (Exception e2) { + throw new PicardException("Could not instantiate a comparator for sort order: " + this.sortOrder, e2); + } + } + } + + /** Returns the merged header that the merging iterator is working from. */ + public SAMFileHeader getMergedHeader() { + return this.samHeaderMerger.getMergedHeader(); + } +} diff --git a/lib/edu/mit/broad/picard/sam/ReservedTagConstants.java b/lib/edu/mit/broad/picard/sam/ReservedTagConstants.java new file mode 100644 index 0000000000..2f4d3ef91d --- /dev/null +++ b/lib/edu/mit/broad/picard/sam/ReservedTagConstants.java @@ -0,0 +1,18 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2008 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ +package edu.mit.broad.picard.sam; + +/** + * Constants for tags used in our SAM/BAM files + */ +public class ReservedTagConstants { + public static final String READ_GROUP_ID = "RG"; // Specified in the SAM spec doc + public static final String XN = "XN"; // Present and set to 1 if a read is a noise read +} diff --git a/lib/edu/mit/broad/picard/sam/SamFileHeaderMerger.java b/lib/edu/mit/broad/picard/sam/SamFileHeaderMerger.java new file mode 100644 index 0000000000..6c69678ad6 --- /dev/null +++ b/lib/edu/mit/broad/picard/sam/SamFileHeaderMerger.java @@ -0,0 +1,286 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. +* Neither the Broad Institute nor MIT can be responsible for its use, misuse, or +* functionality. +*/ +package edu.mit.broad.picard.sam; + +import edu.mit.broad.sam.*; +import edu.mit.broad.picard.PicardException; + +import java.util.*; + +/** + * Merges SAMFileHeaders that have the same sequences into a single merged header + * object while providing read group translation for cases where read groups + * clash across input headers. 
+ * + * @author Dave Tefft + */ +public class SamFileHeaderMerger { + //Super Header to construct + private final SAMFileHeader mergedHeader; + private final Collection readers; + + //Translation of old group ids to new group ids + private final Map> samGroupIdTranslation = + new HashMap>(); + + //the groups from different files use the same group ids + private boolean hasGroupIdDuplicates = false; + + //Translation of old program group ids to new program group ids + private final Map> samProgramGroupIdTranslation = + new HashMap>(); + + //Letters to construct new ids from a counter + private static final String ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + + + /** + * Create SAMFileHeader with additional information + * + * @param readers same file readers to combine + * @param sortOrder sort order new header should have + */ + public SamFileHeaderMerger(final Collection readers, final SAMFileHeader.SortOrder sortOrder) { + this.readers = readers; + this.mergedHeader = new SAMFileHeader(); + + // Set sequences first because if it throws exception there is no need to continue + final List sequences = getSAMSequences(readers); + this.mergedHeader.setSequences(sequences); + + // Set program that creates input alignments + for (final SAMProgramRecord program : mergeSAMProgramRecordLists(readers)) { + this.mergedHeader.addProgramRecord(program); + } + + // Set read groups for merged header + final List readGroups = getReadGroups(readers); + this.mergedHeader.setReadGroups(readGroups); + this.mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.none); + + this.mergedHeader.setSortOrder(sortOrder); + } + + /** + * Checks to see if there are clashes where different readers are using the same read + * group IDs. If they are then a new set of unique read group IDs are generated (across all + * read groups) otherwise the original read group headers are returned. 
+ * + * @param readers readers to combine + * @return new list of readgroups constructed from all the readers + */ + private List getReadGroups(final Collection readers) { + // Read groups as read from the readers + final List orginalReadGroups = new ArrayList(); + + // Read group with new ids that don't confict + final List modifiedReadGroups = new ArrayList(); + + //set to see if there are duplicate group ids and whether or not we need to modify them + final Set groupIdsSeenBefore = new HashSet(); + + int x = 0; + this.hasGroupIdDuplicates = false; + + for (final SAMFileReader reader : readers) { + final SAMFileHeader header = reader.getFileHeader(); + final Map idTranslation = new HashMap(); + + // Iterate over read groups to find conflicting ids + for (final SAMReadGroupRecord readGroup : header.getReadGroups()) { + final String groupId = readGroup.getReadGroupId(); + final String newGroupId = createNewId(x++); + + // Check to see if same group id is used in two different readers + if (groupIdsSeenBefore.contains(groupId)) { + hasGroupIdDuplicates = true; + } + groupIdsSeenBefore.add(groupId); + + // Creates a new read group with the new id and copies all it's attributes + final SAMReadGroupRecord groupRecordWithNewId = copyReadGroup(readGroup, newGroupId); + + orginalReadGroups.add(readGroup); + modifiedReadGroups.add(groupRecordWithNewId); + + idTranslation.put(groupId, newGroupId); + } + + // Add id tranlation for updating SamRecords with new ids if neccessary + this.samGroupIdTranslation.put(reader, idTranslation); + } + + // return approriate readgroups whether or not the new ids have to be used + if (this.hasGroupIdDuplicates) { + return modifiedReadGroups; + } + else { + return orginalReadGroups; + } + } + + /** + * Get the sequences off the SAMFileReader header. Throws runtime exception if the sequence + * are different from one another + * + * @param readers readers to pull sequences from + * @return sequences from files. 
Each file should have the same sequence + */ + private List getSAMSequences(final Collection readers) { + List sequences = null; + for (final SAMFileReader reader : readers) { + final SAMFileHeader header = reader.getFileHeader(); + + if (sequences == null) { + sequences = header.getSequences(); + } + else { + final List currentSequences = header.getSequences(); + if (!sequenceListsEqual(sequences, currentSequences)) { + throw new PicardException("Files are not compatible with each other. They can not be combined"); + } + } + } + return sequences; + } + + /** + * Checks the equality of two lists of sequence records using the isSameSequence + * method instead of the equals method which is a more strict identity check. + * @param s1 a list of sequence headers + * @param s2 a second list of sequence headers + * @return true if the two lists match otherwise false + */ + private boolean sequenceListsEqual(final List s1, final List s2) { + if (s1.size() != s2.size()) { + return false; + } + for (int i = 0; i < s1.size(); ++i) { + if (!s1.get(i).isSameSequence(s2.get(i))) { + return false; + } + } + return true; + } + + /** + * Find the alignment program that produced the readers. 
If there are more than one + * generate a new program represents that + * + * @param readers SAMFileReaders to pull program information from + * @return SAMProgram record that represents all the readers + */ + // TODO: this needs to be fixed up to support multiple program records (PIC-15) + private List mergeSAMProgramRecordLists(final Collection readers) { + final boolean programMixed = false; + final List ret = new ArrayList(); + int nextProgramGroupId = 0; + for (final SAMFileReader reader : readers) { + final SAMFileHeader header = reader.getFileHeader(); + final Map idTranslation = new HashMap(); + for (final SAMProgramRecord oldProgramRecord : header.getProgramRecords()) { + boolean foundMatch = false; + for (final SAMProgramRecord newProgramRecord : ret) { + if (newProgramRecord.equivalent(oldProgramRecord)) { + idTranslation.put(oldProgramRecord.getProgramGroupId(), newProgramRecord.getProgramGroupId()); + foundMatch = true; + break; + } + } + if (!foundMatch) { + final SAMProgramRecord newProgramRecord = new SAMProgramRecord(Integer.toString(nextProgramGroupId++)); + copyProgramGroupAttributes(oldProgramRecord, newProgramRecord); + ret.add(newProgramRecord); + idTranslation.put(oldProgramRecord.getProgramGroupId(), newProgramRecord.getProgramGroupId()); + } + } + samProgramGroupIdTranslation.put(reader, idTranslation); + } + return ret; + } + + private void copyProgramGroupAttributes(final SAMProgramRecord oldProgramRecord, final SAMProgramRecord newProgramRecord) { + for (final Map.Entry entry : oldProgramRecord.getAttributes()) { + newProgramRecord.setAttribute(entry.getKey(), entry.getValue()); + } + } + + + /** + * Copies all the attribute of a readgroup to a new readgroup with a new id + * + * @param readGroup the group to be copied + * @param modifiedId the id for the new readgroup + * @return new read group + */ + private SAMReadGroupRecord copyReadGroup(final SAMReadGroupRecord readGroup, final String modifiedId) { + final SAMReadGroupRecord retval 
= new SAMReadGroupRecord(modifiedId); + retval.setLibrary(readGroup.getLibrary()); + retval.setSample(readGroup.getSample()); + + for (final Map.Entry attr : readGroup.getAttributes()) { + retval.setAttribute(attr.getKey(), attr.getValue()); + } + + return retval; + } + + + /** + * Creates a base 26 representation of an int + * + * @param n int to covert to letter representation + * @return string rep for an int eg 0 = A 27 = AB + */ + protected static String createNewId(int n) { + final int base = ALPHABET.length(); + + String s = ""; + while (true) { + final int r = n % base; + s = ALPHABET.charAt(r) + s; + n = n / base; + if (n == 0) { + return s; + } + n -= 1; + } + } + + /** Returns the read group id that should be used for the input read and RG id. */ + public String getReadGroupId(final SAMFileReader reader, final String originalReadGroupId) { + return this.samGroupIdTranslation.get(reader).get(originalReadGroupId); + } + + /** + * @param reader one of the input files + * @param originalProgramGroupId a program group ID from the above input file + * @return new ID from the merged list of program groups in the output file + */ + public String getProgramGroupId(final SAMFileReader reader, final String originalProgramGroupId) { + return this.samProgramGroupIdTranslation.get(reader).get(originalProgramGroupId); + } + + /** Returns true if there are read group duplicates within the merged headers. */ + public boolean hasGroupIdDuplicates() { + return this.hasGroupIdDuplicates; + } + + /** Returns the merged header that should be written to any output merged file. */ + public SAMFileHeader getMergedHeader() { + return this.mergedHeader; + } + + /** Returns the collection of readers that this header merger is working with. 
*/ + public Collection getReaders() { + return this.readers; + } +} diff --git a/lib/edu/mit/broad/picard/sam/SamLocusIterator.java b/lib/edu/mit/broad/picard/sam/SamLocusIterator.java new file mode 100644 index 0000000000..f7a52ae909 --- /dev/null +++ b/lib/edu/mit/broad/picard/sam/SamLocusIterator.java @@ -0,0 +1,280 @@ +package edu.mit.broad.picard.sam; + +import edu.mit.broad.sam.util.CloseableIterator; +import edu.mit.broad.sam.SAMRecord; +import edu.mit.broad.sam.NotPrimarySkippingIterator; +import edu.mit.broad.picard.directed.GenomeMask; + +import java.util.*; + +/** + * Iterator that traverses a SAM File, accumulating information on a per-locus basis + */ +public class SamLocusIterator implements Iterable, CloseableIterator { + + /** + * The unit of iteration. Holds the locus, plus the base, quality and strand for each read at that locus. + */ + public static class LocusInfo { + protected final int sequenceIndex; + protected final int position; + protected final List bases = new ArrayList(100); + protected final List qualities = new ArrayList(100); + protected final List negativeStrandFlags = new ArrayList(100); + + LocusInfo(final int sequenceIndex, final int position) { + this.sequenceIndex = sequenceIndex; + this.position = position; + } + + /** + * Accumulate info for one read at the locus. 
+ */ + public void add(final Byte readBase, final Byte baseQuality, final boolean strand) { + bases.add(readBase); + qualities.add(baseQuality); + negativeStrandFlags.add(strand); + } + + public int getSequenceIndex() { return sequenceIndex; } + public int getPosition() { return position; } + public List getBases() { return bases; } + public List getQualities() { return qualities; } + public List getNegativeStrandFlags() { return negativeStrandFlags; } + + public String getBasesAsString() { return bytesToString(bases); } + + private static String bytesToString(final List data) { + if (data == null || data.size() == 0) { + return ""; + } + + final char[] chars = new char[data.size()]; + for (int i = 0; i < data.size(); i++) { + chars[i] = (char) (data.get(i) & 0xFF); + } + return new String(chars); + } + } + + + + + private final CloseableIterator underlyingIterator; + private final NotPrimarySkippingIterator it; + private final LinkedList complete = new LinkedList(); + private final LinkedList accumulator = new LinkedList(); + + private boolean includeNonPfReads = false; + private boolean includeDuplicates = false; + private int qualityScoreCutoff = -Integer.MAX_VALUE; + + private GenomeMask mask; + private int lastContig = 0; + private int lastPosition = 0; + + private boolean finishedAlignedReads = false; + + + // this should probably take a SAM + public SamLocusIterator(final CloseableIterator samIterator) { + this.underlyingIterator = samIterator; + this.it = new NotPrimarySkippingIterator(samIterator); + } + + public Iterator iterator() { + return this; + } + + public void close() { + this.underlyingIterator.close(); + } + + private boolean samHasMore() { + return !finishedAlignedReads && it.hasCurrent(); + } + public boolean hasNext() { + return ((complete.size() > 0) || (accumulator.size() > 0) || (samHasMore()) || hasRemainingMaskBases()); + } + + private boolean hasRemainingMaskBases() { + if (mask == null) return false; + + // if there are more contigs in 
the mask, by definition some of them must have + // marked bases otherwise if we're in the last contig, but we're not at the last marked position, + // there is also more in the mask + return (lastContig <= mask.getMaxContig() || + (lastContig == mask.getMaxContig() && lastPosition <= mask.get(lastContig).nextSetBit(lastPosition+1))); + } + + public LocusInfo next() { + + // if we don't have any completed entries to return, try and make some! + while(complete.size() == 0 && samHasMore()) { + final SAMRecord rec = it.getCurrent(); + final String cigar = rec.getCigarString(); + + // as soon as we hit our first non-aligned read, we can stop! + if (cigar.equals("*")) { + this.finishedAlignedReads = true; + continue; + } + + // skip dupe reads, if so requested + if (!isIncludeDuplicates() && rec.getDuplicateReadFlag()) { it.advance(); continue; } + + // skip non-PF reads, if so requested + if (!isIncludeNonPfReads() && rec.getReadFailsVendorQualityCheckFlag()) { it.advance(); continue; } + + // when we switch contigs, emit everything in the accumulator + if (accumulator.size() > 0 && accumulator.getFirst().sequenceIndex != rec.getReferenceIndex()) { + while (accumulator.size() > 0) { + popLocus(); + } + } + + // pop off things we're not going to accumulate more coverage at the locus in question + while(accumulator.size() > 0 && accumulator.getFirst().position < rec.getAlignmentStart()) { + popLocus(); + } + + // check that it's a non-gapped alignment for now! + // TODO: handle gapped and clipped alignments + if (!cigar.matches("[0-9]+M")) { + System.out.println("Cannot deal with clipped or gapped alignments. 
CIGAR="+cigar); + System.exit(1); + } + + // at this point, either the list is empty or the head should + // be the same position as the first base of the read + + // interpret the CIGAR string and add the base info + for(int j=0; j < rec.getReadBases().length; j++) { + // if the position is empty, initialize it + if (j > accumulator.size() - 1) { + accumulator.add(new LocusInfo(rec.getReferenceIndex(), rec.getAlignmentStart() + j)); + } + + // if the quality score cutoff is met, accumulate the base info + if (rec.getBaseQualities()[j] >= getQualityScoreCutoff()) { + accumulator.get(j).add(rec.getReadBases()[j], rec.getBaseQualities()[j], rec.getReadNegativeStrandFlag()); + } + } + + + it.advance(); + } + + // if we have nothing to return to the user, and we're at the end of the SAM iterator, + // push everything into the complete queue + if (complete.size() == 0 && !samHasMore()) { + while(accumulator.size() > 0) { + popLocus(); + } + } + + // if there are completed entries, return those + if (complete.size() > 0) { + return complete.removeFirst(); + } else { + + // In this case... we're past the last read from SAM so see if we can + // fill out any more (zero coverage) entries from the mask + LocusInfo zeroResult = null; + while (zeroResult == null && lastContig <= mask.getMaxContig()) { + final int nextbit = mask.get(lastContig).nextSetBit(lastPosition+1); + + // try the next contig + if (nextbit == -1) { + lastContig++; + lastPosition = 0; + } else { + lastPosition = nextbit; + zeroResult = new LocusInfo(lastContig, lastPosition); + } + } + + return zeroResult; + } + } + + /** + * Pop the first entry from the LocusInfo accumulator into the complete queue. In addition, + * check the GenomeMask and if there are intervening mask positions between the last popped base and the one + * about to be popped, put those on the complete queue as well. 
+ */ + private void popLocus() { + final LocusInfo li = accumulator.removeFirst(); + + // fill in any gaps based on our genome mask + final int liContig = li.getSequenceIndex(); + + // if we're not on the same contig, fill in the rest of the bits for the previous contig first... + if (lastContig < liContig) { + while (lastContig < liContig) { + int nextbit = 0; + + if (mask != null && mask.get(lastContig) != null) { + while (nextbit != -1) { + nextbit = mask.get(lastContig).nextSetBit(lastPosition + 1); + if (nextbit > -1) { + complete.addLast(new LocusInfo(lastContig, nextbit)); + lastPosition = nextbit; + } + } + } + lastPosition=0; + lastContig++; + } + } + + // now that we're on the same contig, fill in any unfilled positions + // if we have some bits in the mask to fill in... + if (mask != null && mask.get(lastContig) != null && lastPosition + 1 < li.getPosition()) { + while (lastPosition + 1 < li.getPosition()) { + + final int nextbit = mask.get(lastContig).nextSetBit(lastPosition + 1); + + // if there are no more mask bits, or the next mask bit is + // at or after the current data, just continue on + if (nextbit == -1 || nextbit >= li.getPosition()) { break; } + + // otherwise, pop on the desired empty locus info + complete.addLast(new LocusInfo(lastContig, nextbit)); + lastPosition = nextbit; + } + } + + // only add to the complete queue if it's in the mask (or we have no mask!) + if (mask == null || mask.get(li.getSequenceIndex(), li.getPosition())) { + complete.addLast(li); + } + + lastContig = liContig; + lastPosition = li.getPosition(); + + + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } + + // -------------------------------------------------------------------------------------------- + // Helper methods below this point... 
+ // -------------------------------------------------------------------------------------------- + + public void setGenomeMask(final GenomeMask mask) { this.mask = mask; } + public GenomeMask getGenomeMask() { return this.mask; } + + public boolean isIncludeNonPfReads() { return includeNonPfReads; } + public void setIncludeNonPfReads(final boolean includeNonPfReads) { this.includeNonPfReads = includeNonPfReads; } + + public boolean isIncludeDuplicates() { return includeDuplicates; } + public void setIncludeDuplicates(final boolean includeDuplicates) { this.includeDuplicates = includeDuplicates; } + + public int getQualityScoreCutoff() { return qualityScoreCutoff; } + public void setQualityScoreCutoff(final int qualityScoreCutoff) { this.qualityScoreCutoff = qualityScoreCutoff; } + + +} diff --git a/lib/edu/mit/broad/picard/util/AbstractTextFileParser.java b/lib/edu/mit/broad/picard/util/AbstractTextFileParser.java new file mode 100644 index 0000000000..74dd1e12ae --- /dev/null +++ b/lib/edu/mit/broad/picard/util/AbstractTextFileParser.java @@ -0,0 +1,203 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.picard.util; + +import edu.mit.broad.picard.PicardException; +import edu.mit.broad.sam.util.CloseableIterator; + +import java.util.Iterator; +import java.util.NoSuchElementException; +import java.io.Closeable; + +/** + * Class for parsing text files where each line consists of fields separated by whitespace. + * Code is abstracted into this class so that we can optimize its performance over time. 
+ * + * This class assumes that every line will have the same number of whitespace-separated "words" + * and that lines that start with "#" are comments and should be ignored. + * + * Classes that extend this parser can do so simply by implementing their own constructors and the + * readNextLine(), close(), and getFileName() methods. + * + * @author Kathleen Tibbetts + */ +public abstract class AbstractTextFileParser implements Iterable, CloseableIterator { + + private boolean treatGroupedDelimitersAsOne = true; // Whether multiple delimiters in succession should be treated as one + private byte nextLine[] = null; + private int wordCount = 0; /* The number of delimiter-separated "words" per line of the file. + We can save a little caclulation, or handle files with varying numbers of + words per line, by specifying this if known in advance */ + private boolean iterating = false; + + /** + * Closes this stream and releases any system resources associated with it. + */ + public abstract void close(); + + /** + * @return the next line of text from the underlying stream(s) or null if there is no next line + */ + protected abstract byte[] readNextLine(); + + /** + * @return the name(s) of the file(s) being parsed, or null if no name is available + */ + protected abstract String getFileName(); + + /** + * @return an iterator over a set of elements of type String[] + */ + public Iterator iterator() { + if (iterating) { + throw new IllegalStateException("iterator() method can only be called once, before the" + + "first call to hasNext()"); + } + nextLine = readNextLine(); + iterating = true; + return this; + } + + /** + * Returns true if the iteration has more elements. + * + * @return true if the iteration has more elements. Otherwise returns false. 
+ */ + public boolean hasNext() { + // If this is the start of iteration, queue up the first item + if(!iterating) { + nextLine = readNextLine(); + iterating = true; + } + return nextLine != null; + } + + /** + * Returns the next element in the iteration. + * + * @return the next tlement in the iteration + * @throws java.util.NoSuchElementException + */ + public String[] next() { + + if (!hasNext()) { + throw new NoSuchElementException("Iteration from text file(s) " + + getFileName() + " has no more elements."); + } + + String[] result = parseLine(nextLine); + do { + nextLine = readNextLine(); + } + while (nextLine != null && isComment(nextLine)); + return result; + } + + /** + * This method represents the most efficient way (so far) to parse a line of whitespace-delimited text + * + * @param line the line to parse + * @return an array of all the "words" + */ + private String[] parseLine(byte line[]) { + + if (getWordCount() == 0) { + calculateWordCount(line); + } + String parts[] = new String[getWordCount()]; + boolean delimiter = true; + int index=0; + int start = 0; + + try + { + for (int i = 0; i < line.length; i++) { + if (isDelimiter(line[i])) { + if (!delimiter) { + parts[index++] = new String(line,start,i-start); + } + else if(!isTreatGroupedDelimitersAsOne()) { + parts[index++] = null; + } + delimiter=true; + } + else { + if (delimiter) start = i; + delimiter = false; + } + } + if (!delimiter) { + parts[index] = new String(line,start,line.length-start); + } + } + catch (ArrayIndexOutOfBoundsException e) { + throw new PicardException("Unexpected number of elements found when parsing file " + + this.getFileName() + ": " + index + ". 
Expected a maximum of " + + this.getWordCount() + " elements per line."); + } + return parts; + } + + /** + * Calculates the number of delimiter-separated "words" in a line and sets the value of wordCount + * + * @param line representative line from the file + */ + protected void calculateWordCount(byte line[]) { + int words = 0; + boolean delimiter = true; + for (byte b : line) { + if (isDelimiter(b)) { + if (delimiter && !isTreatGroupedDelimitersAsOne()) words++; + delimiter = true; + } else { + if (delimiter) words++; + delimiter = false; + } + } + setWordCount(words); + } + + /** + * Required method for Iterator API. + * + * @throws UnsupportedOperationException + */ + public void remove() { + throw new UnsupportedOperationException("Remove() not supported."); + } + + /** + * Determines whether a given line is a comment + * + * @param line the line to evaluate + * @return true if the line is a comment (and should be ignored) otherwise false + */ + protected boolean isComment(byte line[]) { + return line[0] == '#'; + } + + /** + * Determines whether a given character is a delimiter + * + * @param b the character to evaluate + * @return true if b is a delimiter; otherwise false + */ + protected boolean isDelimiter(byte b) { + return b == ' ' || b == '\t'; + } + + protected int getWordCount() { return wordCount; } + protected void setWordCount(int wordCount) { this.wordCount = wordCount; } + protected boolean isTreatGroupedDelimitersAsOne() { return treatGroupedDelimitersAsOne; } + protected void setTreatGroupedDelimitersAsOne(boolean treatGroupedDelimitersAsOne) { + this.treatGroupedDelimitersAsOne = treatGroupedDelimitersAsOne; + } +} diff --git a/lib/edu/mit/broad/picard/util/ArrayUtil.java b/lib/edu/mit/broad/picard/util/ArrayUtil.java new file mode 100644 index 0000000000..7ca7e38836 --- /dev/null +++ b/lib/edu/mit/broad/picard/util/ArrayUtil.java @@ -0,0 +1,33 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its 
documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.picard.util; + +public class ArrayUtil { + + /** + * Reverse the elements of the given array in place + */ + public static void reverseArray(T[] array) { + for (int left=0, right=array.length-1; left files = new ArrayList(); + String currentFileName = null; + + /** + * Constructor. Opens up a buffered reader and reads the first line. + * + * @param files the file(s) to parse, in order + */ + public BasicTextFileParser(boolean treatGroupedDelimitersAsOne, File... files) { + if (files.length == 0) { + throw new IllegalArgumentException("At least one file must be specified."); + } + this.files.addAll(Arrays.asList(files)); + File f = this.files.remove(0); + currentFileName = f.getAbsolutePath(); + reader = new AsciiLineReader(IoUtil.openFileForReading(f)); + this.setTreatGroupedDelimitersAsOne(treatGroupedDelimitersAsOne); + } + + /** + * Constructor. In addition to opening and priming the files, it sets the number of + * whitespace-separated "words" per line. + * + * @param files the file(s) to parse + * @param wordCount number of whitespace-separated "words" per line + */ + public BasicTextFileParser(boolean treatGroupedDelimitersAsOne, int wordCount, File... 
files) { + this(treatGroupedDelimitersAsOne, files); + setWordCount(wordCount); + } + /** + * Workhorse method that reads the next line from the underlying reader + * + * @return String or null if there is no next line + */ + protected byte[] readNextLine() + { + try { + String line = reader.readLine(); + if (line != null) { + return line.getBytes(); + } + if (files.size() > 0) { + currentFileName = files.get(0).getAbsolutePath(); + reader = new AsciiLineReader(IoUtil.openFileForReading(files.remove(0))); + return readNextLine(); + } + return null; + } + catch(RuntimeIOException ioe) { + throw new PicardException("Error reading from file " + currentFileName, ioe); + } + } + + /** + * Closes the underlying stream + */ + public void close() { + if (reader != null) { + reader.close(); + } + } + + /** + * Gets the name of the file being parsed + * + * @return the name of the file being parsed + */ + protected String getFileName() { + return this.currentFileName; + } +} diff --git a/lib/edu/mit/broad/picard/util/CloseableIteratorWrapper.java b/lib/edu/mit/broad/picard/util/CloseableIteratorWrapper.java new file mode 100644 index 0000000000..9099016521 --- /dev/null +++ b/lib/edu/mit/broad/picard/util/CloseableIteratorWrapper.java @@ -0,0 +1,42 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ + +package edu.mit.broad.picard.util; + +import java.util.Iterator; + +import edu.mit.broad.sam.util.CloseableIterator; + +public class CloseableIteratorWrapper implements CloseableIterator { + Iterator wrappedIterator; + + public CloseableIteratorWrapper(Iterator wrappedIterator) { + this.wrappedIterator = wrappedIterator; + } + + @Override + public boolean hasNext() { + return wrappedIterator.hasNext(); + } + + @Override + public T next() { + return wrappedIterator.next(); + } + + @Override + public void remove() { + wrappedIterator.remove(); + } + + @Override + public void close() { + } +} \ No newline at end of file diff --git a/lib/edu/mit/broad/picard/util/CloserUtil.java b/lib/edu/mit/broad/picard/util/CloserUtil.java new file mode 100644 index 0000000000..8b5f702ef3 --- /dev/null +++ b/lib/edu/mit/broad/picard/util/CloserUtil.java @@ -0,0 +1,50 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2008 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ +package edu.mit.broad.picard.util; + +import java.util.List; +import java.util.Arrays; +import java.io.Closeable; +import java.io.IOException; + +/** + * Utility to close things that implement Closeable + * + * @author Kathleen Tibbetts + */ +public class CloserUtil { + + /** + * Calls close() on obj if it implements Closeable + * + * @param obj The potentially closeable object + */ + public static void close(Object obj) { + close(Arrays.asList(obj)); + } + + /** + * Calls close() on all elements of objs that implement Closeable + * + * @param objs A list of potentially closeable objects + */ + public static void close(List objs) { + for (Object o : objs) { + if (o instanceof Closeable) { + try { + ((Closeable)o).close(); + } + catch (IOException ioe) { + // Do nothing + } + } + } + } +} diff --git a/lib/edu/mit/broad/picard/util/CoordMath.java b/lib/edu/mit/broad/picard/util/CoordMath.java new file mode 100644 index 0000000000..981b494c07 --- /dev/null +++ b/lib/edu/mit/broad/picard/util/CoordMath.java @@ -0,0 +1,59 @@ +/* + The Broad Institute + SOFTWARE COPYRIGHT NOTICE AGREEMENT + This software and its documentation are copyright 2005 by the + Broad Institute/Massachusetts Institute of Technology. All rights are + reserved. + + This software is supplied without any warranty or guaranteed support + whatsoever. Neither the Broad Institute nor MIT can be responsible for its + use, misuse, or functionality. +*/ +package edu.mit.broad.picard.util; + + +/** + * Basic coordinate-based math utils, so it's encapsulated in one place! Assumes + * a one-based coordinate system and then 'end' is always inclusive + */ +public class CoordMath { + + /** Gets the length of an interval given the start and the end. */ + public static int getLength(int start, int end) { return (end - start) + 1; } + + /** Gets the start of an interval given the end and the length. 
*/
    public static int getStart(int end, int length) { return end - length + 1; }

    /** Gets the end of an interval given the start and the length. */
    public static int getEnd(int start, int length) { return start + length - 1; }

    /**
     * Checks to see if the two sets of coordinates have any overlap.
     * True when either endpoint of [start2, end2] falls inside [start, end],
     * or when [start, end] is entirely enclosed by [start2, end2].
     */
    public static boolean overlaps(int start, int end, int start2, int end2) {
        return (start2 >= start && start2 <= end) || (end2 >= start && end2 <= end) ||
               encloses(start2, end2, start, end);
    }

    /** Returns true if the "inner" coords are totally enclosed by the "outer" coords. */
    public static boolean encloses(int outerStart, int outerEnd, int innerStart, int innerEnd) {
        return innerStart >= outerStart && innerEnd <= outerEnd;
    }

    /**
     * Determines the amount of overlap between two coordinate ranges. Assumes that the two ranges
     * actually do overlap and therefore may produce strange results when they do not!
     * (Length of the intersection: [max(start, start2), min(end, end2)], inclusive.)
     */
    public static int getOverlap(int start, int end, int start2, int end2) {
        return getLength(Math.max(start, start2), Math.min(end, end2));
    }

    /**
     * Determines the read cycle number for the base.
     *
     * For a positive-strand read the cycle is just readBaseIndex + 1 (1-based);
     * for a negative-strand read the cycle is counted from the far end.
     *
     * @param isNegativeStrand true if the read is negative strand
     * @param readLength       the total read length
     * @param readBaseIndex    the 0-based index of the read base in question
     */
    public static int getCycle(boolean isNegativeStrand, int readLength, final int readBaseIndex) {
        return isNegativeStrand ? readLength - readBaseIndex : readBaseIndex + 1;
    }
}
+ * + * @author Tim Fennell + */ +public class Coverage { + private Interval interval; + private short[] depths; + + /** Constructs a new coverage object for the provided mapping with the desired padding either side. */ + public Coverage(Interval i, int padding) { + this.interval = i; + this.depths = new short[interval.length() + 2*padding]; + } + + /** Adds a single point of depth at the desired offset into the coverage array. */ + public void addBase(int offset) { + if (offset >= 0 && offset < this.depths.length) { + this.depths[offset] += 1; + } + } + + /** Returns true if any base in the range has coverage of > 1 */ + public boolean hasCoverage() { + for (short s : depths) { + if (s > 1) return true; + } + + return false; + } + + /** Gets the coverage depths as an array of shorts. */ + public short[] getDepths() { return this.depths; } +} diff --git a/lib/edu/mit/broad/picard/util/CreateAnalysisDirectory.java b/lib/edu/mit/broad/picard/util/CreateAnalysisDirectory.java new file mode 100644 index 0000000000..c7ba6c6262 --- /dev/null +++ b/lib/edu/mit/broad/picard/util/CreateAnalysisDirectory.java @@ -0,0 +1,88 @@ +package edu.mit.broad.picard.util; + +import edu.mit.broad.picard.cmdline.CommandLineProgram; +import edu.mit.broad.picard.cmdline.Usage; +import edu.mit.broad.picard.cmdline.Option; +import edu.mit.broad.picard.io.IoUtil; + +import java.io.File; +import java.util.Date; +import java.text.SimpleDateFormat; + +/** + * CommandLineProgram to create Picard analysis directory + * + * @author Kathleen Tibbetts + */ +public class CreateAnalysisDirectory extends CommandLineProgram { + + public static final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy_MM_dd"); + + // The following attributes define the command-line arguments + @Usage(programVersion="1.0") + public String USAGE = + "Usage: " + getClass().getName() + " [options]\n\n" + + "Create a new Picard analysis directory.\n"; + + + @Option(shortName = "P", doc = "Analysis directory prefix. 
    public String PREFIX = "/seq/picard";  // @Option(shortName = "P") declaration is split onto the preceding line

    @Option(shortName = "F", doc = "The flowcell. ")
    public String FLOWCELL;

    @Option(shortName = "A", doc = "The first cycle being analyzed. ")
    public Integer FIRST_CYCLE = 1;

    @Option(shortName = "O", doc = "The last cycle being analyzed. ")
    public Integer LAST_CYCLE;

    @Option(shortName = "R", doc = "The run date in the format MM/dd/yyyy. ")
    public Date RUNDATE;

    @Option(shortName = "L", doc = "Lane number. ")
    public Integer LANE;

    @Option(shortName="LIB", doc = "Library this analysis is for (e.g. 'Solexa-1234'). ")
    public String LIBRARY;

    @Option(shortName="S", doc = "Analysis start date in the format MM/dd/yyyy")
    public Date ANALYSIS_START_DATE;

    /**
     * Builds the nested analysis directory path
     * PREFIX/FLOWCELL/C{first}-{last}_{rundate}_{analysisdate}/LANE/LIBRARY,
     * creating each level that does not yet exist, and prints the final path
     * (no trailing newline) to stdout so callers can capture it.
     *
     * @return 0 on success, 1 if any directory could not be created
     */
    @Override
    protected int doWork() {
        // Strips only ONE trailing slash; "PREFIX//" would survive. NOTE(review): confirm callers never pass doubled slashes.
        if (PREFIX.charAt(PREFIX.length()-1) == '/') {
            PREFIX = PREFIX.substring(0, PREFIX.length()-1);
        }
        IoUtil.assertDirectoryIsWritable(new File(PREFIX));
        // Path components, outermost first. "dateFormat" is a shared static
        // SimpleDateFormat declared on the class — not thread-safe, but this is
        // a single-threaded command-line program.
        String parts[] = { PREFIX, FLOWCELL, "C" + FIRST_CYCLE + "-" + LAST_CYCLE + "_" +
                dateFormat.format(RUNDATE) + "_" + dateFormat.format(ANALYSIS_START_DATE),
                String.valueOf(LANE), LIBRARY };
        String directory = null;

        // Starts at i = 1 because parts[0] (PREFIX) must already exist (asserted above).
        // Each iteration re-joins parts[0..i] and creates that single level with mkdir().
        for (int i = 1; i < parts.length; i++) {
            StringBuilder sb = new StringBuilder();
            for (int j=0; j <= i; j++) {
                sb.append(parts[j]).append("/");
            }
            directory = sb.toString();
            File dir = new File(directory);
            if (!dir.exists()) {
                if (!dir.mkdir()) {
                    System.err.println("Unable to create directory " + directory);
                    return 1;
                }
            }
        }
        // print (not println): the bare path is the program's machine-readable output
        System.out.print(directory);
        return 0;
    }

    public static void main(String[] argv) {
        CreateAnalysisDirectory cmd = new CreateAnalysisDirectory();
        cmd.QUIET = true;  // suppress chatter so stdout carries only the directory path
        System.exit(cmd.instanceMain(argv));
    }


}
edu.mit.broad.picard.util; + +import edu.mit.broad.picard.PicardException; + +import java.security.InvalidParameterException; +import java.text.DateFormat; +import java.text.NumberFormat; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.math.RoundingMode; + +/** + * Simple class used to format object values into a standard format for printing. + * + * @author Tim Fennell + */ +public class FormatUtil { + private DateFormat dateFormat; + private NumberFormat integerFormat; + private NumberFormat floatFormat; + + /** Constructs a new FormatUtil and initializes various internal formatters. */ + public FormatUtil() { + this.dateFormat = new SimpleDateFormat("yyyy-MM-dd"); + + this.integerFormat = NumberFormat.getIntegerInstance(); + this.integerFormat.setGroupingUsed(false); + + this.floatFormat = NumberFormat.getNumberInstance(); + this.floatFormat.setGroupingUsed(false); + this.floatFormat.setMaximumFractionDigits(6); + this.floatFormat.setRoundingMode(RoundingMode.HALF_DOWN); + } + + /** Formats a short to an integer string. */ + public String format(short value) { return this.integerFormat.format(value); } + + /** Formats an int to an integer string. */ + public String format(int value) { return this.integerFormat.format(value); } + + /** Formats a long to an integer string. */ + public String format(long value) { return this.integerFormat.format(value); } + + /** Formats a float to a floating point string. */ + public String format(float value) {return this.floatFormat.format(value); } + + /** Formats a double to a floating point string. */ + public String format(double value) {return this.floatFormat.format(value); } + + /** Formats an enum to the String representation of an enum. */ + public String format(Enum value) { return value.name(); } + + /** Formats a date to a date string without time. 
*/ + public String format(Date value) { return this.dateFormat.format(value); } + + /** Formats a boolean value to a String. */ + public String format(boolean value) { if (value) return "Y"; else return "N"; } + + /** Attempts to determine the type of value and format it appropriately. */ + public String format(Object value) { + if (value == null) return ""; + if (value instanceof Short) return format( ((Short) value).shortValue() ); + if (value instanceof Integer) return format( ((Integer) value).intValue() ); + if (value instanceof Long) return format( ((Long) value).longValue() ); + if (value instanceof Float) return format( ((Float) value).floatValue() ); + if (value instanceof Double) return format( ((Double) value).doubleValue() ); + if (value instanceof Enum) return format( ((Enum) value) ); + if (value instanceof Date) return format( ((Date) value) ); + if (value instanceof Boolean) return format( ((Boolean) value).booleanValue() ); + return value.toString(); + } + + /////////////////////////////////////////////////////////////////////////// + // Parsing methods + /////////////////////////////////////////////////////////////////////////// + + /** Parses a String into a short. */ + public short parseShort(String value) { return Short.parseShort(value); } + + /** Parses a String into an int. */ + public int parseInt(String value) { return Integer.parseInt(value); } + + /** Parses a String into a long. */ + public long parseLong(String value) { return Long.parseLong(value); } + + /** Parses a String into a float. */ + public float parseFloat(String value) { return Float.parseFloat(value); } + + /** Parses a String into a double. */ + public double parseDouble(String value) { return Double.parseDouble(value); } + + /** Parses a String into an Enum of the given type. */ + public E parseEnum(String value, Class type) { return (E) Enum.valueOf(type, value); } + + /** Parses a String into a date. 
*/ + public Date parseDate(String value) { + try { + return this.dateFormat.parse(value); + } + catch (ParseException pe) { + throw new PicardException("Could not parse value as date: " + value, pe); + } + } + + /** Parses a String into a boolean. */ + public boolean parseBoolean(String value) { + if (value == null || value.length() == 0) return false; + char ch = Character.toUpperCase(value.charAt(0)); + + return (ch == 'Y'); + } + + /** + * Attempts to determine the correct parse method to call based on the desired + * return type and then parses the String and returns the value. + * + * @param value the String value to be parsed + * @param returnType the desired return type + * @return an object of the returnType + */ + public Object parseObject(String value, Class returnType) { + if (returnType == Short.class || returnType == Short.TYPE) return parseShort(value); + if (returnType == Integer.class || returnType == Integer.TYPE) return parseInt(value); + if (returnType == Long.class || returnType == Long.TYPE) return parseLong(value); + if (returnType == Float.class || returnType == Float.TYPE) return parseFloat(value); + if (returnType == Double.class || returnType == Double.TYPE) return parseDouble(value); + if (returnType == Boolean.class || returnType == Boolean.TYPE) return parseBoolean(value); + if (returnType == Date.class) return parseDate(value); + if (Enum.class.isAssignableFrom(returnType)) return parseEnum(value, (Class)returnType); + if (returnType == String.class) return value; + + throw new InvalidParameterException("Don't know how to convert a String to a " + returnType.getName()); + } +} diff --git a/lib/edu/mit/broad/picard/util/Histogram.java b/lib/edu/mit/broad/picard/util/Histogram.java new file mode 100644 index 0000000000..3d1f3f8078 --- /dev/null +++ b/lib/edu/mit/broad/picard/util/Histogram.java @@ -0,0 +1,152 @@ +package edu.mit.broad.picard.util; + +import edu.mit.broad.picard.util.Histogram.Bin; + +import java.util.TreeMap; + +/** + * 
Class for computing and accessing histogram type data. Stored internally in + * a sorted Map so that keys can be iterated in order. + * + * @author Tim Fennell + */ +public class Histogram extends TreeMap { + private String binLabel = "BIN"; + private String valueLabel = "VALUE"; + private double count = 0; + private Double mean; + + /** Constructs a new Histogram with default bin and value labels. */ + public Histogram() { } + + /** Constructs a new Histogram with supplied bin and value labels. */ + public Histogram(String binLabel, String valueLabel) { + this.binLabel = binLabel; + this.valueLabel = valueLabel; + } + + /** Represents a bin in the Histogram. */ + public class Bin { + private final K id; + private double value = 0; + + /** Constructs a new bin with the given ID. */ + private Bin(K id) { this.id = id; } + + /** Gets the ID of this bin. */ + public K getId() { return id; } + + /** Gets the value in the bin. */ + public double getValue() { return value; } + + /** Returns the String format for the value in the bin. */ + public String toString() { return String.valueOf(this.value); } + + /** Checks the equality of the bin by ID and value. */ + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Bin bin = (Bin) o; + + if (Double.compare(bin.value, value) != 0) return false; + if (!id.equals(bin.id)) return false; + + return true; + } + + public double getIdValue() { + if (id instanceof Number) { + return ((Number) id).doubleValue(); + } else { + throw new UnsupportedOperationException("getIdValue only supported for Histogram"); + } + } + } + + /** Prefill the histogram with the supplied set of bins. */ + public void prefillBins(K... ids) { + for (K id : ids) { + put(id, new Bin(id)); + } + } + + /** Increments the value in the designated bin by 1. 
*/ + public void increment(K id) { + increment(id, 1d); + } + + /** Increments the value in the designated bin by the supplied increment. */ + public void increment(K id, double increment) { + Bin bin = get(id); + if (bin == null) { + bin = new Bin(id); + put(id, bin); + } + + bin.value += increment; + count += increment; + mean = null; + } + + public String getBinLabel() { return binLabel; } + public void setBinLabel(String binLabel) { this.binLabel = binLabel; } + + public String getValueLabel() { return valueLabel; } + public void setValueLabel(String valueLabel) { this.valueLabel = valueLabel; } + + /** Checks that the labels and values in the two histograms are identical. */ + public boolean equals(Object o) { + return o != null && + (o instanceof Histogram) && + ((Histogram) o).binLabel.equals(this.binLabel) && + ((Histogram) o).valueLabel.equals(this.valueLabel) && + super.equals(o); + } + + public double getMean() { + if (mean == null) { + double total = 0; + for (Bin bin : values()) { + total += bin.getValue() * bin.getIdValue(); + } + + mean = total / count; + } + + return mean; + } + + public double getStandardDeviation() { + double total = 0; + for (Bin bin : values()) { + total += bin.getValue() * bin.getIdValue() * bin.getIdValue(); + } + + return Math.sqrt((total / count) - (getMean() * getMean())); + } + + public double getMedian() { + double total = 0; + double halfCount = count / 2; + for (Bin bin : values()) { + total += bin.getValue(); + if (total >= halfCount) { + return bin.getIdValue(); + } + } + return 0; + } + + public double getMin() { + return firstEntry().getValue().getIdValue(); + } + + public double getMax() { + return lastEntry().getValue().getIdValue(); + } + + public double getCount() { + return count; + } +} diff --git a/lib/edu/mit/broad/picard/util/Interval.java b/lib/edu/mit/broad/picard/util/Interval.java new file mode 100644 index 0000000000..79a0918073 --- /dev/null +++ b/lib/edu/mit/broad/picard/util/Interval.java @@ -0,0 
package edu.mit.broad.picard.util;

import edu.mit.broad.picard.PicardException;

import java.util.List;
import java.util.Collection;

/**
 * Represents a simple interval on a sequence. Coordinates are 1-based closed ended.
 *
 * Ordering and equality consider sequence, start and end only — strand and name
 * are deliberately ignored.
 *
 * @author Tim Fennell
 */
public class Interval implements Comparable<Interval>, Cloneable {
    private String sequence;
    private int start;
    private int end;
    private boolean negativeStrand;
    private String name;

    /**
     * Constructs an interval with the supplied sequence and start and end. If the end
     * position is less than the start position an exception is thrown.
     *
     * @param sequence the name of the sequence
     * @param start the start position of the interval on the sequence
     * @param end the end position of the interval on the sequence
     */
    public Interval(String sequence, int start, int end) {
        this.sequence = sequence;
        this.start = start;
        this.end = end;

        if (this.end < this.start) throw new IllegalArgumentException("start must be less than or equal to end!");
    }

    /**
     * Constructs an interval with the supplied sequence and start, end, strand and name.
     * If the end position is less than the start position an exception is thrown.
     *
     * @param sequence the name of the sequence
     * @param start the start position of the interval on the sequence
     * @param end the end position of the interval on the sequence
     * @param negative true to indicate negative strand, false otherwise
     * @param name the name (possibly null) of the interval
     */
    public Interval(String sequence, int start, int end, boolean negative, String name) {
        this(sequence, start, end);
        this.negativeStrand = negative;
        this.name = name;
    }

    /** Gets the name of the sequence on which the interval resides. */
    public String getSequence() { return sequence; }

    /** Gets the 1-based start position of the interval on the sequence. */
    public int getStart() { return start; }

    /** Gets the 1-based closed-ended end position of the interval on the sequence. */
    public int getEnd() { return end; }

    /** Returns true if the interval is on the negative strand, otherwise false. */
    public boolean isNegativeStrand() { return this.negativeStrand; }

    /** Returns true if the interval is on the positive strand, otherwise false. */
    public boolean isPositiveStrand() { return !this.negativeStrand; }

    /** Returns the name of the interval, possibly null. */
    public String getName() { return this.name; }

    /** Returns true if this interval overlaps the other interval, otherwise false. */
    public boolean intersects(Interval other) {
        return (this.getSequence().equals(other.getSequence()) &&
                CoordMath.overlaps(this.start, this.end, other.start, other.end));
    }

    /** Returns true if this interval is directly adjacent to (but not overlapping) the other interval. */
    public boolean abuts(Interval other) {
        return this.getSequence().equals(other.getSequence()) &&
               (this.start == other.end + 1 || other.start == this.end + 1);
    }

    /** Gets the length of this interval (inclusive of both endpoints). */
    public int length() { return this.end - this.start + 1; }

    /** Counts the total number of bases in a collection of intervals. */
    public static long countBases(Collection<Interval> intervals) {
        long total = 0;
        for (Interval i : intervals) {
            total += i.length();
        }

        return total;
    }


    /**
     * Sort based on sequence.compareTo, then start pos, then end pos
     * with null objects coming lexically last.
     */
    public int compareTo(Interval that) {
        if (that == null) return -1; // nulls last

        int result = this.getSequence().compareTo(that.getSequence());
        if (result == 0) {
            if (this.start == that.start) {
                // Integer.compare avoids overflow of plain subtraction on extreme coordinates
                result = Integer.compare(this.end, that.end);
            }
            else {
                result = Integer.compare(this.start, that.start);
            }
        }

        return result;
    }

    /** Equals method that agrees with {@link #compareTo(Interval)}. */
    public boolean equals(Interval that) {
        return (this.compareTo(that) == 0);
    }

    /**
     * Override of Object.equals so hash-based collections behave correctly;
     * the previous equals(Interval) overload never overrode Object.equals.
     */
    @Override
    public boolean equals(Object other) {
        return other instanceof Interval && compareTo((Interval) other) == 0;
    }

    @Override
    public int hashCode() {
        // Previous code used (start ^ (start >>> 32)) — a no-op recipe meant for
        // longs that made start/end contribute nothing to the hash for int fields.
        int result = sequence.hashCode();
        result = 31 * result + start;
        result = 31 * result + end;
        return result;
    }

    public String toString() {
        return getSequence() + ":" + start + "-" + end;
    }

    @Override
    public Interval clone() {
        try { return (Interval) super.clone(); }
        catch (CloneNotSupportedException cnse) { throw new PicardException("That's unpossible", cnse); }
    }
}
    // NOTE(review): interior fragment of IntervalTree (the class header is outside
    // this hunk). Generic type arguments (e.g. Node<V>, Iterator<Node<V>>) appear to
    // have been stripped from this copy by the extraction process; the code below is
    // reproduced as-is, with documentation only.

    /**
     * Remove all entries.
     */
    public void clear()
    {
        mRoot = null;
    }

    /**
     * Put a new interval into the tree (or update the value associated with an existing interval).
     * If the interval is novel, the special sentinel value is returned.
     * @param interval The interval.
     * @param value The associated value.
     * @return The old value associated with that interval, or the sentinel.
     */
    public V put( HalfOpenInterval interval, V value )
    {
        return put(interval.getStart(),interval.getEnd(),value);
    }

    /**
     * Put a new interval into the tree (or update the value associated with an existing interval).
     * If the interval is novel, the special sentinel value is returned.
     * @param start The interval's start.
     * @param end The interval's end.
     * @param value The associated value.
     * @return The old value associated with that interval, or the sentinel.
     */
    @SuppressWarnings("null")
    public V put( int start, int end, V value )
    {
        if ( start > end )
            throw new IllegalArgumentException("Start cannot exceed end.");

        V result = mSentinel;

        if ( mRoot == null )
        {
            mRoot = new Node(start,end,value);
        }
        else
        {
            Node parent = null;
            Node node = mRoot;
            int cmpVal = 0;

            // standard BST descent; parent/cmpVal retain the last non-null node and its comparison
            while ( node != null )
            {
                parent = node; // last non-null node
                cmpVal = node.compare(start,end);
                if ( cmpVal == 0 )
                {
                    break;
                }

                node = cmpVal < 0 ? node.getLeft() : node.getRight();
            }

            if ( cmpVal == 0 )
            {
                // exact interval already present: replace its value
                result = parent.setValue(value);
            }
            else
            {
                // novel interval: insert below parent; insertLeft/insertRight rebalance and return the new root
                if ( cmpVal < 0 )
                {
                    mRoot = parent.insertLeft(start,end,value,mRoot);
                }
                else
                {
                    mRoot = parent.insertRight(start,end,value,mRoot);
                }
            }
        }

        return result;
    }

    /**
     * Remove an interval from the tree. If the interval does not exist in the tree the
     * special sentinel value is returned.
     * @param interval The interval to remove.
     * @return The value associated with that interval, or the sentinel.
     */
    public V remove( HalfOpenInterval interval )
    {
        return remove(interval.getStart(),interval.getEnd());
    }

    /**
     * Remove an interval from the tree. If the interval does not exist in the tree the
     * special sentinel value is returned.
     * @param start The interval's start.
     * @param end The interval's end.
     * @return The value associated with that interval, or the sentinel.
     */
    public V remove( int start, int end )
    {
        V result = mSentinel;
        Node node = mRoot;

        while ( node != null )
        {
            int cmpVal = node.compare(start,end);
            if ( cmpVal == 0 )
            {
                result = node.getValue();
                mRoot = node.remove(mRoot); // remove() rebalances and returns the new root
                break;
            }

            node = cmpVal < 0 ? node.getLeft() : node.getRight();
        }

        return result;
    }

    /**
     * Find an interval.
     * @param interval The interval sought.
     * @return The Node that represents that interval, or null.
     */
    public Node find( HalfOpenInterval interval )
    {
        return find(interval.getStart(),interval.getEnd());
    }

    /**
     * Find an interval.
     * @param start The interval's start.
     * @param end The interval's end.
     * @return The Node that represents that interval, or null.
     */
    public Node find( int start, int end )
    {
        Node node = mRoot;

        while ( node != null )
        {
            int cmpVal = node.compare(start,end);
            if ( cmpVal == 0 )
            {
                break;
            }

            node = cmpVal < 0 ? node.getLeft() : node.getRight();
        }

        return node;
    }

    /**
     * Find the nth interval in the tree.
     * @param idx The rank of the interval sought (from 0 to size()-1).
     * @return The Node that represents the nth interval.
     */
    public Node findByIndex( int idx )
    {
        return Node.findByRank(mRoot,idx+1); // ranks are 1-based internally
    }

    /**
     * Find the rank of the specified interval. If the specified interval is not in the
     * tree, then -1 is returned.
     * @param interval The interval for which the index is sought.
     * @return The rank of that interval, or -1.
     */
    public int getIndex( HalfOpenInterval interval )
    {
        return getIndex(interval.getStart(),interval.getEnd());
    }

    /**
     * Find the rank of the specified interval. If the specified interval is not in the
     * tree, then -1 is returned.
     * @param start The interval's start.
     * @param end The interval's end.
     * @return The rank of that interval, or -1.
     */
    public int getIndex( int start, int end )
    {
        return Node.getRank(mRoot,start,end) - 1;
    }

    /**
     * Find the least interval in the tree.
     * @return The earliest interval, or null if the tree is empty.
     */
    public Node min()
    {
        Node result = null;
        Node node = mRoot;

        while ( node != null )
        {
            result = node;
            node = node.getLeft();
        }

        return result;
    }

    /**
     * Find the earliest interval in the tree greater than or equal to the specified interval.
     * @param interval The interval sought.
     * @return The earliest &gt;= interval, or null if there is none.
     */
    public Node min( HalfOpenInterval interval )
    {
        return min(interval.getStart(),interval.getEnd());
    }

    /**
     * Find the earliest interval in the tree greater than or equal to the specified interval.
     * @param start The interval's start.
     * @param end The interval's end.
     * @return The earliest &gt;= interval, or null if there is none.
     */
    @SuppressWarnings("null")
    public Node min( int start, int end )
    {
        Node result = null;
        Node node = mRoot;
        int cmpVal = 0;

        while ( node != null )
        {
            result = node;
            cmpVal = node.compare(start,end);
            if ( cmpVal == 0 )
            {
                break;
            }

            node = cmpVal < 0 ? node.getLeft() : node.getRight();
        }

        // descent ended on a node that sorts before the query: step to its in-order successor
        if ( cmpVal > 0 )
        {
            result = result.getNext();
        }

        return result;
    }

    /**
     * Find the earliest interval in the tree that overlaps the specified interval.
     * @param interval The interval sought.
     * @return The earliest overlapping interval, or null if there is none.
     */
    public Node minOverlapper( HalfOpenInterval interval )
    {
        return minOverlapper(interval.getStart(),interval.getEnd());
    }

    /**
     * Find the earliest interval in the tree that overlaps the specified interval.
     * Uses the per-node max-end augmentation (getMaxEnd) to prune subtrees that
     * cannot contain an overlapper.
     * @param start The interval's start.
     * @param end The interval's end.
     * @return The earliest overlapping interval, or null if there is none.
     */
    public Node minOverlapper( int start, int end )
    {
        Node result = null;
        Node node = mRoot;

        if ( node != null && node.getMaxEnd() >= start )
        {
            while ( true )
            {
                if ( node.getStart() <= end && start <= node.getEnd() )
                { // this node overlaps. there might be a lesser overlapper down the left sub-tree.
                  // no need to consider the right sub-tree: even if there's an overlapper, if won't be minimal
                    result = node;
                    node = node.getLeft();
                    if ( node == null || node.getMaxEnd() < start )
                        break; // no left sub-tree or all nodes end too early
                }
                else
                { // no overlap. if there might be a left sub-tree overlapper, consider the left sub-tree.
                    Node left = node.getLeft();
                    if ( left != null && left.getMaxEnd() >= start )
                    {
                        node = left;
                    }
                    else
                    { // left sub-tree cannot contain an overlapper. consider the right sub-tree.
                        if ( node.getStart() > end )
                            break; // everything in the right sub-tree is past the end of the query interval

                        node = node.getRight();
                        if ( node == null || node.getMaxEnd() < start )
                            break; // no right sub-tree or all nodes end too early
                    }
                }
            }
        }

        return result;
    }

    /**
     * Find the greatest interval in the tree.
     * @return The latest interval, or null if the tree is empty.
     */
    public Node max()
    {
        Node result = null;
        Node node = mRoot;

        while ( node != null )
        {
            result = node;
            node = node.getRight();
        }

        return result;
    }

    /**
     * Find the latest interval in the tree less than or equal to the specified interval.
     * @param interval The interval sought.
     * @return The latest &lt;= interval, or null if there is none.
     */
    public Node max( HalfOpenInterval interval )
    {
        return max(interval.getStart(),interval.getEnd());
    }

    /**
     * Find the latest interval in the tree less than or equal to the specified interval.
     * @param start The interval's start.
     * @param end The interval's end.
     * @return The latest &lt;= interval, or null if there is none.
     */
    @SuppressWarnings("null")
    public Node max( int start, int end )
    {
        Node result = null;
        Node node = mRoot;
        int cmpVal = 0;

        while ( node != null )
        {
            result = node;
            cmpVal = node.compare(start,end);
            if ( cmpVal == 0 )
            {
                break;
            }

            node = cmpVal < 0 ? node.getLeft() : node.getRight();
        }

        // descent ended on a node that sorts after the query: step to its in-order predecessor
        if ( cmpVal < 0 )
        {
            result = result.getPrev();
        }

        return result;
    }

    /**
     * Return an iterator over the entire tree.
     * @return An iterator.
     */
    public Iterator> iterator()
    {
        return new FwdIterator(min());
    }

    /**
     * Return an iterator over all intervals greater than or equal to the specified interval.
     * @param interval The minimum interval.
     * @return An iterator.
     */
    public Iterator> iterator( HalfOpenInterval interval )
    {
        return new FwdIterator(min(interval.getStart(),interval.getEnd()));
    }

    /**
     * Return an iterator over all intervals greater than or equal to the specified interval.
     * @param start The interval's start.
     * @param end The interval's end.
     * @return An iterator.
     */
    public Iterator> iterator( int start, int end )
    {
        return new FwdIterator(min(start,end));
    }

    /**
     * Return an iterator over all intervals overlapping the specified range.
     * @param start The range start.
     * @param end The range end.
     * @return An iterator.
     */
    public Iterator> overlappers( int start, int end )
    {
        return new OverlapIterator(start,end);
    }

    /**
     * Return an iterator over the entire tree that returns intervals in reverse order.
     * @return An iterator.
     */
    public Iterator> reverseIterator()
    {
        return new RevIterator(max());
    }

    /**
     * Return an iterator over all intervals less than or equal to the specified interval, in reverse order.
     * @param interval The maximum interval.
     * @return An iterator.
     */
    public Iterator> reverseIterator( HalfOpenInterval interval )
    {
        return new RevIterator(max(interval.getStart(),interval.getEnd()));
    }

    /**
     * Return an iterator over all intervals less than or equal to the specified interval, in reverse order.
     * @param start The interval's start.
     * @param end The interval's end.
     * @return An iterator.
     */
    public Iterator> reverseIterator( int start, int end )
    {
        return new RevIterator(max(start,end));
    }

    /**
     * Get the special sentinel value that will be used to signal novelty when putting a new interval
     * into the tree, or to signal "not found" when removing an interval. This is null by default.
     * @return The sentinel value.
     */
    public V getSentinel()
    {
        return mSentinel;
    }

    /**
     * Set the special sentinel value that will be used to signal novelty when putting a new interval
     * into the tree, or to signal "not found" when removing an interval.
     * @param sentinel The new sentinel value.
     * @return The old sentinel value.
     */
    public V setSentinel( V sentinel )
    {
        V result = mSentinel;
        mSentinel = sentinel;
        return result;
    }

    // package-private hook used by the iterators to remove the current node
    void removeNode( Node node )
    {
        mRoot = node.remove(mRoot);
    }

    private Node mRoot;
    private V mSentinel;

    /**
     * A node of the red-black interval tree. Carries the interval, its value,
     * subtree size (for rank queries), and the max end over its subtree (for
     * overlap queries).
     */
    public static class Node
        implements HalfOpenInterval
    {
        // Root constructor: root of a red-black tree is black.
        Node( int start, int end, V1 value )
        {
            mStart = start;
            mEnd = end;
            mValue = value;
            mSize = 1;
            mMaxEnd = mEnd;
            mIsBlack = true;
        }

        // Child constructor: new nodes are inserted red (mIsBlack defaults to false).
        Node( Node parent, int start, int end, V1 value )
        {
            mParent = parent;
            mStart = start;
            mEnd = end;
            mValue = value;
            mMaxEnd = mEnd;
            mSize = 1;
        }

        public int getStart()
        {
            return mStart;
        }

        public int getEnd()
        {
            return mEnd;
        }

        public int getLength()
        {
            return mEnd - mStart;
        }

        /** Bit-flags describing how this node's interval relates to the given interval. */
        public int getRelationship( HalfOpenInterval interval )
        {
            int result = 0;
            if ( mStart < interval.getStart() )
                result = HalfOpenInterval.HAS_LESSER_PART;
            if ( mEnd > interval.getEnd() )
                result |= HalfOpenInterval.HAS_GREATER_PART;
            if ( mStart < interval.getEnd() && interval.getStart() < mEnd )
                result |= HalfOpenInterval.HAS_OVERLAPPING_PART;
            return result;
        }

        public boolean isAdjacent( HalfOpenInterval interval )
        {
            return mStart == interval.getEnd() || mEnd == interval.getStart();
        }

        public V1 getValue()
        {
            return mValue;
        }

        public V1 setValue( V1 value )
        {
            V1 result = mValue;
            mValue = value;
            return result;
        }

        // Number of nodes in the subtree rooted here (used for rank queries).
        int getSize()
        {
            return mSize;
        }

        // Maximum interval end over the subtree rooted here (used for overlap pruning).
        int getMaxEnd()
        {
            return mMaxEnd;
        }

        Node getLeft()
        {
            return mLeft;
        }

        // Insert a new red node as this node's left child, then restore red-black invariants.
        Node insertLeft( int start, int end, V1 value, Node root )
        {
            mLeft = new Node(this,start,end,value);
            return insertFixup(mLeft,root);
        }

        Node getRight()
        {
            return mRight;
        }

        // Insert a new red node as this node's right child, then restore red-black invariants.
        Node insertRight( int start, int end, V1 value, Node root )
        {
            mRight = new Node(this,start,end,value);
            return insertFixup(mRight,root);
        }

        /** In-order successor: leftmost of the right subtree, else first right-turn ancestor. */
        Node getNext()
        {
            Node result;

            if ( mRight != null )
            {
                result = mRight;
                while ( result.mLeft != null )
                {
                    result = result.mLeft;
                }
            }
            else
            {
                Node node = this;
                result = mParent;
                while ( result != null && node == result.mRight )
                {
                    node = result;
                    result = result.mParent;
                }
            }

            return result;
        }

        /** In-order predecessor: rightmost of the left subtree, else first left-turn ancestor. */
        Node getPrev()
        {
            Node result;

            if ( mLeft != null )
            {
                result = mLeft;
                while ( result.mRight != null )
                {
                    result = result.mRight;
                }
            }
            else
            {
                Node node = this;
                result = mParent;
                while ( result != null && node == result.mLeft )
                {
                    node = result;
                    result = result.mParent;
                }
            }

            return result;
        }

        // mSize == 0 marks a node that has already been unlinked from the tree.
        boolean wasRemoved()
        {
            return mSize == 0;
        }

        /**
         * Unlink this node from the tree, restoring red-black invariants.
         * @param root the current tree root
         * @return the (possibly new) tree root
         */
        Node remove( Node root )
        {
            if ( mSize == 0 )
            {
                throw new IllegalStateException("Entry was already removed.");
            }

            if ( mLeft == null )
            {
                if ( mRight == null )
                { // no children
                    if ( mParent == null )
                    {
                        root = null;
                    }
                    else if ( mParent.mLeft == this )
                    {
                        mParent.mLeft = null;
                        fixup(mParent);

                        // removing a black leaf unbalances black-height: rebalance
                        if ( mIsBlack )
                            root = removeFixup(mParent,null,root);
                    }
                    else
                    {
                        mParent.mRight = null;
                        fixup(mParent);

                        if ( mIsBlack )
                            root = removeFixup(mParent,null,root);
                    }
                }
                else
                { // single child on right
                    root = spliceOut(mRight,root);
                }
            }
            else if ( mRight == null )
            { // single child on left
                root = spliceOut(mLeft,root);
            }
            else
            { // two children: remove the in-order successor, then graft it into this node's place
                Node next = getNext();
                root = next.remove(root);

                // put next into tree in same position as this, effectively removing this
                if ( (next.mParent = mParent) == null )
                    root = next;
                else if ( mParent.mLeft == this )
                    mParent.mLeft = next;
                else
                    mParent.mRight = next;

                if ( (next.mLeft = mLeft) != null )
                {
                    mLeft.mParent = next;
                }

                if ( (next.mRight = mRight) != null )
                {
                    mRight.mParent = next;
                }

                // successor inherits this node's color and subtree size
                next.mIsBlack = mIsBlack;
                next.mSize = mSize;
            }

            mSize = 0; // mark as removed
            return root;
        }

        // backwards comparison! compares start+end to this.
+ int compare( int start, int end ) + { + int result = 0; + + if ( start > mStart ) + result = 1; + else if ( start < mStart ) + result = -1; + else if ( end > mEnd ) + result = 1; + else if ( end < mEnd ) + result = -1; + + return result; + } + + @SuppressWarnings("null") + static Node getNextOverlapper( Node node, int start, int end ) + { + do + { + Node nextNode = node.mRight; + if ( nextNode != null && nextNode.mMaxEnd >= start ) + { + node = nextNode; + while ( (nextNode = node.mLeft) != null && nextNode.mMaxEnd >= start ) + node = nextNode; + } + else + { + nextNode = node; + while ( (node = nextNode.mParent) != null && node.mRight == nextNode ) + nextNode = node; + } + + if ( node != null && node.mStart > end ) + node = null; + } + while ( node != null && !(node.mStart <= end && start <= node.mEnd) ); + + return node; + } + + static Node findByRank( Node node, int rank ) + { + while ( node != null ) + { + int nodeRank = node.getRank(); + if ( rank == nodeRank ) + break; + + if ( rank < nodeRank ) + { + node = node.mLeft; + } + else + { + node = node.mRight; + rank -= nodeRank; + } + } + + return node; + } + + static int getRank( Node node, int start, int end ) + { + int rank = 0; + + while ( node != null ) + { + int cmpVal = node.compare(start,end); + if ( cmpVal < 0 ) + { + node = node.mLeft; + } + else + { + rank += node.getRank(); + if ( cmpVal == 0 ) + return rank; // EARLY RETURN!!! 
+ + node = node.mRight; + } + } + + return 0; + } + + private int getRank() + { + int result = 1; + if ( mLeft != null ) + result = mLeft.mSize + 1; + return result; + } + + private Node spliceOut( Node child, Node root ) + { + if ( (child.mParent = mParent) == null ) + { + root = child; + child.mIsBlack = true; + } + else + { + if ( mParent.mLeft == this ) + mParent.mLeft = child; + else + mParent.mRight = child; + fixup(mParent); + + if ( mIsBlack ) + root = removeFixup(mParent,child,root); + } + + return root; + } + + private Node rotateLeft( Node root ) + { + Node child = mRight; + + int childSize = child.mSize; + child.mSize = mSize; + mSize -= childSize; + + if ( (mRight = child.mLeft) != null ) + { + mRight.mParent = this; + mSize += mRight.mSize; + } + + if ( (child.mParent = mParent) == null ) + root = child; + else if ( this == mParent.mLeft ) + mParent.mLeft = child; + else + mParent.mRight = child; + + child.mLeft = this; + mParent = child; + + setMaxEnd(); + child.setMaxEnd(); + + return root; + } + + private Node rotateRight( Node root ) + { + Node child = mLeft; + + int childSize = child.mSize; + child.mSize = mSize; + mSize -= childSize; + + if ( (mLeft = child.mRight) != null ) + { + mLeft.mParent = this; + mSize += mLeft.mSize; + } + + if ( (child.mParent = mParent) == null ) + root = child; + else if ( this == mParent.mLeft ) + mParent.mLeft = child; + else + mParent.mRight = child; + + child.mRight = this; + mParent = child; + + setMaxEnd(); + child.setMaxEnd(); + + return root; + } + + private void setMaxEnd() + { + mMaxEnd = mEnd; + if ( mLeft != null ) + mMaxEnd = Math.max(mMaxEnd,mLeft.mMaxEnd); + if ( mRight != null ) + mMaxEnd = Math.max(mMaxEnd,mRight.mMaxEnd); + } + + private static void fixup( Node node ) + { + do + { + node.mSize = 1; + node.mMaxEnd = node.mEnd; + if ( node.mLeft != null ) + { + node.mSize += node.mLeft.mSize; + node.mMaxEnd = Math.max(node.mMaxEnd,node.mLeft.mMaxEnd); + } + if ( node.mRight != null ) + { + node.mSize 
+= node.mRight.mSize; + node.mMaxEnd = Math.max(node.mMaxEnd,node.mRight.mMaxEnd); + } + } + while ( (node = node.mParent) != null ); + } + + private static Node insertFixup( Node daughter, Node root ) + { + Node mom = daughter.mParent; + fixup(mom); + + while( mom != null && !mom.mIsBlack ) + { + Node gramma = mom.mParent; + Node auntie = gramma.mLeft; + if ( auntie == mom ) + { + auntie = gramma.mRight; + if ( auntie != null && !auntie.mIsBlack ) + { + mom.mIsBlack = true; + auntie.mIsBlack = true; + gramma.mIsBlack = false; + daughter = gramma; + } + else + { + if ( daughter == mom.mRight ) + { + root = mom.rotateLeft(root); + mom = daughter; + } + mom.mIsBlack = true; + gramma.mIsBlack = false; + root = gramma.rotateRight(root); + break; + } + } + else + { + if ( auntie != null && !auntie.mIsBlack ) + { + mom.mIsBlack = true; + auntie.mIsBlack = true; + gramma.mIsBlack = false; + daughter = gramma; + } + else + { + if ( daughter == mom.mLeft ) + { + root = mom.rotateRight(root); + mom = daughter; + } + mom.mIsBlack = true; + gramma.mIsBlack = false; + root = gramma.rotateLeft(root); + break; + } + } + mom = daughter.mParent; + } + root.mIsBlack = true; + return root; + } + + private static Node removeFixup( Node parent, Node node, Node root ) + { + do + { + if ( node == parent.mLeft ) + { + Node sister = parent.mRight; + if ( !sister.mIsBlack ) + { + sister.mIsBlack = true; + parent.mIsBlack = false; + root = parent.rotateLeft(root); + sister = parent.mRight; + } + if ( (sister.mLeft == null || sister.mLeft.mIsBlack) && (sister.mRight == null || sister.mRight.mIsBlack) ) + { + sister.mIsBlack = false; + node = parent; + } + else + { + if ( sister.mRight == null || sister.mRight.mIsBlack ) + { + sister.mLeft.mIsBlack = true; + sister.mIsBlack = false; + root = sister.rotateRight(root); + sister = parent.mRight; + } + sister.mIsBlack = parent.mIsBlack; + parent.mIsBlack = true; + sister.mRight.mIsBlack = true; + root = parent.rotateLeft(root); + node = root; + } 
+ } + else + { + Node sister = parent.mLeft; + if ( !sister.mIsBlack ) + { + sister.mIsBlack = true; + parent.mIsBlack = false; + root = parent.rotateRight(root); + sister = parent.mLeft; + } + if ( (sister.mLeft == null || sister.mLeft.mIsBlack) && (sister.mRight == null || sister.mRight.mIsBlack) ) + { + sister.mIsBlack = false; + node = parent; + } + else + { + if ( sister.mLeft == null || sister.mLeft.mIsBlack ) + { + sister.mRight.mIsBlack = true; + sister.mIsBlack = false; + root = sister.rotateLeft(root); + sister = parent.mLeft; + } + sister.mIsBlack = parent.mIsBlack; + parent.mIsBlack = true; + sister.mLeft.mIsBlack = true; + root = parent.rotateRight(root); + node = root; + } + } + parent = node.mParent; + } + while ( parent != null && node.mIsBlack ); + + node.mIsBlack = true; + return root; + } + + private Node mParent; + private Node mLeft; + private Node mRight; + private int mStart; + private int mEnd; + private V1 mValue; + private int mSize; + private int mMaxEnd; + private boolean mIsBlack; + } + + public class FwdIterator + implements Iterator> + { + public FwdIterator( Node node ) + { + mNext = node; + } + + public boolean hasNext() + { + return mNext != null; + } + + public Node next() + { + if ( mNext == null ) + { + throw new NoSuchElementException("No next element."); + } + + if ( mNext.wasRemoved() ) + { + mNext = min(mNext.getStart(),mNext.getEnd()); + if ( mNext == null ) + throw new ConcurrentModificationException("Current element was removed, and there are no more elements."); + } + mLast = mNext; + mNext = mNext.getNext(); + return mLast; + } + + public void remove() + { + if ( mLast == null ) + { + throw new IllegalStateException("No entry to remove."); + } + + removeNode(mLast); + mLast = null; + } + + private Node mNext; + private Node mLast; + } + + public class RevIterator + implements Iterator> + { + public RevIterator( Node node ) + { + mNext = node; + } + + public boolean hasNext() + { + return mNext != null; + } + + public 
Node next() + { + if ( mNext == null ) + throw new NoSuchElementException("No next element."); + if ( mNext.wasRemoved() ) + { + mNext = max(mNext.getStart(),mNext.getEnd()); + if ( mNext == null ) + throw new ConcurrentModificationException("Current element was removed, and there are no more elements."); + } + mLast = mNext; + mNext = mNext.getPrev(); + return mLast; + } + + public void remove() + { + if ( mLast == null ) + { + throw new IllegalStateException("No entry to remove."); + } + + removeNode(mLast); + mLast = null; + } + + private Node mNext; + private Node mLast; + } + + public class OverlapIterator + implements Iterator> + { + public OverlapIterator( int start, int end ) + { + mNext = minOverlapper(start,end); + mStart = start; + mEnd = end; + } + + public boolean hasNext() + { + return mNext != null; + } + + public Node next() + { + if ( mNext == null ) + { + throw new NoSuchElementException("No next element."); + } + + if ( mNext.wasRemoved() ) + { + throw new ConcurrentModificationException("Current element was removed."); + } + + mLast = mNext; + mNext = Node.getNextOverlapper(mNext,mStart,mEnd); + return mLast; + } + + public void remove() + { + if ( mLast == null ) + { + throw new IllegalStateException("No entry to remove."); + } + + removeNode(mLast); + mLast = null; + } + + private Node mNext; + private Node mLast; + private int mStart; + private int mEnd; + } + + public static class ValuesIterator + implements Iterator + { + public ValuesIterator( Iterator> itr ) + { + mItr = itr; + } + + public boolean hasNext() + { + return mItr.hasNext(); + } + + public V1 next() + { + return mItr.next().getValue(); + } + + public void remove() + { + mItr.remove(); + } + + private Iterator> mItr; + } +} + +/** + * Semi-open interval on the integer number line. + * Turf covered runs from the start value inclusive, up to, but not including, the end value. 
 *
 * @author tsharpe
 * @version $Revision: 51146 $
 */
interface HalfOpenInterval
{
    // Bit flags from which the composite IS_* constants below are assembled.
    static final int HAS_LESSER_PART = 1;
    static final int HAS_OVERLAPPING_PART = 2;
    static final int HAS_GREATER_PART = 4;

    static final int IS_ADJACENT_AND_EMPTY = 0;
    static final int IS_STRICTLY_LESS = HAS_LESSER_PART; // 1
    static final int IS_SUBSET = HAS_OVERLAPPING_PART; // 2
    static final int IS_LEFT_OVERHANGING_OVERLAPPER = HAS_LESSER_PART | HAS_OVERLAPPING_PART; // 3
    static final int IS_STRICTLY_GREATER = HAS_GREATER_PART; // 4
    // there is no value that equals 5, since that would imply overhanging on left and right without overlapping
    static final int IS_RIGHT_OVERHANGING_OVERLAPPER = HAS_GREATER_PART | HAS_OVERLAPPING_PART; // 6
    static final int IS_SUPERSET = HAS_LESSER_PART | HAS_OVERLAPPING_PART | HAS_GREATER_PART; // 7

    /**
     * Returns the starting point of the interval (inclusive).
     * @return The start.
     */
    int getStart();

    /**
     * Returns the ending point of the interval.
     * The interval is not regarded as including this point (half-open).
     * @return The end.
     */
    int getEnd();

    /**
     * The interval's length: end - start.
     */
    int getLength();

    /**
     * Returns a constant that describes the relationship of this interval
     * to a specified interval with regard to position on the number line.
     * @param interval The interval to compare this one to.
     * @return One of the IS_* constants defined above.
     */
    int getRelationship( HalfOpenInterval interval );

    /**
     * Returns true if this interval ends where the specified interval starts,
     * or vice versa.
     * @param interval The interval to compare this one to.
     * @return True, if adjacent.
+ */ + boolean isAdjacent( HalfOpenInterval interval ); +} diff --git a/lib/edu/mit/broad/picard/util/ListMap.java b/lib/edu/mit/broad/picard/util/ListMap.java new file mode 100644 index 0000000000..bee27cc18d --- /dev/null +++ b/lib/edu/mit/broad/picard/util/ListMap.java @@ -0,0 +1,24 @@ +package edu.mit.broad.picard.util; + +import java.util.List; +import java.util.HashMap; +import java.util.ArrayList; + +/** + * A Map class that holds a list of entries under each key instead of a single entry, and + * provides utility methods for adding an entry under a key. + * + * @author Tim Fennell + */ +public class ListMap extends HashMap> { + /** Adds a single value to the list stored under a key. */ + public void add(K key, V value) { + List values = get(key); + if (values == null) { + values = new ArrayList(); + put(key, values); + } + + values.add(value); + } +} diff --git a/lib/edu/mit/broad/picard/util/Log.java b/lib/edu/mit/broad/picard/util/Log.java new file mode 100644 index 0000000000..43a628bdb1 --- /dev/null +++ b/lib/edu/mit/broad/picard/util/Log.java @@ -0,0 +1,182 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.picard.util; + +import java.io.PrintStream; +import java.util.Arrays; + +/** + *

    A wafer thin wrapper around System.out that uses var-args to make it + * much more efficient to call the logging methods in without having to + * surround every call site with calls to Log.isXXXEnabled(). All the methods on this + * class take a variable length list of arguments and, only if logging is enabled for + * the level and channel being logged to, will those arguments be toString()'d and + * appended together.

    + * + * @author Tim Fennell + */ +public final class Log { + /** Enumeration for setting log levels. */ + public static enum LogLevel { ERROR, WARNING, INFO, DEBUG } + + private static LogLevel globalLogLevel = LogLevel.DEBUG; + + private final Class clazz; + private final String className; + private final LogLevel level = globalLogLevel; + private final PrintStream out = System.out; + + /** + * Private constructor + */ + private Log(final Class clazz) { + this.clazz = clazz; + this.className = clazz.getSimpleName(); + } + + /** + * Get a Log instance to perform logging within the Class specified. Returns an instance + * of this class which wraps an instance of the commons logging Log class. + * @param clazz the Class which is going to be doing the logging + * @return a Log instance with which to log + */ + public static Log getInstance(final Class clazz) { + return new Log(clazz); + } + + public static void setGlobalLogLevel(final LogLevel logLevel) { + globalLogLevel = logLevel; + } + + /** Returns true if the specified log level is enabled otherwise false. */ + public final boolean isEnabled(final LogLevel level) { + return level.ordinal() <= this.level.ordinal(); + } + + /** + * Private method that does the actual printing of messages to a PrintWriter. Outputs the log level, + * class name and parts followed by the stack trace if a throwable is provided. + * + * @param level the Log level being logged at + * @param throwable a Throwable if one is available otherwise null + * @param parts the parts of the message to be concatenated + */ + private final void emit(final LogLevel level, final Throwable throwable, final Object... 
parts) { + if (isEnabled(level)) { + this.out.print(level.name()); + this.out.print('\t'); + this.out.print(this.className); + this.out.print('\t'); + + for (final Object part : parts) { + if (part != null && part.getClass().isArray()) { + final Class component = part.getClass().getComponentType(); + if (component.equals(Boolean.TYPE)) this.out.print(Arrays.toString( (boolean[]) part)); + else if (component.equals(Byte.TYPE)) this.out.print(Arrays.toString( (byte[]) part)); + else if (component.equals(Character.TYPE)) this.out.print(Arrays.toString( (char[]) part)); + else if (component.equals(Double.TYPE)) this.out.print(Arrays.toString( (double[]) part)); + else if (component.equals(Float.TYPE)) this.out.print(Arrays.toString( (float[]) part)); + else if (component.equals(Integer.TYPE)) this.out.print(Arrays.toString( (int[]) part)); + else if (component.equals(Long.TYPE)) this.out.print(Arrays.toString( (long[]) part)); + else if (component.equals(Short.TYPE)) this.out.print(Arrays.toString( (short[]) part)); + else this.out.print(Arrays.toString( (Object[]) part)); + } + else { + this.out.print(part); + } + } + + this.out.println(); + + // Print out the exception if there is one + if (throwable != null) { + throwable.printStackTrace(this.out); + } + } + } + + /** + * Logs a Throwable and optional message parts at level error. + * @param throwable an instance of Throwable that should be logged with stack trace + * @param messageParts zero or more objects which should be combined, by calling toString() + * to form the log message. + */ + public final void error(final Throwable throwable, final Object... messageParts) { + emit(LogLevel.ERROR, throwable, messageParts); + } + + /** + * Logs a Throwable and optional message parts at level warn. + * @param throwable an instance of Throwable that should be logged with stack trace + * @param messageParts zero or more objects which should be combined, by calling toString() + * to form the log message. 
+ */ + public final void warn(final Throwable throwable, final Object... messageParts) { + emit(LogLevel.WARNING, throwable, messageParts); + } + + /** + * Logs a Throwable and optional message parts at level info. + * @param throwable an instance of Throwable that should be logged with stack trace + * @param messageParts zero or more objects which should be combined, by calling toString() + * to form the log message. + */ + public final void info(final Throwable throwable, final Object... messageParts) { + emit(LogLevel.INFO, throwable, messageParts); + } + + /** + * Logs a Throwable and optional message parts at level debug. + * @param throwable an instance of Throwable that should be logged with stack trace + * @param messageParts zero or more objects which should be combined, by calling toString() + * to form the log message. + */ + public final void debug(final Throwable throwable, final Object... messageParts) { + emit(LogLevel.DEBUG, throwable, messageParts); + } + + // Similar methods, but without Throwables, follow + + /** + * Logs one or more message parts at level error. + * @param messageParts one or more objects which should be combined, by calling toString() + * to form the log message. + */ + public final void error(final Object... messageParts) { + emit(LogLevel.ERROR, null, messageParts); + } + + /** + * Logs one or more message parts at level warn. + * @param messageParts one or more objects which should be combined, by calling toString() + * to form the log message. + */ + public final void warn(final Object... messageParts) { + emit(LogLevel.WARNING, null, messageParts); + } + + /** + * Logs one or more message parts at level info. + * @param messageParts one or more objects which should be combined, by calling toString() + * to form the log message. + */ + public final void info(final Object... messageParts) { + emit(LogLevel.INFO, null, messageParts); + } + + /** + * Logs one or more message parts at level debug. 
+ * @param messageParts one or more objects which should be combined, by calling toString() + * to form the log message. + */ + public final void debug(final Object... messageParts) { + emit(LogLevel.DEBUG, null, messageParts); + } +} diff --git a/lib/edu/mit/broad/picard/util/MathUtil.java b/lib/edu/mit/broad/picard/util/MathUtil.java new file mode 100644 index 0000000000..0cf6de4b7e --- /dev/null +++ b/lib/edu/mit/broad/picard/util/MathUtil.java @@ -0,0 +1,33 @@ +package edu.mit.broad.picard.util; + +/** + * General math utilities + * + * @author Tim Fennell + */ +public class MathUtil { + /** Calculated the mean of an array of doubles. */ + public static double mean(double[] in, int start, int stop) { + double total = 0; + for (int i=start; i { + private Map>> cache = new HashMap>>(); + private final int lhsBuffer; + private final int rhsBuffer; + + /** + * Constructs an overlap detector. + * @param lhsBuffer the amount by which to "trim" coordinates of mappings on the left + * hand side when calculating overlaps + * @param rhsBuffer the amount by which to "trim" coordinates of mappings on the right + * hand side when calculating overlaps + */ + public OverlapDetector(int lhsBuffer, int rhsBuffer) { + this.lhsBuffer = lhsBuffer; + this.rhsBuffer = rhsBuffer; + } + + /** Adds a mapping to the set of mappings against which to match candidates. 
*/ + public void addLhs(T object, Interval interval) { + Object seqId = interval.getSequence(); + + IntervalTree> tree = this.cache.get(seqId); + if (tree == null) { + tree = new IntervalTree>(); + this.cache.put(seqId, tree); + } + + int start = interval.getStart() + this.lhsBuffer; + int end = interval.getEnd() - this.lhsBuffer; + + Set objects = new HashSet(); + objects.add(object); + if (start <= end) // Don't put in sequences that have no overlappable bases + { + Set alreadyThere = tree.put(start, end, objects); + if (alreadyThere != null) + { + alreadyThere.add(object); + tree.put(start, end, alreadyThere); + } + } + } + + /** Adds all items to the overlap detector. */ + public void addAll(List objects, List intervals) { + if (objects.size() != intervals.size()) { + throw new IllegalArgumentException("Objects and intervals must be the same size."); + } + + for (int i=0; i getOverlaps(Interval rhs) { + Collection matches = new ArrayList(); + + Object seqId = rhs.getSequence(); + IntervalTree> tree = this.cache.get(seqId); + int start = rhs.getStart() + this.rhsBuffer; + int end = rhs.getEnd() - this.rhsBuffer; + + if (tree != null && start <= end) + { + Iterator>> it = tree.overlappers(start, end); + while (it.hasNext()) + { + IntervalTree.Node> node = it.next(); + matches.addAll(node.getValue()); + } + } + + return matches; + } + + /** Gets all the objects that could be returned by the overlap detector. 
*/ + public Collection getAll() { + Collection all = new HashSet(); + for (IntervalTree> tree : this.cache.values()) { + for (IntervalTree.Node> node : tree) { + all.addAll(node.getValue()); + } + } + + return all; + } +} diff --git a/lib/edu/mit/broad/picard/util/PasteParser.java b/lib/edu/mit/broad/picard/util/PasteParser.java new file mode 100644 index 0000000000..2b785a52f4 --- /dev/null +++ b/lib/edu/mit/broad/picard/util/PasteParser.java @@ -0,0 +1,132 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.picard.util; + +import edu.mit.broad.picard.PicardException; +import edu.mit.broad.sam.util.CloseableIterator; + +import java.util.Iterator; +import java.util.NoSuchElementException; + +/** + * Class to merge files horizontally (like the Unix paste command), so that the first line of each file + * is merged together in one big line, then the second lines, etc. + * + * @author Kathleen Tibbetts + */ +public class PasteParser implements Iterable, CloseableIterator{ + + private final CloseableIterator[] iterators; + private boolean iterating = false; + private String[][] next = null; + + /** + * Constructor + * + * @param iterators The iterators containing the files to merge together + */ + public PasteParser(CloseableIterator... iterators) { + this.iterators = iterators; + } + + /** + * Merges the "next" line from each of the underying iterators and returns an array of the results. 
+ * + * @return An array of the lines from each iterator + * @throws PicardException if the files are not exhausted at the same time + */ + protected String[][] readNextLine() { + String result[][] = new String[iterators.length][]; + boolean oneFinished = false; + boolean oneNotFinished = false; + + for (int i = 0; i < iterators.length; i++) { + if (!iterators[i].hasNext()) { + oneFinished = true; + } + else { + result[i] = iterators[i].next(); + oneNotFinished = true; + } + } + if (oneFinished) { + if (oneNotFinished) { + throw new PicardException("Mismatched file lengths in PasteParser"); + } + else { + return null; + } + } + return result; + } + + /** + * Closes the underlying iterators. + */ + public void close() { + for (CloseableIterator iterator : iterators) { + iterator.close(); + } + } + + /** + * Required method for Iterator API. + * + * @throws UnsupportedOperationException + */ + public void remove() { + throw new UnsupportedOperationException("Remove() not supported."); + } + + /** + * Returns an iterator over a set of elements of type BustardReadData. + * + * @return an iterator over a set of elements of type BustardReadData + */ + public Iterator iterator() { + if (iterating) { + throw new IllegalStateException("iterator() method can only be called once, before the" + + "first call to hasNext()"); + } + next = readNextLine(); + iterating = true; + return this; + } + + /** + * Returns true if the iteration has more elements. + * + * @return true if the iteration has more elements. Otherwise returns false. + */ + public boolean hasNext() { + if (!iterating) { + next = readNextLine(); + iterating = true; + } + return next != null; + } + + /** + * Returns the next element in the iteration. 
+ * + * @return the next element in the iteration + * @throws java.util.NoSuchElementException + */ + public String[][] next() { + + if (!hasNext()) { + throw new NoSuchElementException("Iteration has no more elements."); + } + + String[][] result = next; + next = readNextLine(); + return result; + } +} diff --git a/lib/edu/mit/broad/picard/util/PeekableIterator.java b/lib/edu/mit/broad/picard/util/PeekableIterator.java new file mode 100644 index 0000000000..eae31253df --- /dev/null +++ b/lib/edu/mit/broad/picard/util/PeekableIterator.java @@ -0,0 +1,65 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright Jan 22, 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + */ +package edu.mit.broad.picard.util; + +import edu.mit.broad.sam.util.CloseableIterator; + +/** + * Generic Closable Iterator that allows you to peek at the next value before calling next + */ +public class PeekableIterator implements CloseableIterator { + private CloseableIterator iterator; + private Object nextObject; + + /** Constructs a new iterator that wraps the supplied iterator. */ + public PeekableIterator(CloseableIterator iterator) { + this.iterator = iterator; + advance(); + } + + /** Closes the underlying iterator. */ + public void close() { + this.iterator.close(); + } + + /** True if there are more items, in which case both next() and peek() will return a value. */ + public boolean hasNext() { + return this.nextObject != null; + } + + /** Returns the next object and advances the iterator. */ + public Object next() { + Object retval = this.nextObject; + advance(); + return retval; + } + + /** + * Returns the next object but does not advance the iterator. 
Subsequent calls to peek() + * and next() will return the same object. + */ + public Object peek(){ + return this.nextObject; + } + + private void advance(){ + if (this.iterator.hasNext()) { + this.nextObject = iterator.next(); + } + else { + this.nextObject = null; + } + } + + /** Unsupported Operation. */ + public void remove() { + throw new UnsupportedOperationException("Not supported: remove"); + } +} diff --git a/lib/edu/mit/broad/picard/util/ProcessExecutor.java b/lib/edu/mit/broad/picard/util/ProcessExecutor.java new file mode 100644 index 0000000000..6655e37cd2 --- /dev/null +++ b/lib/edu/mit/broad/picard/util/ProcessExecutor.java @@ -0,0 +1,121 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ + +package edu.mit.broad.picard.util; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.ThreadFactory; + +import edu.mit.broad.picard.PicardException; + +/** + * Utility class that will execute sub processes via Runtime.getRuntime().exec(...) and read + * off the output from stderr and stdout of the sub process. This implementation uses a different + * thread to read each stream: the current thread for stdout and another, internal thread for + * stderr. This utility is able to handle concurrent executions, spawning as many threads as + * are required to handle the concurrent load. 
+ * + * @author Doug Voet + */ +public class ProcessExecutor { + private static Log log = Log.getInstance(ProcessExecutor.class); + private static ExecutorService executorService = Executors.newCachedThreadPool(new ThreadFactory() { + @Override + public Thread newThread(Runnable r) { + return new Thread(r, "ProcessExecutor Thread"); + } + }); + + /** + * Executes the command via Runtime.getRuntime().exec() then writes stderr to log.error + * and stdout to log.info and blocks until the command is complete. + * + * @see Runtime#exec(String) + * + * @param command command string + * @return return code of command + */ + public static int execute(String command) { + try { + Process process = Runtime.getRuntime().exec(command); + return readStreamsAndWaitFor(process); + } catch (Throwable t) { + throw new PicardException("Unexpected exception executing [" + StringUtil.join(" ", command) + "]", t); + } + } + + /** + * Executes the command via Runtime.getRuntime().exec() then writes stderr to log.error + * and stdout to log.info and blocks until the command is complete. + * + * @see Runtime#exec(String[]) + * + * @param commandParts command string + * @return return code of command + */ + public static int execute(String[] commandParts) { + try { + Process process = Runtime.getRuntime().exec(commandParts); + return readStreamsAndWaitFor(process); + } catch (Throwable t) { + throw new PicardException("Unexpected exception executing [" + StringUtil.join(" ", commandParts) + "]", t); + } + } + + private static int readStreamsAndWaitFor(Process process) + throws InterruptedException, ExecutionException { + Future stderrReader = executorService.submit(new LogErrorProcessOutputReader(process.getErrorStream())); + new LogInfoProcessOutputReader(process.getInputStream()).run(); + // wait for stderr reader to be done + stderrReader.get(); + return process.waitFor(); + } + + /** + * Runnable that reads off the given stream and logs it somewhere. 
+ */ + private static abstract class ProcessOutputReader implements Runnable { + private BufferedReader reader; + public ProcessOutputReader(InputStream stream) { + reader = new BufferedReader(new InputStreamReader(stream)); + } + + @Override + public void run() { + try { + String line; + while ((line = reader.readLine()) != null) { + log(line); + } + } catch (IOException e) { + throw new PicardException("Unexpected exception reading from process stream", e); + } + } + + protected abstract void log(String message); + } + + private static class LogErrorProcessOutputReader extends ProcessOutputReader { + public LogErrorProcessOutputReader(InputStream stream) { super(stream); } + @Override protected void log(String message) { log.error(message); } + } + + private static class LogInfoProcessOutputReader extends ProcessOutputReader { + public LogInfoProcessOutputReader(InputStream stream) { super(stream); } + @Override protected void log(String message) { log.info(message); } + } +} diff --git a/lib/edu/mit/broad/picard/util/RExecutor.java b/lib/edu/mit/broad/picard/util/RExecutor.java new file mode 100644 index 0000000000..7faa23a9c6 --- /dev/null +++ b/lib/edu/mit/broad/picard/util/RExecutor.java @@ -0,0 +1,93 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + */ + +package edu.mit.broad.picard.util; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +import edu.mit.broad.picard.PicardException; +import edu.mit.broad.picard.io.IoUtil; + +/** + * Util class for executing R scripts. 
+ * + * @author Doug Voet + */ +public class RExecutor { + private static final String R_EXE = "Rscript"; + + /** + * Executes the given R script that is stored in a file on the classpath. The script file + * is read from the classpath and written to a temp file then executed by a call to Rscript. + * Blocks until the R script is complete. + * + * @param rScriptName the fully qualified name of the classpath resource of the script + * @param arguments any arguments required by the script + * @return the return code of the R process + */ + public static int executeFromClasspath(String rScriptName, String... arguments) { + File scriptFile = writeScriptFile(rScriptName); + int returnCode = executeFromFile(scriptFile, arguments); + scriptFile.delete(); + return returnCode; + } + + /** + * Executes the given R script that is stored in a file by a call to Rscript. + * Blocks until the R script is complete. + * + * @param scriptFile the file object for the script + * @param arguments any arguments required by the script + * @return the return code of the R process + */ + public static int executeFromFile(File scriptFile, String... arguments) { + String[] command = new String[arguments.length + 2]; + command[0] = R_EXE; + command[1] = scriptFile.getAbsolutePath(); + System.arraycopy(arguments, 0, command, 2, arguments.length); + return ProcessExecutor.execute(command); + } + + /** + * Writes the classpath resource named by rScriptName to the temp dir. 
+ */ + private static File writeScriptFile(String rScriptName) { + InputStream scriptStream = null; + OutputStream scriptFileStream = null; + try { + scriptStream = RExecutor.class.getClassLoader().getResourceAsStream(rScriptName); + if (scriptStream == null) { + throw new IllegalArgumentException("Script [" + rScriptName + "] not found in classpath"); + } + File scriptFile = File.createTempFile("script", ".R"); + scriptFileStream = IoUtil.openFileForWriting(scriptFile); + IoUtil.copyStream(scriptStream, scriptFileStream); + return scriptFile; + } catch (IOException e) { + throw new PicardException("Unexpected exception creating R script file", e); + } finally { + if (scriptStream != null) { + try { + scriptStream.close(); + } catch (IOException e) { + } + } + if (scriptFileStream != null) { + try { + scriptFileStream.close(); + } catch (IOException e) { + } + } + } + } +} diff --git a/lib/edu/mit/broad/picard/util/SamPairUtil.java b/lib/edu/mit/broad/picard/util/SamPairUtil.java new file mode 100644 index 0000000000..4d78019dbc --- /dev/null +++ b/lib/edu/mit/broad/picard/util/SamPairUtil.java @@ -0,0 +1,74 @@ +package edu.mit.broad.picard.util; + +import edu.mit.broad.sam.SAMRecord; + +/** + * Utility mthods for pairs of SAMRecords + */ +public class SamPairUtil { + + // TODO: KT and TF say this is more complicated than what I have here + public static boolean isProperPair(final SAMRecord firstEnd, final SAMRecord secondEnd, boolean jumpingLibrary) { + if (firstEnd.getReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) { + return false; + } + if (!firstEnd.getReferenceName().equals(secondEnd.getReferenceName())) { + return false; + } + if (firstEnd.getReadNegativeStrandFlag() == secondEnd.getReadNegativeStrandFlag()) { + return false; + } + final SAMRecord positiveEnd; + final SAMRecord negativeEnd; + if (firstEnd.getReadNegativeStrandFlag()) { + positiveEnd = secondEnd; + negativeEnd = firstEnd; + } else { + positiveEnd = firstEnd; + negativeEnd = 
    /**
     * Computes the insert size as the signed distance between the 5'-most positions of
     * the two ends: (second end 5' position) - (first end 5' position). The 5' position
     * of a negative-strand read is its alignment end.
     *
     * @return the signed insert size, or 0 when the first end is unaligned or the two
     *         ends map to different reference sequences
     */
    public static int computeInsertSize(final SAMRecord firstEnd, final SAMRecord secondEnd) {
        if (firstEnd.getReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) {
            return 0;
        }
        if (!firstEnd.getReferenceName().equals(secondEnd.getReferenceName())) {
            return 0;
        }
        int firstEnd5PrimePosition = firstEnd.getReadNegativeStrandFlag()? firstEnd.getAlignmentEnd(): firstEnd.getAlignmentStart();
        int secondEnd5PrimePosition = secondEnd.getReadNegativeStrandFlag()? secondEnd.getAlignmentEnd(): secondEnd.getAlignmentStart();
        return secondEnd5PrimePosition - firstEnd5PrimePosition;
    }

    /**
     * Write the mate info for two SAMRecords: each record receives the other's reference
     * name, alignment start and strand, or the NO_ALIGNMENT sentinels when the mate is
     * recorded as unmapped.
     *
     * NOTE(review): both branches test the record's own mate-unmapped flag
     * (samRecord.getMateUnmappedFlag() / mate.getMateUnmappedFlag()) rather than the other
     * record's read-unmapped flag, i.e. this method assumes the mate-unmapped flags were
     * already set correctly by the caller before this is invoked — confirm against callers.
     */
    public static void setMateInfo(final SAMRecord samRecord, final SAMRecord mate) {
        if (!samRecord.getMateUnmappedFlag()) {
            samRecord.setMateReferenceName(mate.getReferenceName());
            samRecord.setMateAlignmentStart(mate.getAlignmentStart());
            samRecord.setMateNegativeStrandFlag(mate.getReadNegativeStrandFlag());
        } else {
            samRecord.setMateReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
            samRecord.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
            samRecord.setMateUnmappedFlag(true);
        }
        if (!mate.getMateUnmappedFlag()) {
            mate.setMateReferenceName(samRecord.getReferenceName());
            mate.setMateAlignmentStart(samRecord.getAlignmentStart());
            mate.setMateNegativeStrandFlag(samRecord.getReadNegativeStrandFlag());
        } else {
            mate.setMateReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
            mate.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
            mate.setMateUnmappedFlag(true);
        }
    }


}
100644 index 0000000000..d0a7937f8b --- /dev/null +++ b/lib/edu/mit/broad/picard/util/SequenceUtil.java @@ -0,0 +1,62 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.picard.util; + +public class SequenceUtil { + /** + * Calculate the reverse complement of the specified sequence + * (Stolen from Reseq) + * + * @param sequenceData + * @return reverse complement + */ + public static String reverseComplement(String sequenceData) { + + final char[] original = sequenceData.toCharArray(); + final char[] complement = new char[original.length]; + + for (int i=0, j=complement.length-1; i 90) lhs -= 32; + if (rhs > 90) rhs -= 32; + } + + return lhs == rhs; + } + + /** + * returns true if the value of base represents a no call + */ + public static boolean isNoCall(byte base) { + return base == 'N' || base == 'n' || base == '.'; + } + +} diff --git a/lib/edu/mit/broad/picard/util/StringSortingCollectionFactory.java b/lib/edu/mit/broad/picard/util/StringSortingCollectionFactory.java new file mode 100644 index 0000000000..fbc4798b92 --- /dev/null +++ b/lib/edu/mit/broad/picard/util/StringSortingCollectionFactory.java @@ -0,0 +1,121 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ +package edu.mit.broad.picard.util; + +import edu.mit.broad.sam.util.SortingCollection; +import edu.mit.broad.sam.util.RuntimeIOException; + +import java.util.Comparator; +import java.nio.ByteBuffer; +import java.io.OutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.File; + +/** + * Factory to create new String SortingCollections + * + * @author Kathleen Tibbetts + */ +public class StringSortingCollectionFactory { + + private static final File TEMP_DIR = new File(System.getProperty("java.io.tmpdir"), "StringSortingCollectionFactory"); + private static final int MAX_RECORDS_IN_RAM = 20000; + + private StringSortingCollectionFactory() { + } + + public static SortingCollection newCollection() { + return SortingCollection.newInstance( + String.class, new StringCodec(), new StringComparator(), MAX_RECORDS_IN_RAM, TEMP_DIR); + } + + static class StringCodec implements SortingCollection.Codec { + ByteBuffer byteBuffer = ByteBuffer.allocate(4); + OutputStream os; + InputStream is; + + /** Returns a new StringCodec. */ + public SortingCollection.Codec clone() { + return new StringCodec(); + } + + /** + * Where to write encoded output + * + * @param os the output stream to encode output + */ + public void setOutputStream(final OutputStream os) { + this.os = os; + } + + /** + * Where to read encoded input from + * + * @param is where to read encoded input from + */ + public void setInputStream(final InputStream is) { + this.is = is; + } + + /** + * Write object to file + * + * @param val what to write + */ + public void encode(final String val) { + try { + byteBuffer.clear(); + byteBuffer.putInt(val.length()); + os.write(byteBuffer.array()); + os.write(val.getBytes()); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + /** + * Read the next record from the input stream and convert into a java object. + * + * @return null if no more records. 
Should throw exception if EOF is encountered in the middle of + * a record. + */ + public String decode() { + try { + byteBuffer.clear(); + int bytesRead = is.read(byteBuffer.array()); + if (bytesRead == -1) { + return null; + } + if (bytesRead != 4) { + throw new RuntimeException("Unexpected EOF in middle of record"); + } + byteBuffer.limit(4); + final int length = byteBuffer.getInt(); + final byte[] buf = new byte[length]; + bytesRead = is.read(buf); + if (bytesRead != length) { + throw new RuntimeException("Unexpected EOF in middle of record"); + } + return new String(buf); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + } + + static class StringComparator implements Comparator { + + public int compare(final String s, final String s1) { + return s.compareTo(s1); + } + } + +} diff --git a/lib/edu/mit/broad/picard/util/StringUtil.java b/lib/edu/mit/broad/picard/util/StringUtil.java new file mode 100644 index 0000000000..2cf15de820 --- /dev/null +++ b/lib/edu/mit/broad/picard/util/StringUtil.java @@ -0,0 +1,108 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.picard.util; + +/** + * Utilities that are useful when dealing with Strings. + * + * @author Tim Fennell + */ +public class StringUtil { + /** + * Return input string with newlines inserted to ensure that all lines + * have length <= maxLineLength. if a word is too long, it is simply broken + * at maxLineLength. Does not handle tabs intelligently (due to implementer laziness). 
    /**
     * Return input string with newlines inserted to ensure that all lines
     * have length <= maxLineLength. If a word is too long, it is simply broken
     * at maxLineLength. Does not handle tabs intelligently (due to implementer laziness).
     *
     * NOTE(review): String.split("\n") discards trailing empty strings, so a run of
     * trailing newlines in the input collapses to the single "\n" appended below —
     * confirm callers don't rely on preserving multiple trailing newlines.
     *
     * @param s the text to wrap; may contain embedded newlines, each line is wrapped independently
     * @param maxLineLength maximum length of any output line
     * @return the wrapped text
     */
    public static String wordWrap(String s, int maxLineLength) {
        String[] lines = s.split("\n");
        StringBuilder sb = new StringBuilder();
        for (String line: lines) {
            if (sb.length() > 0) {
                sb.append("\n");
            }
            sb.append(wordWrapSingleLine(line, maxLineLength));
        }
        // split() drops the trailing separator, so restore a final newline if the input had one.
        if (s.endsWith("\n")) {
            sb.append("\n");
        }
        return sb.toString();
    }

    /**
     * Wraps a single line (no embedded newlines) so that each output line is at most
     * maxLineLength characters, breaking at whitespace when possible and mid-word otherwise.
     *
     * @param s a single line of text
     * @param maxLineLength maximum length of any output line
     * @return the wrapped line
     */
    public static String wordWrapSingleLine(String s, int maxLineLength) {
        if (s.length() <= maxLineLength) {
            return s;
        }
        StringBuilder sb = new StringBuilder();
        int startCopyFrom = 0;
        while (startCopyFrom < s.length()) {
            // lastSpaceIndex tracks the last whitespace seen inside the current window;
            // equal to startCopyFrom means no break point was found.
            int lastSpaceIndex = startCopyFrom;
            int i;
            // Find break point (if it exists)
            for (i = startCopyFrom; i < s.length() && i - startCopyFrom < maxLineLength; ++i) {
                if (Character.isWhitespace(s.charAt(i))) {
                    lastSpaceIndex = i;
                }
            }
            // The remainder fits entirely: copy it all.
            if (i - startCopyFrom < maxLineLength) {
                lastSpaceIndex = i;
            }
            // Include any trailing whitespace
            for (; lastSpaceIndex < s.length() && Character.isWhitespace(s.charAt(lastSpaceIndex)); ++lastSpaceIndex) {}
            if (sb.length() > 0) {
                sb.append("\n");
            }
            // Handle situation in which there is no word break. Just break the word in the middle.
            if (lastSpaceIndex == startCopyFrom) {
                lastSpaceIndex = i;
            }
            sb.append(s.substring(startCopyFrom, lastSpaceIndex));
            startCopyFrom = lastSpaceIndex;
        }
        return sb.toString();
    }

    /**
     * Joins the given strings with the separator between each adjacent pair.
     *
     * @param separator String to interject between each string in strings arg
     * @param strings List of strings to be joined.
     * @return String that concatenates each item of strings arg, with separator btw each of them;
     *         the empty string when no strings are supplied
     */
    public static String join(String separator, String... strings) {
        if (strings.length == 0) {
            return "";
        }
        StringBuilder ret = new StringBuilder(strings[0]);
        for (int i = 1; i < strings.length; ++i) {
            ret.append(separator);
            ret.append(strings[i]);
        }
        return ret.toString();
    }
+ * + * @param s the String to check + * @param chars the characters to check for + * @return String the input String for convenience + * @throws IllegalArgumentException if the String contains one or more of the characters + */ + public static String assertCharactersNotInString(final String s, final char... chars) { + for (char ch : s.toCharArray()) { + for (int i=0; ib is a delimiter; otherwise false + */ + protected boolean isDelimiter(byte b) { + return b == '\t'; + } +} diff --git a/lib/edu/mit/broad/picard/variation/DbSnpFileGenerator.java b/lib/edu/mit/broad/picard/variation/DbSnpFileGenerator.java new file mode 100644 index 0000000000..5f44c972a2 --- /dev/null +++ b/lib/edu/mit/broad/picard/variation/DbSnpFileGenerator.java @@ -0,0 +1,172 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.picard.variation; + +import java.io.*; +import java.util.*; +import edu.mit.broad.sam.SAMSequenceRecord; +import edu.mit.broad.sam.SAMFileReader; +import edu.mit.broad.sam.util.BinaryCodec; +import edu.mit.broad.picard.io.IoUtil; +import edu.mit.broad.picard.util.TabbedTextFileParser; +import edu.mit.broad.picard.util.Log; + +/** + * Generates a binary version of the data for all dbSnps from a UCSU snp###.txt file. Files with SNP data + * can be downloaded here: http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/. See KnownVariantCodec.java + * for binary file format. + */ +public class DbSnpFileGenerator { + // Codes from the DbSnp file that we will handle. All others are ignored. + // Package visibility for testing purposes. 
+ static final String snp = "single"; // code in DbSnp file for a SNP + static final String insertion = "insertion"; // code in DbSnp file for an insertion + static final String deletion = "deletion"; // code in DbSnp file for a deletion + static final String indel = "in-del"; // code in DbSnp file for an insertion/deletion + + private File snpFile; + private File seqDictionaryFile; + private Map sequenceToIndex = new HashMap(); + private List dictionary; + private BinaryCodec codec; + private KnownVariantCodec kvCodec = new KnownVariantCodec(); + private Map> sequenceToSnps; + + private final Log log = Log.getInstance(DbSnpFileGenerator.class); + + /** + * Protected constructor so we can use a temporary file during testing + * @param snpFile The UCSC dbSnp file + * @param seqDictionaryFile The Sequence Dictionary + * @param tempOutputFile The binary file to write to + */ + DbSnpFileGenerator(File snpFile, File seqDictionaryFile, File tempOutputFile) { + this.snpFile = snpFile; + this.seqDictionaryFile = seqDictionaryFile; + this.codec = new BinaryCodec(new DataOutputStream(IoUtil.openFileForWriting(tempOutputFile))); + } + + /** + * Writes the full binary dbSnp file and calls close on the BinaryCodec. + */ + public void writeDbSnpFile() { + kvCodec.encode(KnownVariantCodec.MAGIC_NUMBER, codec); + writeReferenceSequences(); + writeDbSnpRecords(); + codec.close(); + } + + /** + * Writes the number of reference sequences and then the sequences themselves + */ + private void writeReferenceSequences() { + SAMFileReader sam = new SAMFileReader(this.seqDictionaryFile); + this.dictionary = sam.getFileHeader().getSequences(); + kvCodec.encode(this.dictionary, codec); + } + + /** + * Writes all the dbSnp records to the file in the order of the reference sequences + * in the sequence dictionary file. 
+ */ + private void writeDbSnpRecords() { + sequenceToSnps = new HashMap>(); + int count = 0; + + TabbedTextFileParser parser = new TabbedTextFileParser(true, snpFile); + while(parser.hasNext()) { + String parts[] = parser.next(); + String sequence = parts[1]; + + // If we don't have this sequence in our dictionary, ignore it + if (!getSequenceToIndex().containsKey(sequence)) { + continue; + } + + int start = Integer.parseInt(parts[2]) + 1; // We go from a zero-based to a 1-based system. + int end = Integer.parseInt(parts[3]); + + String var = parts[11]; + + // We only care about SNPs, insertions, and deletions; otherwise skip it + VariantType type = null; + if (var.equals(snp)) { + type = VariantType.SNP; + end = start; // For SNPs, we mark the start and end as the same location + } + // For insertions and deletions, we mark the base on either side of the affected reference sequence + else if (var.equals(insertion)) { + type = VariantType.insertion; + end = start + 1; // Insertions are always length 1 + } + else if (var.equals(deletion)) { + type = VariantType.deletion; + start = start - 1; + end++; + } + else if (var.equals(indel)) { // For indels, we do one each of an insertion (here) and a deletion (below) + type = VariantType.insertion; + start = start - 1; + end = start + 1; + } + else { + continue; + } + + if (!sequenceToSnps.containsKey(sequence)) { + sequenceToSnps.put(sequence, new TreeSet()); + } + SortedSet sequenceVars = sequenceToSnps.get(sequence); + + boolean validated = !parts[12].equals("unknown"); + String name = parts[4]; + + sequenceVars.add(new KnownVariant(name, getSequenceToIndex().get(sequence), start, end, type, validated)); + count++; + + // If it's an in-del, we add it as a deletion (in addition to the insertion we also added) so we + // will have two records in our binary format for the one record in the text file + if (var.equals(indel)) { + sequenceVars.add(new KnownVariant(name, getSequenceToIndex().get(sequence), start, + 
Integer.parseInt(parts[3])+1, VariantType.deletion, validated)); + count++; + } + } + + codec.writeInt(count); + // Loop through the sequences from the sequence dictionary in order + for (int i = 0; i < dictionary.size(); i++) { + // And write their known variants in order + if (sequenceToSnps.containsKey(dictionary.get(i).getSequenceName())) { + for (Iterator it = sequenceToSnps.get(dictionary.get(i).getSequenceName()).iterator(); + it.hasNext();) { + kvCodec.encode(it.next(), codec); + } + } + } + log.info("Wrote " + count + " dbSnp records."); + } + + /** + * Returns the map of sequences to their index in the reference dictionary, + * creating it if it does not already exist + * + * @return the map of sequences to their index in the reference dictionary + */ + private Map getSequenceToIndex() { + if (sequenceToIndex.keySet().size() == 0) { + for (int i = 0; i < dictionary.size(); i++) { + sequenceToIndex.put(dictionary.get(i).getSequenceName(), i); + } + } + return sequenceToIndex; + } + +} \ No newline at end of file diff --git a/lib/edu/mit/broad/picard/variation/DbSnpFileReader.java b/lib/edu/mit/broad/picard/variation/DbSnpFileReader.java new file mode 100644 index 0000000000..dbee370d29 --- /dev/null +++ b/lib/edu/mit/broad/picard/variation/DbSnpFileReader.java @@ -0,0 +1,149 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2008 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ +package edu.mit.broad.picard.variation; + +import java.io.*; +import java.util.*; +import edu.mit.broad.sam.SAMSequenceRecord; +import edu.mit.broad.sam.util.BinaryCodec; +import edu.mit.broad.picard.PicardException; +import edu.mit.broad.picard.io.IoUtil; + +/** + * Reader for DbSnp binary files. See DbSnpFileGenerator for file format. + */ +public class DbSnpFileReader implements KnownVariantIterator +{ + private BinaryCodec codec = null; + private KnownVariantCodec kvCodec = new KnownVariantCodec(); + List dictionary; + private Map refIndexToName = new HashMap(); + private KnownVariant next = null; + private int dbSnpCount = -1; + + /** + * Constructor + * + * @param dbSnpFile The binary dbSnp file to read + */ + public DbSnpFileReader(File dbSnpFile) + { + codec = new BinaryCodec(new DataInputStream(IoUtil.openFileForReading(dbSnpFile))); + readHeader(); + next = readNextDbSnp(); + } + + /** + * Returns an iterator over a set of elements of type KnownVariant. + * + * @return an Iterator + */ + public Iterator iterator() + { + return this; + } + + /** + * Returns true if the iteration has more elements. + * + * @return true if the iterator has more elements. + */ + public boolean hasNext() + { + return next != null; + } + + /** + * Returns the next element in the iteration. + * + * @return the next KnownVariant in the iteratoion + */ + public KnownVariant next() + { + if (!hasNext()) throw new NoSuchElementException(); + KnownVariant result = next; + next = readNextDbSnp(); + return result; + } + + /** Allows peeking at the next value without advaning the iterator. */ + public KnownVariant peek() { + return this.next; + } + + /** + * Not supported. 
+ * + * @throws UnsupportedOperationException + */ + public void remove() + { + throw new UnsupportedOperationException("Remove() not supported."); + } + + /** + * Closes the underlying stream, via the BinaryCodec's close() method + */ + public void close() + { + codec.close(); + } + + /** + * Reads the header data from the binary file, validates the version, and populates refIndexToName + * + * @throws IOException + */ + private void readHeader() + { + // Verify that we are using the correct version + String ver = kvCodec.decodeMagicNumber(codec); + if (!ver.equals(KnownVariantCodec.MAGIC_NUMBER)) + { + throw new RuntimeException("Unsupported dbSnp file version: " + ver); + } + + // Read the number of reference sequences and then the sequences themselves + dictionary = kvCodec.decodeSequenceDictionary(codec); + for (int i = 0; i < dictionary.size(); i++) + { + refIndexToName.put(i, dictionary.get(i)); + } + + dbSnpCount = codec.readInt(); + } + + /** + * Reads the next dbSnp record from the binary file + * + * @return the populated KnownVariant object + */ + private KnownVariant readNextDbSnp() { + KnownVariant kv = kvCodec.decodeKnownVariant(codec); + if (kv != null) { + kv.setRefrenceSequence(refIndexToName.get(kv.getSequenceIndex()).getSequenceName()); + } + return kv; + } + + /** + * Returns the SequenceDictionary for this file in SAM format + * + * @return an ordered List of SAMSequenceRecords + */ + public List getSequenceDictionary() { return dictionary; } + + /** + * Returns the total number of dbSnp records encoded in the file + * + * @return total dbSnps encoded in the file + */ + public int getCountDbSnpRecords() { return dbSnpCount; } +} \ No newline at end of file diff --git a/lib/edu/mit/broad/picard/variation/GenerateDbSnpFile.java b/lib/edu/mit/broad/picard/variation/GenerateDbSnpFile.java new file mode 100644 index 0000000000..65c8570b6e --- /dev/null +++ b/lib/edu/mit/broad/picard/variation/GenerateDbSnpFile.java @@ -0,0 +1,51 @@ +/* +* The Broad 
/**
 * CommandLineProgram that invokes DbSnpFileGenerator to convert a UCSC dbSnp text file
 * into the binary KnownVariant format.
 *
 * @author Kathleen Tibbetts
 */
public class GenerateDbSnpFile extends CommandLineProgram
{
    // The following attributes define the command-line arguments, bound by the
    // CommandLineProgram framework.
    @Usage(programVersion="1.0")
    public String USAGE =
        "Usage: " + getClass().getName() + " [options]\n\n" +
        "Generate a KnownVariant binary file from a UCSC DbSnp text file.\n";

    @Option(shortName = "S", doc = "UCSC SNP file. ")
    public File SNP_FILE;

    @Option(shortName = "D", doc = "Sequence Dictionary for the genome in SAM or BAM format. ")
    public File SEQUENCE_DICTIONARY;

    @Option(shortName = "O", doc = "The binary output file. ")
    public File OUTPUT;

    /** Runs the generator against the supplied files; returns 0 on success. */
    @Override
    protected int doWork() {
        DbSnpFileGenerator generator = new DbSnpFileGenerator(SNP_FILE, SEQUENCE_DICTIONARY, OUTPUT);
        generator.writeDbSnpFile();
        return 0;
    }

    public static void main(String[] argv) {
        System.exit(new GenerateDbSnpFile().instanceMain(argv));
    }

}
+ * + * IMPORTANT! Regardless of the coordinate system of the data from which it is drawn, the data + * in this class should be 1-based. Start and end coordinates should be as follows: + * For SNPs, start and end should be the same base. + * For insertions and deletions, the base on either side of the affected reference sequence + * will be the start and end. For insertions, this means they will always be 1 base apart. + */ +public class KnownVariant implements Comparable +{ + private final String name; + private final int sequenceIndex; + private final int startPos; + private final int endPos; + private final VariantType type; + private final boolean validated; + private transient String referenceSequence; + + /** + * Constructor + * + * @param name + * @param sequenceIndex + * @param startPos + * @param endPos + * @param type + * @param validated + */ + public KnownVariant(String name, int sequenceIndex, int startPos, int endPos, + VariantType type, boolean validated) + { + this.name = name; + this.sequenceIndex = sequenceIndex; + this.startPos = startPos; + this.endPos = endPos; + this.type = type; + this.validated = validated; + } + + /** + * Compares this object with the specified object for order. Returns a negative integer, zero, or a positive + * integer as this object is less than, equal to, or greater than the specified object. + * + * @param that The KnownVariant to compare + * @return a negative integer, zero, or a positive integer as this object is less than, equal to, + * or greater than the specified object + */ + public int compareTo(KnownVariant that) + { + if (this.getSequenceIndex() != that.getSequenceIndex()) + { + return (this.getSequenceIndex() > that.getSequenceIndex()) ? 1 : -1; + } + else if (this.getStartPos() != that.getStartPos()) + { + return (this.getStartPos() > that.getStartPos()) ? 1 : -1; + } + else if (this.getEndPos() != that.getEndPos()) + { + return (this.getEndPos() > that.getEndPos()) ? 
1 : -1; + } + else if (!this.getName().equals(that.getName())) + { + return this.getName().compareTo(that.getName()); + } + else if (this.getType() != that.getType()) + { + return this.getType().compareTo(that.getType()); + } + else if (this.isValidated() != that.isValidated()) + { + return this.isValidated() ? 1 : -1; + } + return 0; + } + + public boolean equals(Object o) + { + if (!(o instanceof KnownVariant)) { + return false; + } + KnownVariant that = (KnownVariant)o; + return (this.name.equals(that.name) && + this.sequenceIndex == that.sequenceIndex && + this.startPos == that.startPos && + this.endPos == that.endPos && + this.type == that.type && + this.validated == that.validated); + } + + public int hasCode() + { + int result = 17; + result = 37*result + name.hashCode(); + result = 37*result + sequenceIndex; + result = 37*result + startPos; + result = 37*result + endPos; + result = 37*result + type.hashCode(); + result = 37*result + (validated ? 1 : 0); + return result; + } + + public String getName() { return name; } + public int getSequenceIndex() { return sequenceIndex; } + public String getRefrenceSequence() { return referenceSequence; } + public void setRefrenceSequence(String referenceSequence) { this.referenceSequence = referenceSequence; } + public int getStartPos() { return startPos; } + public int getEndPos() { return endPos; } + public VariantType getType() { return type; } + public boolean isValidated() { return validated; } + +} diff --git a/lib/edu/mit/broad/picard/variation/KnownVariantCodec.java b/lib/edu/mit/broad/picard/variation/KnownVariantCodec.java new file mode 100644 index 0000000000..2258e756cb --- /dev/null +++ b/lib/edu/mit/broad/picard/variation/KnownVariantCodec.java @@ -0,0 +1,179 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. 
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.variation;

import edu.mit.broad.sam.SAMSequenceRecord;
import edu.mit.broad.sam.util.BinaryCodec;
import edu.mit.broad.sam.util.RuntimeEOFException;

import java.util.ArrayList;
import java.util.List;

/**
 * Class for encoding and decoding binary data about KnownVariants
 *
 * IMPORTANT! This class assumes that a KnownVariant instance is 1-based and end-inclusive
 * and that the binary format is 0-based and end-exclusive.
 *
 * The format for the binary dbSnp file is as follows:
 *
 * Field        Description                                             Type        Value
 * -----        -----------                                             ----        -----
 * magic        Known variant magic number                              char[4]     DBS\1
 * n_ref        # reference sequences                                   int32
 *
 *     -- List of references information (n = n_ref)
 *     l_name   length of the reference name plus 1 (including NULL)    int32
 *     name     Name; NULL terminated                                   char[l_name]
 *     t_ref    Length of the reference sequence                        int32
 *
 * n_snps       # of Known Variant records                              int32
 *
 *     -- List of DBSnps
 *     block_size   Length of the remainder of the block
 *     rID          Reference sequence ID (-1 <= rId <= n_ref)          int32
 *     pos          0-based leftmost coordinate                         int32
 *     snp_len      Length of the dbSnp                                 int32
 *     type         type of SNP                                         int8        0 = deletion
 *                                                                                  1 = het
 *                                                                                  2 = in-del
 *                                                                                  3 = insertion
 *                                                                                  4 = microsatellite
 *                                                                                  5 = mixed
 *                                                                                  6 = mnp
 *                                                                                  7 = named
 *                                                                                  8 = single
 *                                                                                  9 = unknown
 *     validated    whether the SNP has been validated                  int8        1 | 0
 *     name         name of the dbSnp; NULL terminated                  char[block_size-15]
 *
 * NOTE(review): the type table above lists ten codes, but what is actually written is
 * VariantType.ordinal(), and VariantType currently declares only three constants
 * (SNP, insertion, deletion) -- confirm which mapping is authoritative.
 *
 * @author Kathleen Tibbetts
 **/
public class KnownVariantCodec
{
    public static final String MAGIC_NUMBER = "DBS\1";
    // rID(4) + pos(4) + snp_len(4) + type(1) + validated(1) + trailing NULL(1) = 15
    private static final int KV_RECORD_LENGTH_LESS_NAME = 15;

    /**
     * Reads data about a known variant from the BinaryCodec and instantiates a KnownVariant
     * object with those values
     *
     * @param codec The BinaryCodec from which to read
     * @return a populated KnownVariant object, or null at end of stream
     */
    public KnownVariant decodeKnownVariant(BinaryCodec codec)
    {
        int blockSize;
        try {
            blockSize = codec.readInt();
        }
        catch (RuntimeEOFException e) {
            // A clean EOF at a block boundary signals the end of the record list.
            return null;
        }
        int seqIndex = codec.readInt();
        int startPos = codec.readInt() + 1;     // Switch to 1-based
        int endPos = codec.readInt();
        byte[] buffer = new byte[1];
        codec.readBytes(buffer);
        VariantType type = VariantType.getVariantTypeFromOrdinal((int) buffer[0]);
        codec.readBytes(buffer);
        boolean validated = ((int) buffer[0]) == 1;
        String name = codec.readString(blockSize - KV_RECORD_LENGTH_LESS_NAME);
        codec.readBytes(buffer);                // Skip the null terminator
        return new KnownVariant(name, seqIndex, startPos, endPos, type, validated);
    }

    /**
     * Writes data from a KnownVariant in the expected format to the BinaryCodec
     *
     * @param variant The KnownVariant to encode
     * @param codec The BinaryCodec to which to write
     */
    public void encode(KnownVariant variant, BinaryCodec codec)
    {
        codec.writeInt(variant.getName().length() + KV_RECORD_LENGTH_LESS_NAME); // Length of the rest of the block
        codec.writeInt(variant.getSequenceIndex());     // Index of the reference sequence
        codec.writeInt(variant.getStartPos() - 1);      // Switch to 0-based leftmost coordinate
        codec.writeInt(variant.getEndPos());            // end position, exclusive
        byte b[] = new byte[1];
        b[0] = (byte)variant.getType().ordinal();       // Type
        codec.writeBytes(b);
        b[0] = (byte)(variant.isValidated() ? 1 : 0);   // Validated
        codec.writeBytes(b);
        codec.writeString(variant.getName(), false, true); // The null-terminated name
    }

    /**
     * Reads data about the Sequence Dictionary from the BinaryCodec and instantiates a List of
     * SAMSequenceRecords with those values
     *
     * @param codec The BinaryCodec from which to read
     * @return a populated List of SAMSequenceRecords
     */
    public List<SAMSequenceRecord> decodeSequenceDictionary(BinaryCodec codec)
    {
        int total = codec.readInt();
        List<SAMSequenceRecord> dictionary = new ArrayList<SAMSequenceRecord>(total);
        for (int i = 0; i < total; i++)
        {
            int len = codec.readInt();
            // Read the name, leaving off and then skipping the null terminator
            String name = codec.readString(len - 1);
            byte[] buffer = new byte[1];
            codec.readBytes(buffer);
            int seqLength = codec.readInt();
            SAMSequenceRecord rec = new SAMSequenceRecord(name);
            rec.setSequenceLength(seqLength);
            dictionary.add(rec);
        }
        return dictionary;
    }

    /**
     * Writes a Sequence Dictionary in the format expected to the BinaryCodec
     *
     * @param dictionary The list of SAMSequenceRecords to encode
     * @param codec The BinaryCodec to which to write
     */
    public void encode(List<SAMSequenceRecord> dictionary, BinaryCodec codec)
    {
        codec.writeInt(dictionary.size());
        for (SAMSequenceRecord sequence : dictionary)
        {
            codec.writeString(sequence.getSequenceName(), true, true);
            codec.writeInt(sequence.getSequenceLength());
        }
    }

    /**
     * Reads data about the Magic Number from the BinaryCodec and returns a string with its value
     *
     * @param codec The BinaryCodec from which to read
     * @return a Magic Number
     */
    public String decodeMagicNumber(BinaryCodec codec)
    {
        return codec.readString(4);
    }

    /**
     * Writes a Magic Number in the format expected to the BinaryCodec
     *
     * @param magicNumber The magic number to encode
     * @param codec The BinaryCodec to which to write
     */
    public void encode(String magicNumber, BinaryCodec codec)
    {
        codec.writeString(magicNumber, false, false);
    }
}
diff --git a/lib/edu/mit/broad/picard/variation/KnownVariantIterator.java b/lib/edu/mit/broad/picard/variation/KnownVariantIterator.java new file mode 100644 index 0000000000..6cb0712e14 --- /dev/null +++ b/lib/edu/mit/broad/picard/variation/KnownVariantIterator.java @@ -0,0 +1,31 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.picard.variation; + +import edu.mit.broad.sam.SAMSequenceRecord; + +import java.util.Iterator; +import java.util.List; + +/** + * API for iterating over records representing known variations + * + * @author Kathleen Tibbetts + */ +public interface KnownVariantIterator extends Iterable, Iterator +{ + /** + * Return the list of sequence dictionary (list of SAMSequenceRecords in order) + * for this KnownVariantIterator + * + * @return The SAMSequenceRecords that comprise the sequence dictionary for this iterator, in order + */ + public List getSequenceDictionary(); +} diff --git a/lib/edu/mit/broad/picard/variation/VariantType.java b/lib/edu/mit/broad/picard/variation/VariantType.java new file mode 100644 index 0000000000..354e047230 --- /dev/null +++ b/lib/edu/mit/broad/picard/variation/VariantType.java @@ -0,0 +1,30 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ +package edu.mit.broad.picard.variation; + +/** + * Enum to hold the possible types of dbSnps. Note that these correspsond to the names used + * in the dbSnp database with the exception of indel (which is in-del in dbSnp). + */ +public enum VariantType +{ + SNP, insertion, deletion; + + /** + * Gets the enum for a given ordinal + * + * @param ordinal + * @return VariantType + */ + public static VariantType getVariantTypeFromOrdinal(int ordinal) + { + return VariantType.class.getEnumConstants()[ordinal]; + } +} diff --git a/lib/edu/mit/broad/sam/AlignmentBlock.java b/lib/edu/mit/broad/sam/AlignmentBlock.java new file mode 100644 index 0000000000..ef1ec841c2 --- /dev/null +++ b/lib/edu/mit/broad/sam/AlignmentBlock.java @@ -0,0 +1,31 @@ +package edu.mit.broad.sam; + +/** + * Represents the contiguous alignment of a subset of read bases to a reference + * sequence. Simply put an alignment block tells you that read bases from + * readStart are aligned to the reference (matching or mismatching) from + * referenceStart for length bases. + * + * @author Tim Fennell + */ +public class AlignmentBlock { + private int readStart; + private int referenceStart; + private int length; + + /** Constructs a new alignment block with the supplie read and ref starts and length. */ + AlignmentBlock(int readStart, int referenceStart, int length) { + this.readStart = readStart; + this.referenceStart = referenceStart; + this.length = length; + } + + /** The first, 1-based, base in the read that is aligned to the reference reference. */ + public int getReadStart() { return readStart; } + + /** The first, 1-based, position in the reference to which the read is aligned. */ + public int getReferenceStart() { return referenceStart; } + + /** The number of contiguous bases aligned to the reference. 
*/ + public int getLength() { return length; } +} diff --git a/lib/edu/mit/broad/sam/BAMFileConstants.java b/lib/edu/mit/broad/sam/BAMFileConstants.java new file mode 100644 index 0000000000..7b5cf6c70c --- /dev/null +++ b/lib/edu/mit/broad/sam/BAMFileConstants.java @@ -0,0 +1,33 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.sam; + +class BAMFileConstants { + /** + * The beginning of a BAMRecord is a fixed-size block of 8 int32s + */ + static final int FIXED_BLOCK_SIZE = 8 * 4; + + /** + * Sanity check -- we never expect BAMRecords to be as big as this. + */ + static final int MAXIMUM_RECORD_LENGTH = 1024 * 1024; + + /** + * BAM file magic number. This is what is present in the gunzipped version of the file, + * which never exists on disk. + */ + + static final byte[] BAM_MAGIC = "BAM\1".getBytes(); + /** + * BAM index file magic number. + */ + static final byte[] BAM_INDEX_MAGIC = "BAI\1".getBytes(); +} diff --git a/lib/edu/mit/broad/sam/BAMFileIndex.java b/lib/edu/mit/broad/sam/BAMFileIndex.java new file mode 100644 index 0000000000..d6624b76d1 --- /dev/null +++ b/lib/edu/mit/broad/sam/BAMFileIndex.java @@ -0,0 +1,277 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. 
+ */ +package edu.mit.broad.sam; + + +import edu.mit.broad.sam.util.RuntimeEOFException; +import edu.mit.broad.sam.util.RuntimeIOException; + +import java.io.*; +import java.nio.*; +import java.nio.channels.*; +import java.util.*; + +/** + * Internal class for reading BAM file indexes. + */ +class BAMFileIndex +{ + private static final int MAX_BINS = 37450; // =(8^6-1)/7+1 + private static final int BAM_LIDX_SHIFT = 16; + + private File mFile = null; + private FileInputStream mFileStream = null; + private MappedByteBuffer mFileBuffer = null; + + + BAMFileIndex(final File file) { + mFile = file; + } + + void close() { + closeFileStream(); + } + + long[] getSearchBins(int referenceIndex, int startPos, int endPos) { + + openIndex(); + seek(4); + + int sequenceCount = readInteger(); + // System.out.println("# Sequence count: " + sequenceCount); + if (referenceIndex >= sequenceCount) { + return null; + } + + BitSet regionBins = regionToBins(startPos, endPos); + if (regionBins == null) { + return null; + } + + for (int i = 0; i < referenceIndex; i++) { + // System.out.println("# Sequence TID: " + i); + int nBins = readInteger(); + // System.out.println("# nBins: " + nBins); + for (int j = 0; j < nBins; j++) { + int bin = readInteger(); + int nChunks = readInteger(); + // System.out.println("# bin[" + j + "] = " + bin + ", nChunks = " + nChunks); + skipBytes(16 * nChunks); + } + int nLinearBins = readInteger(); + // System.out.println("# nLinearBins: " + nLinearBins); + skipBytes(8 * nLinearBins); + } + + // System.out.println("# Sequence target TID: " + referenceIndex); + int nIndexBins = readInteger(); + // System.out.println("# nBins: " + nIndexBins); + if (nIndexBins == 0) { + return null; + } + + List chunkList = new ArrayList(); + for (int i = 0; i < nIndexBins; i++) { + int indexBin = readInteger(); + int nChunks = readInteger(); + // System.out.println("# bin[" + i + "] = " + indexBin + ", nChunks = " + nChunks); + if (regionBins.get(indexBin)) { + for (int ci = 
0; ci < nChunks; ci++) { + long chunkBegin = readLong(); + long chunkEnd = readLong(); + chunkList.add(new Chunk(chunkBegin, chunkEnd)); + } + } else { + skipBytes(16 * nChunks); + } + } + + if (chunkList.isEmpty()) { + return null; + } + + int start = (startPos <= 0) ? 0 : startPos-1; + int regionLinearBin = start >> BAM_LIDX_SHIFT; + int nLinearBins = readInteger(); + // System.out.println("# nLinearBins: " + nLinearBins); + // System.out.println("# regionLinearBin: " + regionLinearBin); + long minimumOffset = 0; + if (regionLinearBin < nLinearBins) { + skipBytes(8 * regionLinearBin); + minimumOffset = readLong(); + } + chunkList = optimizeChunkList(chunkList, minimumOffset); + return convertToArray(chunkList); + } + + private List optimizeChunkList(List chunkList, long minimumOffset) { + Chunk lastChunk = null; + Collections.sort(chunkList); + List result = new ArrayList(); + for (Chunk chunk : chunkList) { + if (chunk.getChunkEnd() <= minimumOffset) { + continue; + } + if (result.isEmpty()) { + result.add(chunk); + lastChunk = chunk; + continue; + } + // Coalesce chunks that are in adjacent file blocks. + // This is a performance optimization. + long lastFileBlock = getFileBlock(lastChunk.getChunkEnd()); + long chunkFileBlock = getFileBlock(chunk.getChunkStart()); + if (chunkFileBlock - lastFileBlock > 1) { + result.add(chunk); + lastChunk = chunk; + } else { + if (chunk.getChunkEnd() > lastChunk.getChunkEnd()) { + lastChunk.setChunkEnd(chunk.getChunkEnd()); + } + } + } + return result; + } + + private long[] convertToArray(List chunkList) { + int count = chunkList.size() * 2; + if (count == 0) { + return null; + } + int index = 0; + long[] result = new long[count]; + for (Chunk chunk : chunkList) { + result[index++] = chunk.getChunkStart(); + result[index++] = chunk.getChunkEnd(); + } + return result; + } + + private BitSet regionToBins(int startPos, int endPos) { + int maxPos = 0x1FFFFFFF; + int start = (startPos <= 0) ? 
0 : (startPos-1) & maxPos; + int end = (endPos <= 0) ? maxPos : (endPos-1) & maxPos; + if (start > end) { + return null; + } + int k; + BitSet bitSet = new BitSet(MAX_BINS); + bitSet.set(0); + for (k = 1 + (start>>26); k <= 1 + (end>>26); ++k) bitSet.set(k); + for (k = 9 + (start>>23); k <= 9 + (end>>23); ++k) bitSet.set(k); + for (k = 73 + (start>>20); k <= 73 + (end>>20); ++k) bitSet.set(k); + for (k = 585 + (start>>17); k <= 585 + (end>>17); ++k) bitSet.set(k); + for (k = 4681 + (start>>14); k <= 4681 + (end>>14); ++k) bitSet.set(k); + return bitSet; + } + + private long getFileBlock(long bgzfOffset) { + return ((bgzfOffset >> 16L) & 0xFFFFFFFFFFFFL); + } + + private void openIndex() { + if (mFileBuffer != null) { + return; + } + openFileStream(); + seek(0); + byte[] buffer = new byte[4]; + readBytes(buffer); + if (!Arrays.equals(buffer, BAMFileConstants.BAM_INDEX_MAGIC)) { + closeFileStream(); + throw new RuntimeException("Invalid file header in BAM index " + mFile + + ": " + new String(buffer)); + } + } + + private void readBytes(byte[] buffer) { + mFileBuffer.get(buffer); + } + + private int readInteger() { + return mFileBuffer.getInt(); + } + + private long readLong() { + return mFileBuffer.getLong(); + } + + private void skipBytes(int count) { + mFileBuffer.position(mFileBuffer.position() + count); + } + + private void seek(int position) { + mFileBuffer.position(position); + } + + private void openFileStream() { + if (mFileStream != null) { + return; + } + try { + mFileStream = new FileInputStream(mFile); + FileChannel channel = mFileStream.getChannel(); + mFileBuffer = channel.map(FileChannel.MapMode.READ_ONLY, 0L, channel.size()); + mFileBuffer.order(ByteOrder.LITTLE_ENDIAN); + } catch (IOException exc) { + throw new RuntimeIOException(exc.getMessage(), exc); + } + } + + private void closeFileStream() { + if (mFileStream == null) { + return; + } + try { + mFileStream.close(); + } catch (IOException exc) { + throw new RuntimeIOException(exc.getMessage(), 
exc); + } + mFileStream = null; + mFileBuffer = null; + } + + private static class Chunk + implements Comparable { + + private long mChunkStart; + private long mChunkEnd; + + Chunk(long start, long end) { + mChunkStart = start; + mChunkEnd = end; + } + + long getChunkStart() { + return mChunkStart; + } + + void setChunkStart(long value) { + mChunkStart = value; + } + + long getChunkEnd() { + return mChunkEnd; + } + + void setChunkEnd(long value) { + mChunkEnd = value; + } + + public int compareTo(Chunk chunk) { + int result = Long.signum(mChunkStart - chunk.mChunkStart); + if (result == 0) { + result = Long.signum(mChunkEnd - chunk.mChunkEnd); + } + return result; + } + } +} diff --git a/lib/edu/mit/broad/sam/BAMFileReader.java b/lib/edu/mit/broad/sam/BAMFileReader.java new file mode 100644 index 0000000000..4e81fc0170 --- /dev/null +++ b/lib/edu/mit/broad/sam/BAMFileReader.java @@ -0,0 +1,317 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ +package edu.mit.broad.sam; + + +import edu.mit.broad.sam.util.BinaryCodec; +import edu.mit.broad.sam.util.BlockCompressedInputStream; +import edu.mit.broad.sam.util.CloseableIterator; +import edu.mit.broad.sam.util.StringLineReader; + +import java.io.DataInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Internal class for reading and querying BAM files. 
+ */ +class BAMFileReader + extends SAMFileReader.ReaderImplementation { + private boolean mIsSeekable = false; + private BinaryCodec mStream = null; + private final BlockCompressedInputStream mCompressedInputStream; + private SAMFileHeader mFileHeader = null; + private BAMFileIndex mFileIndex = null; + private long mFirstRecordPointer = 0; + private CloseableIterator mCurrentIterator = null; + private final boolean eagerDecode; + + + BAMFileReader(final InputStream stream, final boolean eagerDecode) + throws IOException { + mIsSeekable = false; + mCompressedInputStream = new BlockCompressedInputStream(stream); + mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream)); + this.eagerDecode = eagerDecode; + readHeader(null); + } + + BAMFileReader(final File file, final boolean eagerDecode) + throws IOException { + mIsSeekable = true; + mCompressedInputStream = new BlockCompressedInputStream(file); + mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream)); + this.eagerDecode = eagerDecode; + readHeader(file); + mFirstRecordPointer = mCompressedInputStream.getFilePointer(); + } + + void close() { + if (mStream != null) { + mStream.close(); + } + mStream = null; + mFileHeader = null; + mFileIndex = null; + } + + BAMFileIndex getFileIndex() { + return mFileIndex; + } + + void setFileIndex(final BAMFileIndex fileIndex) { + mFileIndex = fileIndex; + } + + SAMFileHeader getFileHeader() { + return mFileHeader; + } + + /** + * Currently this is ignored for BAM reading. Always do strict validation. 
+ */ + void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) { + } + + CloseableIterator getIterator() { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (mIsSeekable) { + try { + mCompressedInputStream.seek(mFirstRecordPointer); + } catch (IOException exc) { + throw new RuntimeException(exc.getMessage(), exc); + } + } + mCurrentIterator = new BAMFileIterator(); + return mCurrentIterator; + } + + CloseableIterator query(final String sequence, final int start, final int end, final boolean contained) { + if (mStream == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mCurrentIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + if (!mIsSeekable) { + throw new UnsupportedOperationException("Cannot query stream-based BAM file"); + } + if (mFileIndex == null) { + throw new IllegalStateException("No BAM file index is available"); + } + mCurrentIterator = new BAMFileIndexIterator(sequence, start, end, contained); + return mCurrentIterator; + } + + private void readHeader(final File file) + throws IOException { + + final byte[] buffer = new byte[4]; + mStream.readBytes(buffer); + if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) { + throw new IOException("Invalid BAM file header"); + } + + final int headerTextLength = mStream.readInt(); + final String textHeader = mStream.readString(headerTextLength); + mFileHeader = new SAMTextHeaderCodec().decode(new StringLineReader(textHeader), + file); + + final int sequenceCount = mStream.readInt(); + if (mFileHeader.getSequences().size() > 0) { + // It is allowed to have binary sequences but no text sequences, so only validate if both are present + if (sequenceCount != mFileHeader.getSequences().size()) { + throw new SAMFormatException("Number of sequences in text header (" + 
mFileHeader.getSequences().size() + + ") != number of sequences in binary header (" + sequenceCount + ") for file " + file); + } + for (int i = 0; i < sequenceCount; i++) { + final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(file); + final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i); + if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) { + throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " + + file); + } + if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) { + throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " + + file); + } + } + } else { + // If only binary sequences are present, copy them into mFileHeader + final List sequences = new ArrayList(sequenceCount); + for (int i = 0; i < sequenceCount; i++) { + sequences.add(readSequenceRecord(file)); + } + mFileHeader.setSequences(sequences); + } + } + + private SAMSequenceRecord readSequenceRecord(final File file) { + final int nameLength = mStream.readInt(); + if (nameLength <= 1) { + throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + file); + } + final String sequenceName = mStream.readString(nameLength - 1); + // Skip the null terminator + mStream.readByte(); + final int sequenceLength = mStream.readInt(); + final SAMSequenceRecord record = new SAMSequenceRecord(sequenceName); + record.setSequenceLength(sequenceLength); + return record; + } + + private class BAMFileIterator + implements CloseableIterator { + + private SAMRecord mNextRecord = null; + private final BAMRecordCodec bamRecordCodec = new BAMRecordCodec(getFileHeader()); + + + BAMFileIterator() { + this(true); + } + + BAMFileIterator(final boolean advance) { + this.bamRecordCodec.setInputStream(BAMFileReader.this.mStream.getInputStream()); + + if (advance) { + advance(); + } + } + + public void close() { + if (this 
!= mCurrentIterator) { + throw new IllegalStateException("Attempt to close non-current iterator"); + } + mCurrentIterator = null; + } + + public boolean hasNext() { + return (mNextRecord != null); + } + + public SAMRecord next() { + final SAMRecord result = mNextRecord; + advance(); + return result; + } + + public void remove() { + throw new UnsupportedOperationException("Not supported: remove"); + } + + void advance() { + try { + mNextRecord = getNextRecord(); + if (eagerDecode && mNextRecord != null) { + mNextRecord.eagerDecode(); + } + } catch (IOException exc) { + throw new RuntimeException(exc.getMessage(), exc); + } + } + + SAMRecord getNextRecord() + throws IOException { + return bamRecordCodec.decode(); + } + } + + private class BAMFileIndexIterator + extends BAMFileIterator { + + private long[] mFilePointers = null; + private int mFilePointerIndex = 0; + private long mFilePointerLimit = -1; + private int mReferenceIndex = -1; + private int mRegionStart = 0; + private int mRegionEnd = 0; + private boolean mReturnContained = false; + + + BAMFileIndexIterator(final String sequence, final int start, final int end, final boolean contained) { + super(false); // delay advance() until after construction + final SAMFileHeader fileHeader = getFileHeader(); + mReferenceIndex = fileHeader.getSequenceIndex(sequence); + if (mReferenceIndex != -1) { + final BAMFileIndex fileIndex = getFileIndex(); + mFilePointers = fileIndex.getSearchBins(mReferenceIndex, start, end); + } + mRegionStart = start; + mRegionEnd = (end <= 0) ? 
Integer.MAX_VALUE : end; + mReturnContained = contained; + advance(); + } + + SAMRecord getNextRecord() + throws IOException { + while (true) { + // Advance to next file block if necessary + while (mCompressedInputStream.getFilePointer() >= mFilePointerLimit) { + if (mFilePointers == null || + mFilePointerIndex >= mFilePointers.length) { + return null; + } + final long startOffset = mFilePointers[mFilePointerIndex++]; + final long endOffset = mFilePointers[mFilePointerIndex++]; + mCompressedInputStream.seek(startOffset); + mFilePointerLimit = endOffset; + } + // Pull next record from stream + final SAMRecord record = super.getNextRecord(); + if (record == null) { + return null; + } + // If beyond the end of this reference sequence, end iteration + final int referenceIndex = record.getReferenceIndex(); + if (referenceIndex != mReferenceIndex) { + if (referenceIndex < 0 || + referenceIndex > mReferenceIndex) { + mFilePointers = null; + return null; + } + // If before this reference sequence, continue + continue; + } + if (mRegionStart == 0 && mRegionEnd == Integer.MAX_VALUE) { + // Quick exit to avoid expensive alignment end calculation + return record; + } + final int alignmentStart = record.getAlignmentStart(); + final int alignmentEnd = record.getAlignmentEnd(); + if (alignmentStart > mRegionEnd) { + // If scanned beyond target region, end iteration + mFilePointers = null; + return null; + } + // Filter for overlap with region + if (mReturnContained) { + if (alignmentStart >= mRegionStart && alignmentEnd <= mRegionEnd) { + return record; + } + } else { + if (alignmentEnd >= mRegionStart && alignmentStart <= mRegionEnd) { + return record; + } + } + } + } + } +} diff --git a/lib/edu/mit/broad/sam/BAMFileWriter.java b/lib/edu/mit/broad/sam/BAMFileWriter.java new file mode 100644 index 0000000000..6a7bf7d9ba --- /dev/null +++ b/lib/edu/mit/broad/sam/BAMFileWriter.java @@ -0,0 +1,64 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software 
and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.sam; + +import edu.mit.broad.sam.util.BinaryCodec; +import edu.mit.broad.sam.util.BlockCompressedOutputStream; + +import java.io.DataOutputStream; +import java.io.File; + +/** + * Concrete implementation of SAMFileWriter for writing gzipped BAM files. + */ +class BAMFileWriter extends SAMFileWriterImpl { + + private final BinaryCodec outputBinaryCodec; + private BAMRecordCodec bamRecordCodec = null; + + public BAMFileWriter(final File path) { + outputBinaryCodec = new BinaryCodec(new DataOutputStream(new BlockCompressedOutputStream(path))); + outputBinaryCodec.setOutputFileName(path.toString()); + } + + private void prepareToWriteAlignments() { + if (bamRecordCodec == null) { + bamRecordCodec = new BAMRecordCodec(getHeader()); + bamRecordCodec.setOutputStream(outputBinaryCodec.getOutputStream()); + } + } + + protected void writeAlignment(final SAMRecord alignment) { + prepareToWriteAlignments(); + bamRecordCodec.encode(alignment); + } + + protected void writeHeader(final String textHeader) { + outputBinaryCodec.writeBytes(BAMFileConstants.BAM_MAGIC); + + // calculate and write the length of the SAM file header text and the header text + outputBinaryCodec.writeString(textHeader, true, false); + + // write the sequences binarily. 
This is redundant with the text header + outputBinaryCodec.writeInt(getHeader().getSequences().size()); + for (final SAMSequenceRecord sequenceRecord: getHeader().getSequences()) { + outputBinaryCodec.writeString(sequenceRecord.getSequenceName(), true, true); + outputBinaryCodec.writeInt(sequenceRecord.getSequenceLength()); + } + } + + protected void finish() { + outputBinaryCodec.close(); + } + + protected String getFilename() { + return outputBinaryCodec.getOutputFileName(); + } +} diff --git a/lib/edu/mit/broad/sam/BAMRecord.java b/lib/edu/mit/broad/sam/BAMRecord.java new file mode 100644 index 0000000000..1ae5c0f3f8 --- /dev/null +++ b/lib/edu/mit/broad/sam/BAMRecord.java @@ -0,0 +1,280 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ +package edu.mit.broad.sam; + +import edu.mit.broad.sam.util.BinaryCodec; +import edu.mit.broad.sam.util.StringUtil; + +import java.io.ByteArrayInputStream; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Set; + + +/** + * Wrapper class for binary BAM records. + * Delays unpacking all data binary until requested. 
+ */ +class BAMRecord + extends SAMRecord +{ + private static final int READ_NAME_OFFSET = 0; + + private byte[] mRestOfBinaryData = null; + private int mReadLength = 0; + private final short mReadNameLength; + private final int mCigarLen; + private boolean mAttributesDecoded = false; + private boolean mCigarDecoded = false; + + /** + * If any of the properties set from mRestOfBinaryData have been overridden by calls to setters, + * this is set to true, indicating that mRestOfBinaryData cannot be used to write this record to disk. + */ + private boolean mBinaryDataStale; + + BAMRecord(final SAMFileHeader header, final int referenceID, final int coordinate, final short readNameLength, final short mappingQuality, + final int indexingBin, final int cigarLen, final int flags, final int readLen, final int mateReferenceID, final int mateCoordinate, final int insertSize, + final byte[] restOfData) { + setReferenceIndex(referenceID, header); + setAlignmentStart(coordinate); + mReadNameLength = readNameLength; + setMappingQuality(mappingQuality); + setIndexingBin(indexingBin); + mCigarLen = cigarLen; + setFlags(flags); + mReadLength = readLen; + setMateReferenceIndex(mateReferenceID, header); + setMateAlignmentStart(mateCoordinate); + setInferredInsertSize(insertSize); + mRestOfBinaryData = restOfData; + + // Set these to null in order to mark them as being candidates for lazy initialization. + // If this is not done, they will have non-null defaults. + super.setReadName(null); + super.setCigarString(null); + super.setReadBases(null); + super.setBaseQualities(null); + + // Mark the binary block as being valid for writing back out to disk + mBinaryDataStale = false; + } + + protected void eagerDecode() { + // Force all the lazily-initialized attributes to be decoded. 
+ getReadName(); + getCigar(); + getReadBases(); + getBaseQualities(); + getAttributes(); + super.eagerDecode(); + mRestOfBinaryData = null; + } + + /** + * If this record has a valid binary representation of the variable-length portion of a binary record stored, + * return that byte array, otherwise return null. This will never be true for SAMRecords. It will be true + * for BAMRecords that have not been eagerDecoded(), and for which none of the data in the variable-length + * portion has been changed. + */ + @Override + public byte[] getVariableBinaryRepresentation() { + if (mBinaryDataStale) { + return null; + } + // This may have been set to null by eagerDecode() + return mRestOfBinaryData; + } + + /** + * Depending on the concrete implementation, the binary file size of attributes may be known without + * computing them all. + * + * @return binary file size of attribute, if known, else -1 + */ + @Override + public int getAttributesBinarySize() { + if (mBinaryDataStale || mRestOfBinaryData == null) { + return -1; + } + final int tagsOffset = readNameSize() + cigarSize() + basesSize() + qualsSize(); + return mRestOfBinaryData.length - tagsOffset; + } + + @Override + public void setReadName(final String value) { + super.setReadName(value); + mBinaryDataStale = true; + } + + @Override + public void setCigar(final Cigar cigar) { + super.setCigar(cigar); + mBinaryDataStale = true; + } + + @Override + public void setReadBases(final byte[] value) { + super.setReadBases(value); + mBinaryDataStale = true; + } + + @Override + public void setBaseQualities(final byte[] value) { + super.setBaseQualities(value); + mBinaryDataStale = true; + } + + @Override + public void setAttribute(final String key, final Object value) { + // populate all the attributes from the binary block before overwriting one + getAttributes(); + super.setAttribute(key, value); + mBinaryDataStale = true; + } + + /** + * Avoids decoding binary block to get read length + */ + @Override + public int 
getReadLength() { + return mReadLength; + } + + @Override + public String getReadName() { + String result = super.getReadName(); + if (mRestOfBinaryData != null && result == null) { + result = decodeReadName(); + super.setReadName(result); + } + return result; + } + + /** + * Do not include null terminator + */ + @Override + public int getReadNameLength() { + return mReadNameLength - 1; + } + + @Override + public Cigar getCigar() { + if (mRestOfBinaryData != null && !mCigarDecoded) { + final int cigarOffset = readNameSize(); + final ByteBuffer byteBuffer = ByteBuffer.wrap(mRestOfBinaryData, cigarOffset, cigarSize()); + byteBuffer.order(ByteOrder.LITTLE_ENDIAN); + super.setCigar(BinaryCigarCodec.getSingleton().decode(byteBuffer)); + mCigarDecoded = true; + } + return super.getCigar(); + } + + @Override + public int getCigarLength() { + return mCigarLen; + } + + @Override + public byte[] getReadBases() { + byte[] result = super.getReadBases(); + if (mRestOfBinaryData != null && result == null && mReadLength > 0) { + result = decodeReadBases(); + super.setReadBases(result); + } + return result; + } + + @Override + public byte[] getBaseQualities() { + byte[] ret = super.getBaseQualities(); + if (mRestOfBinaryData != null && ret == null && mReadLength > 0) { + ret = decodeBaseQualities(); + super.setBaseQualities(ret); + } + return ret; + } + + @Override + public Object getAttribute(final String key) { + if (!mAttributesDecoded) { + decodeAttributes(); + } + return super.getAttribute(key); + } + + @Override + public Set> getAttributes() { + if (!mAttributesDecoded) { + decodeAttributes(); + } + return super.getAttributes(); + } + + private void decodeAttributes() { + if (mAttributesDecoded) { + return; + } + mAttributesDecoded = true; + final Map attributes = new LinkedHashMap(); + final int tagsOffset = readNameSize() + cigarSize() + basesSize() + qualsSize(); + final int tagsSize = mRestOfBinaryData.length - tagsOffset; + final BinaryCodec byteBufferCodec = new 
BinaryCodec(new ByteArrayInputStream(mRestOfBinaryData, tagsOffset, tagsSize)); + new BinaryTagCodec(byteBufferCodec).readTags(attributes); + for (final Map.Entry entry : attributes.entrySet()) { + super.setAttribute(entry.getKey(), entry.getValue()); + } + } + + private byte[] decodeBaseQualities() { + if (mReadLength == 0) { + return null; + } + final int qualsOffset = readNameSize() + cigarSize() + basesSize(); + final byte[] ret = new byte[qualsSize()]; + System.arraycopy(mRestOfBinaryData, qualsOffset, ret, 0, qualsSize()); + return ret; + } + + private String decodeReadName() { + // Don't include terminating null + return StringUtil.bytesToString(mRestOfBinaryData, READ_NAME_OFFSET, mReadNameLength-1); + } + + private byte[] decodeReadBases() { + if (mReadLength == 0) { + return null; + } + final int basesOffset = readNameSize() + cigarSize(); + return SAMUtils.compressedBasesToBytes(mReadLength, mRestOfBinaryData, basesOffset); + } + + /* methods for computing size of variably-sizes elements */ + + private int readNameSize() { + return mReadNameLength; + } + + private int cigarSize() { + return mCigarLen * 4; + } + + private int basesSize() { + return (mReadLength + 1)/2; + } + + private int qualsSize() { + return mReadLength; + } +} diff --git a/lib/edu/mit/broad/sam/BAMRecordCodec.java b/lib/edu/mit/broad/sam/BAMRecordCodec.java new file mode 100644 index 0000000000..b73254b522 --- /dev/null +++ b/lib/edu/mit/broad/sam/BAMRecordCodec.java @@ -0,0 +1,163 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
/**
 * Codec that converts SAMRecord objects to and from the BAM binary on-disk format.
 * The field order in encode()/decode() mirrors the fixed BAM record layout exactly;
 * do not reorder the read/write calls.
 *
 * NOTE(review): the implements clause appears as a raw type here; the generic
 * parameter (likely SortingCollection.Codec&lt;SAMRecord&gt;) seems to have been lost
 * in this paste — confirm against SortingCollection.
 */
public class BAMRecordCodec implements SortingCollection.Codec {
    // Stateless helper for packing/unpacking cigar elements.
    private final BinaryCigarCodec cigarCodec = new BinaryCigarCodec();
    // Needed to resolve reference names to/from reference indices.
    private final SAMFileHeader header;
    private OutputStream os;
    private InputStream is;
    // Wraps whichever of os/is is in use; set by setOutputStream()/setInputStream().
    private BinaryCodec binaryCodec;
    private BinaryTagCodec binaryTagCodec;

    public BAMRecordCodec(final SAMFileHeader header) {
        this.header = header;
    }

    // Returns a fresh codec sharing the same header but with no stream attached,
    // as required by the SortingCollection.Codec contract.
    public BAMRecordCodec clone() {
        BAMRecordCodec other = new BAMRecordCodec(this.header);
        return other;
    }


    /** Sets the output stream that records will be written to. */
    public void setOutputStream(final OutputStream os) {
        this.os = os;
        this.binaryCodec = new BinaryCodec(this.os);
        this.binaryTagCodec = new BinaryTagCodec(this.binaryCodec);
    }

    /** Sets the input stream that records will be read from. */
    public void setInputStream(final InputStream is) {
        this.is = is;
        this.binaryCodec = new BinaryCodec(this.is);
        this.binaryTagCodec = new BinaryTagCodec(this.binaryCodec);
    }

    /**
     * Write object to OutputStream.
     * The SAMRecord must have a header set into it so reference indices can be resolved.
     *
     * @param alignment what to write
     */
    public void encode(final SAMRecord alignment) {
        // Compute block size, as it is the first element of the file representation of SAMRecord
        final int readLength = alignment.getReadLength();

        final int cigarLength = alignment.getCigarLength();

        int blockSize = BAMFileConstants.FIXED_BLOCK_SIZE + alignment.getReadNameLength() + 1 + // null terminated
                cigarLength * 4 +
                (readLength + 1) / 2 + // 2 bases per byte
                readLength;

        // If the record still carries its original binary tag block, its size is known
        // without decoding the tags; otherwise sum the encoded size of each attribute.
        final int attributesSize = alignment.getAttributesBinarySize();
        if (attributesSize != -1) {
            blockSize += attributesSize;
        } else {
            if (alignment.getAttributes() != null) {
                for (final Map.Entry attribute : alignment.getAttributes()) {
                    blockSize += (BinaryTagCodec.getTagSize(attribute.getValue()));
                }
            }
        }

        // Indexing bin: reuse a precomputed bin if present, otherwise derive it from the
        // 0-based alignment interval. Unmapped records (negative reference index) get bin 0.
        int indexBin = 0;
        if (alignment.getReferenceIndex(header) >= 0) {
            if (alignment.getIndexingBin() != null) {
                indexBin = alignment.getIndexingBin();
            } else {
                indexBin = SAMUtils.reg2bin(alignment.getAlignmentStart() - 1,
                        alignment.getAlignmentEnd() - 1);
            }
        }

        // Blurt out the elements
        this.binaryCodec.writeInt(blockSize);
        this.binaryCodec.writeInt(alignment.getReferenceIndex(header));
        // 0-based!!  (SAMRecord coordinates are 1-based; BAM stores 0-based.)
        this.binaryCodec.writeInt(alignment.getAlignmentStart() - 1);
        // +1 for the null terminator that getReadNameLength() excludes.
        this.binaryCodec.writeUByte((short) (alignment.getReadNameLength() + 1));
        this.binaryCodec.writeUByte((short) alignment.getMappingQuality());
        this.binaryCodec.writeUShort(indexBin);
        this.binaryCodec.writeUShort(cigarLength);
        this.binaryCodec.writeUShort(alignment.getFlags());
        this.binaryCodec.writeInt(alignment.getReadLength());
        this.binaryCodec.writeInt(alignment.getMateReferenceIndex(header));
        this.binaryCodec.writeInt(alignment.getMateAlignmentStart() - 1);
        this.binaryCodec.writeInt(alignment.getInferredInsertSize());
        // Fast path: if the variable-length portion is unchanged since it was read,
        // copy it verbatim instead of re-encoding each field.
        final byte[] variableLengthBinaryBlock = alignment.getVariableBinaryRepresentation();
        if (variableLengthBinaryBlock != null) {
            this.binaryCodec.writeBytes(variableLengthBinaryBlock);
        } else {
            this.binaryCodec.writeString(alignment.getReadName(), false, true);
            final int[] binaryCigar = cigarCodec.encode(alignment.getCigar());
            for (final int cigarElement : binaryCigar) {
                // Assumption that this will fit into an integer, despite the fact
                // that it is specced as a uint.
                this.binaryCodec.writeInt(cigarElement);
            }
            this.binaryCodec.writeBytes(SAMUtils.bytesToCompressedBases(alignment.getReadBases()));
            this.binaryCodec.writeBytes(alignment.getBaseQualities());
            if (alignment.getAttributes() != null) {
                for (final Map.Entry attribute : alignment.getAttributes()) {
                    this.binaryTagCodec.writeTag(attribute.getKey(), attribute.getValue());
                }
            }
        }
    }

    /**
     * Read the next record from the input stream and convert into a java object.
     *
     * @return null if no more records. Should throw exception if EOF is encountered in the middle of
     * a record.
     */
    public SAMRecord decode() {
        int recordLength = 0;
        try {
            recordLength = this.binaryCodec.readInt();
        }
        catch (RuntimeEOFException e) {
            // Clean EOF at a record boundary means no more records.
            return null;
        }

        if (recordLength < BAMFileConstants.FIXED_BLOCK_SIZE ||
                recordLength > BAMFileConstants.MAXIMUM_RECORD_LENGTH) {
            throw new SAMFormatException("Invalid record length: " + recordLength);
        }

        // Fixed-size fields, in exact on-disk order. BAM stores 0-based coordinates;
        // +1 converts to the 1-based convention used by SAMRecord.
        final int referenceID = this.binaryCodec.readInt();
        final int coordinate = this.binaryCodec.readInt() + 1;
        final short readNameLength = this.binaryCodec.readUByte();
        final short mappingQuality = this.binaryCodec.readUByte();
        final int bin = this.binaryCodec.readUShort();
        final int cigarLen = this.binaryCodec.readUShort();
        final int flags = this.binaryCodec.readUShort();
        final int readLen = this.binaryCodec.readInt();
        final int mateReferenceID = this.binaryCodec.readInt();
        final int mateCoordinate = this.binaryCodec.readInt() + 1;
        final int insertSize = this.binaryCodec.readInt();
        // Remainder of the record (name/cigar/bases/quals/tags) is kept packed;
        // BAMRecord decodes it lazily.
        final byte[] restOfRecord = new byte[recordLength - BAMFileConstants.FIXED_BLOCK_SIZE];
        this.binaryCodec.readBytes(restOfRecord);
        final BAMRecord ret = new BAMRecord(header, referenceID, coordinate, readNameLength, mappingQuality,
                bin, cigarLen, flags, readLen, mateReferenceID, mateCoordinate, insertSize, restOfRecord);
        ret.setHeader(header);
        return ret;
    }
}
+*/ +package edu.mit.broad.sam; + +import java.nio.ByteBuffer; + +/** + * Converter between binary and text CIGAR representation. + */ +class BinaryCigarCodec { + private static final BinaryCigarCodec singleton = new BinaryCigarCodec(); + + /** + * It is not necssary to get the singleton but it is preferrable to use the same one + * over and over vs. creating a new object for each BAMRecord. + */ + static BinaryCigarCodec getSingleton() { + return singleton; + } + + int[] encode(final Cigar cigar) { + if (cigar.numCigarElements() == 0) { + return new int[0]; + } + + // Binary rep can be no longer than 1/2 of text rep + // Although this is documented as uint, I think lengths will never get that long, + // and it's a pain in Java. + final int[] binaryCigar = new int[cigar.numCigarElements()]; + int binaryCigarLength = 0; + for (int i = 0; i < cigar.numCigarElements(); ++i) { + final CigarElement cigarElement = cigar.getCigarElement(i); + final int op = CigarOperator.enumToBinary(cigarElement.getOperator()); + binaryCigar[binaryCigarLength++] = cigarElement.getLength() << 4 | op; + } + return binaryCigar; + } + + Cigar decode(final ByteBuffer binaryCigar) { + final Cigar ret = new Cigar(); + while (binaryCigar.hasRemaining()) { + final int cigarette = binaryCigar.getInt(); + ret.add(binaryCigarToCigarElement(cigarette)); + } + return ret; + } + + Cigar decode(final int[] binaryCigar) { + final Cigar ret = new Cigar(); + for (final int cigarette : binaryCigar) { + ret.add(binaryCigarToCigarElement(cigarette)); + } + return ret; + } + + private static CigarElement binaryCigarToCigarElement(final int cigarette) { + final int binaryOp = cigarette & 0xf; + final int length = cigarette >> 4; + return new CigarElement(length, CigarOperator.binaryToEnum(binaryOp)); + } +} diff --git a/lib/edu/mit/broad/sam/BinaryTagCodec.java b/lib/edu/mit/broad/sam/BinaryTagCodec.java new file mode 100644 index 0000000000..fbb8711c50 --- /dev/null +++ 
b/lib/edu/mit/broad/sam/BinaryTagCodec.java @@ -0,0 +1,211 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.sam; + +import edu.mit.broad.sam.util.BinaryCodec; +import edu.mit.broad.sam.util.RuntimeEOFException; + +import java.util.Map; +import java.util.Collection; +import java.util.ArrayList; + +/** + * Parse & produce tag section of alignment record in BAM file. + */ +class BinaryTagCodec { + // Size of the fixed part of the binary representation of a tag, + // i.e. the number of bytes occupied by the tag name and tag type fields. + private static final int FIXED_TAG_SIZE = 3; + + private static final long MAX_INT = Integer.MAX_VALUE; + private static final long MAX_UINT = (MAX_INT + 1) * 2; + private static final long MAX_SHORT = Short.MAX_VALUE; + private static final long MAX_USHORT = (MAX_SHORT + 1) * 2; + private static final long MAX_BYTE = Byte.MAX_VALUE; + private static final long MAX_UBYTE = (MAX_BYTE + 1) * 2; + + final BinaryCodec binaryCodec; + + BinaryTagCodec(final BinaryCodec binaryCodec) { + this.binaryCodec = binaryCodec; + } + + private static int getBinaryValueSize(final Object attributeValue) { + switch (getTagValueType(attributeValue)) { + case 'Z': + return ((String)attributeValue).length() + 1; + case 'A': + return 1; + case 'I': + case 'i': + return 4; + case 's': + case 'S': + return 2; + case 'c': + case 'C': + return 1; + case 'f': + return 4; + case 'H': + final byte[] byteArray = (byte[])attributeValue; + return byteArray.length * 2 + 1; + default: + throw new IllegalArgumentException("When writing BAM, unrecognized tag type " + + 
attributeValue.getClass().getName()); + } + } + + static int getTagSize(final Object value) { + return FIXED_TAG_SIZE + getBinaryValueSize(value); + } + + static char getTagValueType(final Object value) { + if (value.getClass().equals(String.class)) { + return 'Z'; + } else if (value.getClass().equals(Character.class)) { + return 'A'; + } else if (value.getClass().equals(Integer.class)) { + return getIntegerType((Integer)value); + } else if (value.getClass().equals(Long.class)) { + return getIntegerType((Long)value); + } else if (value.getClass().equals(Float.class)) { + return 'f'; + } else if (value.getClass().isArray() && value.getClass().getComponentType().equals(Byte.class)) { + return 'H'; + } else { + throw new IllegalArgumentException("When writing BAM, unrecognized tag type " + + value.getClass().getName()); + } + } + + static private char getIntegerType(final long val) { + if (val > MAX_UINT) { + throw new IllegalArgumentException("Integer attribute value too large to be encoded in BAM"); + } + if (val > MAX_INT) { + return 'I'; + } + if (val > MAX_USHORT) { + return 'i'; + } + if (val > MAX_SHORT) { + return 'S'; + } + if (val > MAX_UBYTE) { + return 's'; + } + if (val > MAX_BYTE) { + return 'C'; + } + if (val >= Byte.MIN_VALUE) { + return 'c'; + } + if (val >= Short.MIN_VALUE) { + return 's'; + } + if (val >= Integer.MIN_VALUE) { + return 'i'; + } + throw new IllegalArgumentException("Integer attribute value too negative to be encoded in BAM"); + } + + void writeTag(final String key, final Object value) { + assert(key.length() == 2); + binaryCodec.writeString(key, false, false); + final char tagValueType = getTagValueType(value); + binaryCodec.writeByte(tagValueType); + + switch (tagValueType) { + case 'Z': + binaryCodec.writeString((String)value, false, true); + break; + case 'A': + binaryCodec.writeByte(((Character)value)); + break; + case 'I': + binaryCodec.writeUInt((Long)value); + break; + case 'i': + binaryCodec.writeInt((Integer)value); + break; 
+ case 's': + binaryCodec.writeShort(((Integer)value).shortValue()); + break; + case 'S': + binaryCodec.writeUShort((Integer)value); + break; + case 'c': + binaryCodec.writeByte((Integer)value); + break; + case 'C': + binaryCodec.writeUByte(((Integer)value).shortValue()); + break; + case 'f': + binaryCodec.writeFloat((Float)value); + break; + case 'H': + final byte[] byteArray = (byte[])value; + binaryCodec.writeString(SAMUtils.bytesToHexString(byteArray), false, true); + break; + default: + throw new IllegalArgumentException("When writing BAM, unrecognized tag type " + + value.getClass().getName()); + } + } + + /** + * Reads tags from the binaryCodec passed in the ctor + * @param tagCollection tags are stored in this Map + */ + void readTags(final Map tagCollection) { + while (true) { + final String key; + try { + // Only way to know at end is when out of input + key = binaryCodec.readString(2); + } catch (RuntimeEOFException e) { + break; + } + final byte tagType = binaryCodec.readByte(); + final Object value = readValue(tagType); + tagCollection.put(key, value); + } + } + + private Object readValue(final byte tagType) { + switch (tagType) { + case 'Z': + return binaryCodec.readNullTerminatedString(); + case 'A': + return (char)binaryCodec.readByte(); + case 'I': + return binaryCodec.readUInt(); + case 'i': + return binaryCodec.readInt(); + case 's': + return (int)binaryCodec.readShort(); + case 'S': + return binaryCodec.readUShort(); + case 'c': + return (int)binaryCodec.readByte(); + case 'C': + return (int)binaryCodec.readUByte(); + case 'f': + return binaryCodec.readFloat(); + case 'H': + final String hexRep = binaryCodec.readNullTerminatedString(); + return SAMUtils.hexStringToBytes(hexRep); + default: + throw new SAMFormatException("Unrecognized tag type: " + (char)tagType); + } + } + +} diff --git a/lib/edu/mit/broad/sam/Cigar.java b/lib/edu/mit/broad/sam/Cigar.java new file mode 100644 index 0000000000..fa98526573 --- /dev/null +++ 
b/lib/edu/mit/broad/sam/Cigar.java @@ -0,0 +1,93 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.sam; + +import java.util.List; +import java.util.ArrayList; +import java.util.Collections; + +/** + * A list of CigarElements, which describes how a read aligns with the reference. + * E.g. the Cigar string 10M1D25M means + * * match or mismatch for 10 bases + * * deletion of 1 base + * * match or mismatch for 25 bases + */ +public class Cigar { + private final List cigarElements = new ArrayList(); + + public Cigar() { + } + + public Cigar(final List cigarElements) { + this.cigarElements.addAll(cigarElements); + } + + public List getCigarElements() { + return Collections.unmodifiableList(cigarElements); + } + + public CigarElement getCigarElement(final int i) { + return cigarElements.get(i); + } + + public void add(final CigarElement cigarElement) { + cigarElements.add(cigarElement); + } + + public int numCigarElements() { + return cigarElements.size(); + } + + public int getReferenceLength() { + int length = 0; + for (CigarElement element : cigarElements) { + switch (element.getOperator()) { + case M: + case D: + case N: + length += element.getLength(); + } + } + return length; + } + + public int getPaddedReferenceLength() { + int length = 0; + for (CigarElement element : cigarElements) { + switch (element.getOperator()) { + case M: + case D: + case N: + case P: + length += element.getLength(); + } + } + return length; + } + + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (!(o instanceof Cigar)) return false; + + final Cigar cigar = (Cigar) o; + + if 
(cigarElements != null ? !cigarElements.equals(cigar.cigarElements) : cigar.cigarElements != null) + return false; + + return true; + } + + @Override + public int hashCode() { + return cigarElements != null ? cigarElements.hashCode() : 0; + } +} diff --git a/lib/edu/mit/broad/sam/CigarElement.java b/lib/edu/mit/broad/sam/CigarElement.java new file mode 100644 index 0000000000..eec99106b2 --- /dev/null +++ b/lib/edu/mit/broad/sam/CigarElement.java @@ -0,0 +1,52 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.sam; + +/** + * One component of a cigar string. The component comprises the operator, and the number of bases to which + * the operator applies. + */ +public class CigarElement { + private final int length; + private final CigarOperator operator; + + public CigarElement(final int length, final CigarOperator operator) { + this.length = length; + this.operator = operator; + } + + public int getLength() { + return length; + } + + public CigarOperator getOperator() { + return operator; + } + + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (!(o instanceof CigarElement)) return false; + + final CigarElement that = (CigarElement) o; + + if (length != that.length) return false; + if (operator != that.operator) return false; + + return true; + } + + @Override + public int hashCode() { + int result = length; + result = 31 * result + (operator != null ? 
operator.hashCode() : 0); + return result; + } +} diff --git a/lib/edu/mit/broad/sam/CigarOperator.java b/lib/edu/mit/broad/sam/CigarOperator.java new file mode 100644 index 0000000000..7445455e23 --- /dev/null +++ b/lib/edu/mit/broad/sam/CigarOperator.java @@ -0,0 +1,113 @@ +package edu.mit.broad.sam;/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ + +/** + * The operators that can appear in a cigar string. + */ +public enum CigarOperator { + M, + I, + D, + N, + S, + H, + P, + C; // I don't know what C means, but it is in the BAM spec + + // Readable synonyms of the above enums + public static final CigarOperator MATCH_OR_MISMATCH = M; + public static final CigarOperator INSERTION = I; + public static final CigarOperator DELETION = D; + public static final CigarOperator SKIPPED_REGION = N; + public static final CigarOperator SOFT_CLIP = S; + public static final CigarOperator HARD_CLIP = H; + public static final CigarOperator PADDING = P; + + // Representation of CigarOperator in BAM file + private static final byte OP_M = 0; + private static final byte OP_I = 1; + private static final byte OP_D = 2; + private static final byte OP_N = 3; + private static final byte OP_S = 4; + private static final byte OP_H = 5; + private static final byte OP_P = 6; + private static final byte OP_C = 7; + + + + public static CigarOperator characterToEnum(final int b) { + switch (b) { + case 'M': + return M; + case 'I': + return I; + case 'D': + return D; + case 'N': + return N; + case 'S': + return S; + case 'H': + return H; + case 'P': + return P; + case 'C': + return C; + default: + throw new IllegalArgumentException("Unrecognized 
CigarOperator: " + b); + } + } + + public static CigarOperator binaryToEnum(final int i) { + switch(i) { + case OP_M: + return M; + case OP_I: + return I; + case OP_D: + return D; + case OP_N: + return N; + case OP_S: + return S; + case OP_H: + return H; + case OP_P: + return P; + case OP_C: + return C; + default: + throw new IllegalArgumentException("Unrecognized CigarOperator: " + i); + } + } + + public static int enumToBinary(final CigarOperator e) { + switch(e) { + case M: + return OP_M; + case I: + return OP_I; + case D: + return OP_D; + case N: + return OP_N; + case S: + return OP_S; + case H: + return OP_H; + case P: + return OP_P; + case C: + return OP_C; + default: + throw new IllegalArgumentException("Unrecognized CigarOperator: " + e); + } + } +} diff --git a/lib/edu/mit/broad/sam/NotPrimarySkippingIterator.java b/lib/edu/mit/broad/sam/NotPrimarySkippingIterator.java new file mode 100644 index 0000000000..7191cc14dc --- /dev/null +++ b/lib/edu/mit/broad/sam/NotPrimarySkippingIterator.java @@ -0,0 +1,37 @@ +package edu.mit.broad.sam; + +import edu.mit.broad.sam.util.CloseableIterator; +import edu.mit.broad.sam.util.NonDestructiveIterator; + +/** + * Wrapper around SAMRecord iterator that skips over non-primary elements. 
+ */ +public class NotPrimarySkippingIterator { + private final NonDestructiveIterator> it; + + public NotPrimarySkippingIterator(final CloseableIterator underlyingIt) { + it = new NonDestructiveIterator>(underlyingIt); + skipAnyNotprimary(); + } + + public boolean hasCurrent() { + return it.hasCurrent(); + } + + public SAMRecord getCurrent() { + assert(hasCurrent()); + return it.getCurrent(); + } + + public boolean advance() { + it.advance(); + skipAnyNotprimary(); + return hasCurrent(); + } + + private void skipAnyNotprimary() { + while (it.hasCurrent() && it.getCurrent().getNotPrimaryAlignmentFlag()) { + it.advance(); + } + } +} diff --git a/lib/edu/mit/broad/sam/SAMFileHeader.java b/lib/edu/mit/broad/sam/SAMFileHeader.java new file mode 100644 index 0000000000..95d39f1202 --- /dev/null +++ b/lib/edu/mit/broad/sam/SAMFileHeader.java @@ -0,0 +1,191 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ +package edu.mit.broad.sam; + + +import java.util.*; + +/** + * Header information from a SAM file. 
+ */ +public class SAMFileHeader +{ + public static final String VERSION_TAG = "VN"; + public static final String CURRENT_VERSION = "1.0"; + + public enum SortOrder { + + unsorted(null), + queryname(SAMRecordQueryNameComparator.class), + coordinate(SAMRecordCoordinateComparator.class); + + private Class comparator; + + SortOrder(final Class comparatorClass) { + this.comparator = comparatorClass; + } + + public Class getComparator() { + return comparator; + } + } + + public enum GroupOrder { + none, query, reference + } + + private final Map mAttributes = + new HashMap(); + private List mSequences = + new ArrayList(); + private List mReadGroups = + new ArrayList(); + private final List mProgramRecords = new ArrayList(); + private final Map mSequenceMap = + new HashMap(); + private final Map mReadGroupMap = + new HashMap(); + private Map mProgramRecordMap = new HashMap(); + + public SAMFileHeader() { + setAttribute(VERSION_TAG, CURRENT_VERSION); + } + + public String getVersion() { + return (String) getAttribute("VN"); + } + + public String getCreator() { + return (String) getAttribute("CR"); + } + + public Object getAttribute(final String key) { + return mAttributes.get(key); + } + + public Set> getAttributes() { + return mAttributes.entrySet(); + } + + public List getSequences() { + return mSequences; + } + + public List getReadGroups() { + return mReadGroups; + } + + public SAMSequenceRecord getSequence(final String name) { + return mSequenceMap.get(name); + } + + public SAMReadGroupRecord getReadGroup(final String name) { + return mReadGroupMap.get(name); + } + + public void setSequences(final List list) { + mSequences = list; + mSequenceMap.clear(); + int index = 0; + for (final SAMSequenceRecord record : list) { + record.setSequenceIndex(index++); + mSequenceMap.put(record.getSequenceName(), record); + } + } + + public SAMSequenceRecord getSequence(final int sequenceIndex) { + if (sequenceIndex < 0 || sequenceIndex >= mSequences.size()) { + return null; + } + 
return mSequences.get(sequenceIndex); + } + + public int getSequenceIndex(final String sequenceName) { + final SAMSequenceRecord record = mSequenceMap.get(sequenceName); + if (record == null) { + return -1; + } + return record.getSequenceIndex(); + } + + public void setAttribute(final String key, final String value) { + mAttributes.put(key, value); + } + + public void setReadGroups(final List readGroups) { + mReadGroups = readGroups; + mReadGroupMap.clear(); + for (final SAMReadGroupRecord readGroupRecord : readGroups) { + mReadGroupMap.put(readGroupRecord.getReadGroupId(), readGroupRecord); + } + } + + public List getProgramRecords() { + return Collections.unmodifiableList(mProgramRecords); + } + + public void addProgramRecord(final SAMProgramRecord programRecord) { + this.mProgramRecords.add(programRecord); + this.mProgramRecordMap.put(programRecord.getProgramGroupId(), programRecord); + } + + public SAMProgramRecord getProgramRecord(final String name) { + return this.mProgramRecordMap.get(name); + } + + public SortOrder getSortOrder() { + if (getAttribute("SO") == null) { + return SortOrder.unsorted; + } + return SortOrder.valueOf((String)getAttribute("SO")); + } + + public void setSortOrder(final SortOrder so) { + setAttribute("SO", so.name()); + } + + public GroupOrder getGroupOrder() { + if (getAttribute("GO") == null) { + return GroupOrder.none; + } + return GroupOrder.valueOf((String)getAttribute("GO")); + } + + public void setGroupOrder(final GroupOrder go) { + setAttribute("GO", go.name()); + } + + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + final SAMFileHeader that = (SAMFileHeader) o; + + if (mAttributes != null ? !mAttributes.equals(that.mAttributes) : that.mAttributes != null) return false; + if (mProgramRecords != null ? !mProgramRecords.equals(that.mProgramRecords) : that.mProgramRecords != null) + return false; + if (mReadGroups != null ? 
!mReadGroups.equals(that.mReadGroups) : that.mReadGroups != null) return false; + if (mSequences != null ? !mSequences.equals(that.mSequences) : that.mSequences != null) return false; + + return true; + } + + @Override + public int hashCode() { + int result = mAttributes != null ? mAttributes.hashCode() : 0; + result = 31 * result + (mSequences != null ? mSequences.hashCode() : 0); + result = 31 * result + (mReadGroups != null ? mReadGroups.hashCode() : 0); + result = 31 * result + (mReadGroupMap != null ? mReadGroupMap.hashCode() : 0); + result = 31 * result + (mProgramRecords != null ? mProgramRecords.hashCode() : 0); + return result; + } +} diff --git a/lib/edu/mit/broad/sam/SAMFileReader.java b/lib/edu/mit/broad/sam/SAMFileReader.java new file mode 100644 index 0000000000..8c0e449191 --- /dev/null +++ b/lib/edu/mit/broad/sam/SAMFileReader.java @@ -0,0 +1,213 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ +package edu.mit.broad.sam; + + +import edu.mit.broad.sam.util.CloseableIterator; +import edu.mit.broad.sam.util.RuntimeIOException; +import edu.mit.broad.sam.util.BlockCompressedInputStream; + +import java.io.*; + + +/** + * Class for reading and querying SAM/BAM files. 
+ */ +public class SAMFileReader implements Iterable +{ + private boolean mIsBinary = false; + private BAMFileIndex mFileIndex = null; + private ReaderImplementation mReader = null; + + public enum ValidationStringency { + STRICT, // Do the right thing, throw an exception if something looks wrong + LENIENT, // Emit warnings but keep going if possible + SILENT; // Like LENIENT, only don't emit warning messages + + public static ValidationStringency DEFAULT_STRINGENCY = STRICT; + } + + /** + * Internal interface for SAM/BAM file reader implementations. + * Implemented as an abstract class to enforce better access control. + */ + static abstract class ReaderImplementation { + abstract SAMFileHeader getFileHeader(); + abstract CloseableIterator getIterator(); + abstract CloseableIterator query(String sequence, int start, int end, boolean contained); + abstract void close(); + // If true, emit warnings about format errors rather than throwing exceptions; + abstract void setValidationStringency(final ValidationStringency validationStringency); + } + + + public SAMFileReader(final InputStream stream) { + this(stream, false); + } + + public SAMFileReader(final File file) { + this(file, null, false); + } + + public SAMFileReader(final File file, final File indexFile) { + this(file, indexFile, false); + } + + /** + * Read a SAM or BAM file + * @param stream input SAM or BAM + * @param eagerDecode if true, decode SAM record entirely when reading it + */ + public SAMFileReader(final InputStream stream, final boolean eagerDecode) { + init(stream, eagerDecode); + } + + /** + * Read a SAM or BAM file, possibly with an index file if present + * @param file where to read from + * @param eagerDecode if true, decode SAM record entirely when reading it + */ + public SAMFileReader(final File file, final boolean eagerDecode) { + init(file, null, eagerDecode); + } + + /** + * Read a SAM or BAM file, possibly with an index file + * @param file where to read from + * @param indexFile 
location of index file, or null in order to use the default index file (if present) + * @param eagerDecode eagerDecode if true, decode SAM record entirely when reading it + */ + public SAMFileReader(final File file, final File indexFile, final boolean eagerDecode){ + init(file, indexFile, eagerDecode); + } + + public void close() { + if (mReader != null) { + mReader.close(); + } + if (mFileIndex != null) { + mFileIndex.close(); + } + mReader = null; + mFileIndex = null; + } + + public boolean isBinary() { + return mIsBinary; + } + + public boolean hasIndex() { + return (mFileIndex != null); + } + + public SAMFileHeader getFileHeader() { + return mReader.getFileHeader(); + } + + public void setValidationStringency(final ValidationStringency validationStringency) { + mReader.setValidationStringency(validationStringency); + } + + public CloseableIterator iterator() { + return mReader.getIterator(); + } + + public CloseableIterator query(final String sequence, final int start, final int end, final boolean contained) { + return mReader.query(sequence, start, end, contained); + } + + public CloseableIterator queryOverlapping(final String sequence, final int start, final int end) { + return query(sequence, start, end, false); + } + + public CloseableIterator queryContained(final String sequence, final int start, final int end) { + return query(sequence, start, end, true); + } + + private void init(final InputStream stream, final boolean eagerDecode) { + + try { + final BufferedInputStream bufferedStream = toBufferedStream(stream); + if (isBAMFile(bufferedStream)) { + mIsBinary = true; + mReader = new BAMFileReader(bufferedStream, eagerDecode); + } else if (isSAMFile(bufferedStream)) { + mIsBinary = false; + mReader = new SAMTextReader(bufferedStream); + } else { + throw new SAMFormatException("Unrecognized file format"); + } + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + private void init(final File file, File indexFile, final boolean 
eagerDecode) { + + try { + final BufferedInputStream bufferedStream = + new BufferedInputStream(new FileInputStream(file)); + if (isBAMFile(bufferedStream)) { + bufferedStream.close(); + mIsBinary = true; + final BAMFileReader reader = new BAMFileReader(file, eagerDecode); + mReader = reader; + if (indexFile == null) { + indexFile = findIndexFile(file); + } + if (indexFile != null) { + mFileIndex = new BAMFileIndex(indexFile); + reader.setFileIndex(mFileIndex); + } + } else if (isSAMFile(bufferedStream)) { + if (indexFile != null) { + bufferedStream.close(); + throw new RuntimeException("Cannot use index file with textual SAM file"); + } + mIsBinary = false; + mReader = new SAMTextReader(bufferedStream, file); + } else { + bufferedStream.close(); + throw new SAMFormatException("Unrecognized file format"); + } + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + private File findIndexFile(final File dataFile) { + final File indexFile = + new File(dataFile.getParent(), dataFile.getName() + ".bai"); + if (indexFile.exists()) { + return indexFile; + } else { + return null; + } + } + + private boolean isBAMFile(final InputStream stream) + throws IOException { + return BlockCompressedInputStream.isValidFile(stream); + } + + private boolean isSAMFile(final InputStream stream) { + // For now, assume every non-binary file is a SAM text file. 
+ return true; + } + + private BufferedInputStream toBufferedStream(final InputStream stream) { + if (stream instanceof BufferedInputStream) { + return (BufferedInputStream) stream; + } else { + return new BufferedInputStream(stream); + } + } +} diff --git a/lib/edu/mit/broad/sam/SAMFileWriter.java b/lib/edu/mit/broad/sam/SAMFileWriter.java new file mode 100644 index 0000000000..2d57854b5b --- /dev/null +++ b/lib/edu/mit/broad/sam/SAMFileWriter.java @@ -0,0 +1,23 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.sam; + +/** + * Interface for SAMText and BAM file writers. Clients need not care which they write to, + * once the object is constructed. + */ +public interface SAMFileWriter { + void addAlignment(SAMRecord alignment); + + /** + * Must be called or file will likely be defective. + */ + void close(); +} diff --git a/lib/edu/mit/broad/sam/SAMFileWriterFactory.java b/lib/edu/mit/broad/sam/SAMFileWriterFactory.java new file mode 100644 index 0000000000..3d75948557 --- /dev/null +++ b/lib/edu/mit/broad/sam/SAMFileWriterFactory.java @@ -0,0 +1,64 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.sam; + +import java.io.File; + +/** + * Create a SAMFileWriter for writing SAM or BAM. 
+ */ +public class SAMFileWriterFactory { + + /** + * Create a BAMFileWriter that is ready to receive SAMRecords + * @param header entire header. Sort order is determined by the sortOrder property of this arg + * @param presorted if true, SAMRecords must be added to the SAMFileWriter in order that agrees with header.sortOrder + * @param outputFile where to write the output. + * @return + */ + public SAMFileWriter makeBAMWriter(final SAMFileHeader header, final boolean presorted, final File outputFile) { + final BAMFileWriter ret = new BAMFileWriter(outputFile); + ret.setSortOrder(header.getSortOrder(), presorted); + ret.setHeader(header); + return ret; + } + + /** + * Create a SAMTextWriter that is ready to receive SAMRecords + * @param header entire header. Sort order is determined by the sortOrder property of this arg + * @param presorted if true, SAMRecords must be added to the SAMFileWriter in order that agrees with header.sortOrder + * @param outputFile where to write the output. + * @return + */ + public SAMFileWriter makeSAMWriter(final SAMFileHeader header, final boolean presorted, final File outputFile) { + final SAMTextWriter ret = new SAMTextWriter(outputFile); + ret.setSortOrder(header.getSortOrder(), presorted); + ret.setHeader(header); + return ret; + } + + /** + * Create either a SAM or a BAM writer based on examination of the outputFile + * @param header entire header. Sort order is determined by the sortOrder property of this arg + * @param presorted presorted if true, SAMRecords must be added to the SAMFileWriter in order that agrees with header.sortOrder + * @param outputFile + * @return outputFile where to write the output. 
Must end with .sam or .bam + */ + public SAMFileWriter makeSAMOrBAMWriter(final SAMFileHeader header, final boolean presorted, final File outputFile) { + final String filename = outputFile.getName(); + if (filename.endsWith(".bam")) { + return makeBAMWriter(header, presorted, outputFile); + } + if (filename.endsWith(".sam")) { + return makeSAMWriter(header, presorted, outputFile); + } + throw new IllegalArgumentException("SAM/BAM file should end with .sam or .bam: " + outputFile); + } +} diff --git a/lib/edu/mit/broad/sam/SAMFileWriterImpl.java b/lib/edu/mit/broad/sam/SAMFileWriterImpl.java new file mode 100644 index 0000000000..78521af447 --- /dev/null +++ b/lib/edu/mit/broad/sam/SAMFileWriterImpl.java @@ -0,0 +1,157 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.sam; + +import edu.mit.broad.sam.util.SortingCollection; + +import java.io.File; +import java.io.StringWriter; + +/** + * Base class for implementing SAM writer with any underlying format. + * Mostly this manages accumulation & sorting of SAMRecords when appropriate, + * and produces the text version of the header, since that seems to be a popular item + * in both text and binary file formats. + */ +abstract class SAMFileWriterImpl implements SAMFileWriter +{ + private static final int MAX_RECORDS_IN_RAM = 500000; + private SAMFileHeader.SortOrder sortOrder; + private SAMFileHeader header; + private SortingCollection alignmentSorter; + + // If true, records passed to addAlignment are already in the order specified by sortOrder + private boolean presorted; + + // These two fields are for validating presorted records. 
+ private SAMRecord prevAlignment; + private SAMRecordComparator presortedComparator; + + /** + * Must be called before calling writeHeader(). SortOrder value in the header passed + * to writeHeader() is ignored. If setSortOrder is not called, default is SortOrder.unsorted + * @param sortOrder + */ + public void setSortOrder(final SAMFileHeader.SortOrder sortOrder, final boolean presorted) { + if (header != null) { + throw new IllegalStateException("Cannot call SAMFileWriterImpl.setSortOrder after setHeader for " + + getFilename()); + } + this.sortOrder = sortOrder; + this.presorted = presorted; + } + + /** + * Must be called before addAlignment. + * @param header + */ + public void setHeader(final SAMFileHeader header) + { + this.header = header; + if (sortOrder == null) { + sortOrder = SAMFileHeader.SortOrder.unsorted; + } + header.setSortOrder(sortOrder); + final StringWriter headerTextBuffer = new StringWriter(); + new SAMTextHeaderCodec().encode(headerTextBuffer, header); + final String headerText = headerTextBuffer.toString(); + + writeHeader(headerText); + + if (presorted) { + if (sortOrder.equals(SAMFileHeader.SortOrder.unsorted)) { + presorted = false; + } else { + presortedComparator = makeComparator(); + } + } else if (!sortOrder.equals(SAMFileHeader.SortOrder.unsorted)) { + alignmentSorter = SortingCollection.newInstance(SAMRecord.class, + new BAMRecordCodec(header), makeComparator(), MAX_RECORDS_IN_RAM); + } + } + + protected SAMFileHeader getHeader() { + return header; + } + + private SAMRecordComparator makeComparator() { + switch (sortOrder) { + case coordinate: + return new SAMRecordCoordinateComparator(header); + case queryname: + return new SAMRecordQueryNameComparator(); + case unsorted: + return null; + } + throw new IllegalStateException("sortOrder should not be null"); + } + + public void addAlignment(final SAMRecord alignment) + { + if (sortOrder.equals(SAMFileHeader.SortOrder.unsorted)) { + if 
(!header.getGroupOrder().equals(SAMFileHeader.GroupOrder.none)) { + throw new UnsupportedOperationException("GroupOrder " + header.getGroupOrder() + " is not supported"); + } + writeAlignment(alignment); + } else if (presorted) { + assertPresorted(alignment); + writeAlignment(alignment); + } else { + alignmentSorter.add(alignment); + } + } + + private void assertPresorted(final SAMRecord alignment) { + if (prevAlignment != null) { + if (presortedComparator.fileOrderCompare(prevAlignment, alignment) > 0) { + throw new IllegalArgumentException("Alignments added out of order in SAMFileWriterImpl.addAlignment for " + + getFilename() + ". Sort order is " + this.sortOrder + ". Offending records are at [" + + prevAlignment.getReferenceName() + ":" + prevAlignment.getAlignmentStart() + "] and [" + + alignment.getReferenceName() + ":" + alignment.getAlignmentStart() + "]"); + } + } + prevAlignment = alignment; + } + + public final void close() + { + if (alignmentSorter != null) { + for (final SAMRecord alignment : alignmentSorter) { + writeAlignment(alignment); + } + alignmentSorter.cleanup(); + } + finish(); + } + + /** + * Writes the record to disk. Sort order has been taken care of by the time + * this method is called. + * @param alignment + */ + abstract protected void writeAlignment(SAMRecord alignment); + + /** + * Write the header to disk. Header object is available via getHeader(). + * @param textHeader for convenience if the implementation needs it. + */ + abstract protected void writeHeader(String textHeader); + + /** + * Do any required flushing here. + */ + abstract protected void finish(); + + /** + * For producing error messages. + * @return Output filename, or null if there isn't one. 
+ */ + abstract protected String getFilename(); +} diff --git a/lib/edu/mit/broad/sam/SAMFormatException.java b/lib/edu/mit/broad/sam/SAMFormatException.java new file mode 100644 index 0000000000..f055d10758 --- /dev/null +++ b/lib/edu/mit/broad/sam/SAMFormatException.java @@ -0,0 +1,30 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2008 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.sam; + +/** + * Thrown when a SAM file being read (text or binary) looks bad. + */ +public class SAMFormatException extends RuntimeException { + public SAMFormatException() { + } + + public SAMFormatException(final String s) { + super(s); + } + + public SAMFormatException(final String s, final Throwable throwable) { + super(s, throwable); + } + + public SAMFormatException(final Throwable throwable) { + super(throwable); + } +} diff --git a/lib/edu/mit/broad/sam/SAMLocusIterator.java b/lib/edu/mit/broad/sam/SAMLocusIterator.java new file mode 100644 index 0000000000..e494d389a2 --- /dev/null +++ b/lib/edu/mit/broad/sam/SAMLocusIterator.java @@ -0,0 +1,308 @@ +package edu.mit.broad.sam; + +import edu.mit.broad.sam.util.CloseableIterator; +import edu.mit.broad.arachne.GenomeMask; + +import java.util.*; + +/** + * Iterator that traverses a SAM File, accumulating information on a per-locus basis + */ +public class SAMLocusIterator implements Iterable, CloseableIterator { + public static class LocusInfo { + protected final String chrom; + protected final int position; + protected final List bases = new ArrayList(100); + protected final List qualities = new ArrayList(100); + protected final List negativeStrandFlags = new ArrayList(100); + + LocusInfo(final String chrom, 
final int position) { + this.chrom = chrom; + this.position = position; + } + + public void add(final Byte readBase, final Byte baseQuality, final boolean strand) { + bases.add(readBase); + qualities.add(baseQuality); + negativeStrandFlags.add(strand); + } + + public String getChrom() { return chrom; } + public int getPosition() { return position; } + public List getBases() { return bases; } + public List getQualities() { return qualities; } + public List getNegativeStrandFlags() { return negativeStrandFlags; } + + public String getBasesAsString() { return bytesToString(bases); } + + private static String bytesToString(final List data) { + if (data == null || data.size() == 0) { + return ""; + } + + final char[] chars = new char[data.size()]; + for (int i = 0; i < data.size(); i++) { + chars[i] = (char) (data.get(i) & 0xFF); + } + return new String(chars); + } + } + + + + + private final CloseableIterator underlyingIterator; + private final NotPrimarySkippingIterator it; + private final LinkedList complete = new LinkedList(); + private final LinkedList accumulator = new LinkedList(); + + private boolean includeNonPfReads = false; + private boolean includeDuplicates = false; + private int qualityScoreCutoff = -Integer.MAX_VALUE; + + private GenomeMask mask; + private int lastContig = 0; + private int lastPosition = 0; + + private boolean finishedAlignedReads = false; + + + // this should probably take a SAM + public SAMLocusIterator(final CloseableIterator samIterator) { + this.underlyingIterator = samIterator; + this.it = new NotPrimarySkippingIterator(samIterator); + } + + public Iterator iterator() { + return this; + } + + public void close() { + this.underlyingIterator.close(); + } + + private boolean samHasMore() { + return !finishedAlignedReads && it.hasCurrent(); + } + public boolean hasNext() { + return ((complete.size() > 0) || (accumulator.size() > 0) || (samHasMore()) || hasRemainingMaskBases()); + } + + private boolean hasRemainingMaskBases() { + if 
(mask == null) return false; + + // if there are more contigs in the mask, by definition some of them must have + // marked bases otherwise if we're in the last contig, but we're not at the last marked position, + // there is also more in the mask + return (lastContig <= mask.getMaxContig() || + (lastContig == mask.getMaxContig() && lastPosition <= mask.get(lastContig).nextSetBit(lastPosition+1))); + } + + public LocusInfo next() { + + // if we don't have any completed entries to return, try and make some! + while(complete.size() == 0 && samHasMore()) { + final SAMRecord rec = it.getCurrent(); + final String cigar = rec.getCigarString(); + + // as soon as we hit our first non-aligned read, we can stop! + if (cigar.equals("*")) { + this.finishedAlignedReads = true; + continue; + } + + // skip dupe reads, if so requested + if (!isIncludeDuplicates() && rec.getDuplicateReadFlag()) { it.advance(); continue; } + + // skip non-PF reads, if so requested + if (!isIncludeNonPfReads() && rec.getReadFailsVendorQualityCheckFlag()) { it.advance(); continue; } + + // when we switch contigs, emit everything in the accumulator + if (accumulator.size() > 0 && !accumulator.getFirst().chrom.equals(rec.getReferenceName())) { + while (accumulator.size() > 0) { + popLocus(); + } + } + + // pop off things we're not going to accumulate more coverage at the locus in question + while(accumulator.size() > 0 && accumulator.getFirst().position < rec.getAlignmentStart()) { + popLocus(); + } + + // check that it's a non-gapped alignment for now! + // TODO: handle gapped and clipped alignments + if (!cigar.matches("[0-9]+M")) { + System.out.println("Cannot deal with clipped or gapped alignments. 
CIGAR="+cigar); + System.exit(1); + } + + // at this point, either the list is empty or the head should + // be the same position as the first base of the read + + // interpret the CIGAR string and add the base info + for(int j=0; j < rec.getReadBases().length; j++) { + // if the position is empty, initialize it + if (j > accumulator.size() - 1) { + accumulator.add(new LocusInfo(rec.getReferenceName(), rec.getAlignmentStart() + j)); + } + + // if the quality score cutoff is met, accumulate the base info + if (rec.getBaseQualities()[j] >= getQualityScoreCutoff()) { + accumulator.get(j).add(rec.getReadBases()[j], rec.getBaseQualities()[j], rec.getReadNegativeStrandFlag()); + } + } + + + it.advance(); + } + + // if we have nothing to return to the user, and we're at the end of the SAM iterator, + // push everything into the complete queue + if (complete.size() == 0 && !samHasMore()) { + while(accumulator.size() > 0) { + popLocus(); + } + } + + // if there are completed entries, return those + if (complete.size() > 0) { + return complete.removeFirst(); + } else { + + // In this case... we're past the last read from SAM so see if we can + // fill out any more (zero coverage) entries from the mask + LocusInfo zeroResult = null; + while (zeroResult == null && lastContig <= mask.getMaxContig()) { + final int nextbit = mask.get(lastContig).nextSetBit(lastPosition+1); + + // try the next contig + if (nextbit == -1) { + lastContig++; + lastPosition = 0; + } else { + lastPosition = nextbit; + zeroResult = new LocusInfo(contigToChrom[lastContig], lastPosition); + } + } + + return zeroResult; + } + } + + /** + * Pop the first entry from the LocusInfo accumulator into the complete queue. In addition, + * check the GenomeMask and if there are intervening mask positions between the last popped base and the one + * about to be popped, put those on the complete queue as well. 
+ */ + private void popLocus() { + final LocusInfo li = accumulator.removeFirst(); + + // fill in any gaps based on our genome mask + final int liContig = chromToContig.get(li.getChrom()); + + // if we're not on the same contig, fill in the rest of the bits for the previous contig first... + if (lastContig < liContig) { + while (lastContig < liContig) { + int nextbit = 0; + + if (mask != null && mask.get(lastContig) != null) { + while (nextbit != -1) { + nextbit = mask.get(lastContig).nextSetBit(lastPosition + 1); + if (nextbit > -1) { + complete.addLast(new LocusInfo(contigToChrom[lastContig], nextbit)); + lastPosition = nextbit; + } + } + } + lastPosition=0; + lastContig++; + } + } + + // now that we're on the same contig, fill in any unfilled positions + // if we have some bits in the mask to fill in... + if (mask != null && mask.get(lastContig) != null && lastPosition + 1 < li.getPosition()) { + while (lastPosition + 1 < li.getPosition()) { + + final int nextbit = mask.get(lastContig).nextSetBit(lastPosition + 1); + + // if there are no more mask bits, or the next mask bit is + // at or after the current data, just continue on + if (nextbit == -1 || nextbit >= li.getPosition()) { break; } + + // otherwise, pop on the desired empty locus info + complete.addLast(new LocusInfo(contigToChrom[lastContig], nextbit)); + lastPosition = nextbit; + } + } + + // only add to the complete queue if it's in the mask (or we have no mask!) + if (mask == null || mask.get(chromToContig.get(li.getChrom()), li.getPosition())) { + complete.addLast(li); + } + + lastContig = liContig; + lastPosition = li.getPosition(); + + + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } + + // -------------------------------------------------------------------------------------------- + // Helper methods below this point... 
+ // -------------------------------------------------------------------------------------------- + + public void setGenomeMask(final GenomeMask mask) { this.mask = mask; } + public GenomeMask getGenomeMask() { return this.mask; } + + public boolean isIncludeNonPfReads() { return includeNonPfReads; } + public void setIncludeNonPfReads(final boolean includeNonPfReads) { this.includeNonPfReads = includeNonPfReads; } + + public boolean isIncludeDuplicates() { return includeDuplicates; } + public void setIncludeDuplicates(final boolean includeDuplicates) { this.includeDuplicates = includeDuplicates; } + + public int getQualityScoreCutoff() { return qualityScoreCutoff; } + public void setQualityScoreCutoff(final int qualityScoreCutoff) { this.qualityScoreCutoff = qualityScoreCutoff; } + + + // TODO: once we have a foundation method for access to reference data, this should all change + // to be based on that, rather than this strange mashup of contig and chrom + private static final Map chromToContig = new HashMap(); + { + for(int i=1; i<=22; i++) { + chromToContig.put("chr"+i, i); + } + chromToContig.put("chrM", 0); + chromToContig.put("chrX", 23); + chromToContig.put("chrY", 24); + chromToContig.put("chr1_random", 25); + chromToContig.put("chr2_random", 26); + chromToContig.put("chr3_random", 27); + chromToContig.put("chr4_random", 28); + chromToContig.put("chr5_random", 29); + chromToContig.put("chr6_random", 30); + chromToContig.put("chr7_random", 31); + chromToContig.put("chr8_random", 32); + chromToContig.put("chr9_random", 33); + chromToContig.put("chr10_random", 34); + chromToContig.put("chr11_random", 35); + chromToContig.put("chr13_random", 36); + chromToContig.put("chr15_random", 37); + chromToContig.put("chr16_random", 38); + chromToContig.put("chr17_random", 39); + chromToContig.put("chr18_random", 40); + chromToContig.put("chr19_random", 41); + chromToContig.put("chr21_random", 42); + chromToContig.put("chr22_random", 43); + 
chromToContig.put("chrX_random", 44); + } + + private static final String[] contigToChrom = new String[] { "chrM","chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chrX","chrY", + "chr1_random","chr2_random","chr3_random","chr4_random","chr5_random","chr6_random","chr7_random","chr8_random","chr9_random","chr10_random","chr11_random","chr13_random","chr15_random","chr16_random","chr17_random","chr18_random","chr19_random","chr21_random","chr22_random","chrX_random" }; + + + +} diff --git a/lib/edu/mit/broad/sam/SAMProgramRecord.java b/lib/edu/mit/broad/sam/SAMProgramRecord.java new file mode 100644 index 0000000000..d2597adb35 --- /dev/null +++ b/lib/edu/mit/broad/sam/SAMProgramRecord.java @@ -0,0 +1,85 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2008 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/
+package edu.mit.broad.sam;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * In-memory representation of a SAM @PG (program group) header record.
+ *
+ * NOTE(review): the patch text had all generic type arguments stripped by
+ * markup sanitization (e.g. "Map mAttributes", "Set> getAttributes()").
+ * They are reconstructed here from the surrounding method signatures, which
+ * use String keys and String values throughout.
+ */
+public class SAMProgramRecord {
+    public static final String PROGRAM_GROUP_ID_TAG = "ID";
+    private static final String PROGRAM_VERSION_TAG = "VN";
+    private static final String COMMAND_LINE_TAG = "CL";
+    private final String mProgramGroupId;
+    private final Map<String, String> mAttributes = new HashMap<String, String>();
+
+    public SAMProgramRecord(final String programGroupId) {
+        this.mProgramGroupId = programGroupId;
+    }
+
+    public String getProgramGroupId() {
+        return mProgramGroupId;
+    }
+
+    public String getAttribute(final String key) {
+        return mAttributes.get(key);
+    }
+
+    public void setAttribute(final String key, final String value) {
+        mAttributes.put(key, value);
+    }
+
+    public Set<Map.Entry<String, String>> getAttributes() {
+        return mAttributes.entrySet();
+    }
+
+    public String getProgramVersion() {
+        return getAttribute(PROGRAM_VERSION_TAG);
+    }
+
+    public void setProgramVersion(final String version) {
+        setAttribute(PROGRAM_VERSION_TAG, version);
+    }
+
+    public String getCommandLine() {
+        return getAttribute(COMMAND_LINE_TAG);
+    }
+
+    public void setCommandLine(final String commandLine) {
+        setAttribute(COMMAND_LINE_TAG, commandLine);
+    }
+
+    /**
+     * @return true if this == that except for the program group ID, which is arbitrary
+     */
+    public boolean equivalent(final SAMProgramRecord that) {
+        return mAttributes.equals(that.mAttributes);
+    }
+
+    @Override
+    public boolean equals(final Object o) {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+
+        final SAMProgramRecord that = (SAMProgramRecord) o;
+
+        if (mAttributes != null ? !mAttributes.equals(that.mAttributes) : that.mAttributes != null) return false;
+        if (mProgramGroupId != null ? !mProgramGroupId.equals(that.mProgramGroupId) : that.mProgramGroupId != null) return false;
+
+        return true;
+    }
+
+    @Override
+    public int hashCode() {
+        int result = mProgramGroupId != null ? mProgramGroupId.hashCode() : 0;
+        result = 31 * result + (mAttributes != null ? mAttributes.hashCode() : 0);
+        return result;
+    }
+}
diff --git a/lib/edu/mit/broad/sam/SAMReadGroupRecord.java b/lib/edu/mit/broad/sam/SAMReadGroupRecord.java
new file mode 100644
index 0000000000..3bdf1f6bb9
--- /dev/null
+++ b/lib/edu/mit/broad/sam/SAMReadGroupRecord.java
@@ -0,0 +1,84 @@
+/*
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2008 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever.
+ * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
+ * or functionality.
+ */
+package edu.mit.broad.sam;
+
+
+import java.util.*;
+
+/**
+ * Header information about a read group (SAM @RG header record).
+ *
+ * NOTE(review): generic type arguments reconstructed as String keys / Object
+ * values -- getAttribute() returns Object and getSample()/getLibrary() cast
+ * the result to String.
+ */
+public class SAMReadGroupRecord
+{
+    private String mReadGroupId = null;
+    private final Map<String, Object> mAttributes = new HashMap<String, Object>();
+    public static final String READ_GROUP_ID_TAG = "ID";
+    public static final String READ_GROUP_SAMPLE_TAG = "SM";
+    public static final String PREDICTED_MEDIAN_INSERT_SIZE_TAG = "PI";
+    public static final String DATE_RUN_PRODUCED_TAG = "DT";
+
+    public SAMReadGroupRecord(final String id) {
+        mReadGroupId = id;
+    }
+
+    public String getReadGroupId() {
+        return mReadGroupId;
+    }
+
+    public String getSample() {
+        return (String) getAttribute("SM");
+    }
+
+    public void setSample(final String value) {
+        setAttribute("SM", value);
+    }
+
+    public String getLibrary() {
+        return (String) getAttribute("LB");
+    }
+
+    public void setLibrary(final String value) {
+        setAttribute("LB", value);
+    }
+
+    public Object getAttribute(final String key) {
+        return mAttributes.get(key);
+    }
+
+    public void setAttribute(final String key, final Object value) {
+        mAttributes.put(key, value);
+    }
+
+    public Set<Map.Entry<String, Object>> getAttributes() {
+        return mAttributes.entrySet();
+    }
+
+    @Override
+    public boolean equals(final Object o) {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+
+        final SAMReadGroupRecord that = (SAMReadGroupRecord) o;
+
+        if (mAttributes != null ? !mAttributes.equals(that.mAttributes) : that.mAttributes != null) return false;
+        if (mReadGroupId != null ? !mReadGroupId.equals(that.mReadGroupId) : that.mReadGroupId != null) return false;
+
+        return true;
+    }
+
+    @Override
+    public int hashCode() {
+        int result = mReadGroupId != null ? mReadGroupId.hashCode() : 0;
+        result = 31 * result + (mAttributes != null ? mAttributes.hashCode() : 0);
+        return result;
+    }
+}
+
diff --git a/lib/edu/mit/broad/sam/SAMRecord.java b/lib/edu/mit/broad/sam/SAMRecord.java
new file mode 100644
index 0000000000..ca603994df
--- /dev/null
+++ b/lib/edu/mit/broad/sam/SAMRecord.java
@@ -0,0 +1,732 @@
+/*
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2008 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever.
+ * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
+ * or functionality.
+ */
+package edu.mit.broad.sam;
+
+
+import edu.mit.broad.sam.util.StringUtil;
+
+import java.util.*;
+
+/**
+ * Java binding for a SAM file record.
+ */
+public class SAMRecord
+{
+    public static final int UNKNOWN_MAPPING_QUALITY = 255;
+    public static final int NO_MAPPING_QUALITY = 0;
+    public static final String NO_ALIGNMENT_REFERENCE_NAME = "*";
+    public static final String NO_ALIGNMENT_CIGAR = "*";
+    public static final int NO_ALIGNMENT_START = 0;
+    public static final byte[] NULL_SEQUENCE = "*".getBytes();
+    public static final byte[] NULL_QUALS = "*".getBytes();
+    private static final int READ_PAIRED_FLAG = 0x1;
+    private static final int PROPER_PAIR_FLAG = 0x2;
+    private static final int READ_UNMAPPED_FLAG = 0x4;
+    private static final int MATE_UNMAPPED_FLAG = 0x8;
+    private static final int READ_STRAND_FLAG = 0x10;
+    private static final int MATE_STRAND_FLAG = 0x20;
+    private static final int FIRST_OF_PAIR_FLAG = 0x40;
+    private static final int SECOND_OF_PAIR_FLAG = 0x80;
+    private static final int NOT_PRIMARY_ALIGNMENT_FLAG = 0x100;
+    private static final int READ_FAILS_VENDOR_QUALITY_CHECK_FLAG = 0x200;
+    private static final int DUPLICATE_READ_FLAG = 0x400;
+
+
+    private String mReadName = null;
+    private byte[] mReadBases = NULL_SEQUENCE;
+    private byte[] mBaseQualities = NULL_QUALS;
+    private String mReferenceName = NO_ALIGNMENT_REFERENCE_NAME;
+    private int mAlignmentStart = NO_ALIGNMENT_START;
+    private int mMappingQuality = NO_MAPPING_QUALITY;
+    private String mCigarString = NO_ALIGNMENT_CIGAR;
+    private Cigar mCigar = null;
+    // NOTE(review): generic type arguments on the collections below were
+    // stripped by markup sanitization; reconstructed from usage in
+    // getAlignmentBlocks(), getAttributes() and setAttribute().
+    private List<AlignmentBlock> mAlignmentBlocks = null;
+    private int mFlags = 0;
+    private String mMateReferenceName = NO_ALIGNMENT_REFERENCE_NAME;
+    private int mMateAlignmentStart = 0;
+    private int mInferredInsertSize = 0;
+    private Map<String, Object> mAttributes = null;
+    private Integer mReferenceIndex = null;
+    private Integer mMateReferenceIndex = null;
+    private Integer mIndexingBin = null;
+
+    // Optional, but handy for looking of reference indices
+    private SAMFileHeader mHeader = null;
+
+
+    public SAMRecord() {
+    }
+
+    public String getReadName() {
+        return mReadName;
+    }
+
+    /**
+     * This method is preferred over getReadName().length(), because for BAMRecord
+     * it may be faster.
+     * @return length not including a null terminator
+     */
+    public int getReadNameLength() {
+        return mReadName.length();
+    }
+
+    public void setReadName(final String value) {
+        mReadName = value;
+    }
+
+    public String getReadString() {
+        return StringUtil.bytesToString(getReadBases());
+    }
+
+    public void setReadString(final String value) {
+        mReadBases = StringUtil.stringToBytes(value);
+    }
+
+    // Read bases, as bytes
+    public byte[] getReadBases() {
+        return mReadBases;
+    }
+
+    public void setReadBases(final byte[] value) {
+        mReadBases = value;
+    }
+
+    /**
+     * This method is preferred over getReadBases().length, because for BAMRecord it may be faster.
+     * @return number of bases in the read
+     */
+    public int getReadLength() {
+        return getReadBases().length;
+    }
+
+    // Base qualities, encoded as a FASTQ string
+    public String getBaseQualityString() {
+        return SAMUtils.phredToFastq(getBaseQualities());
+    }
+
+    public void setBaseQualityString(final String value) {
+        setBaseQualities(SAMUtils.fastqToPhred(value));
+    }
+
+    public byte[] getBaseQualities() {
+        return mBaseQualities;
+    }
+
+    public void setBaseQualities(final byte[] value) {
+        mBaseQualities = value;
+    }
+
+    public String getReferenceName() {
+        return mReferenceName;
+    }
+
+    public void setReferenceName(final String value) {
+        mReferenceName = value;
+        // Invalidate the cached index so it is recomputed lazily.
+        mReferenceIndex = null;
+    }
+
+    public Integer getReferenceIndex(final SAMFileHeader header) {
+        if (mReferenceIndex == null) {
+            if (mReferenceName == null) {
+                mReferenceIndex = -1;
+            } else if (NO_ALIGNMENT_REFERENCE_NAME.equals(mReferenceName)) {
+                mReferenceIndex = -1;
+            } else {
+                mReferenceIndex = header.getSequenceIndex(mReferenceName);
+            }
+        }
+        return mReferenceIndex;
+    }
+
+    public Integer getReferenceIndex() {
+        return getReferenceIndex(mHeader);
+    }
+
+
+    public void setReferenceIndex(final int referenceIndex, final SAMFileHeader header) {
+        mReferenceIndex = referenceIndex;
+        if (mReferenceIndex == -1) {
+            mReferenceName = NO_ALIGNMENT_REFERENCE_NAME;
+        } else {
+            mReferenceName = header.getSequence(referenceIndex).getSequenceName();
+        }
+    }
+
+    public void setReferenceIndex(final int referenceIndex) {
+        setReferenceIndex(referenceIndex, mHeader);
+    }
+
+
+    public String getMateReferenceName() {
+        return mMateReferenceName;
+    }
+
+    public void setMateReferenceName(final String mateReferenceName) {
+        this.mMateReferenceName = mateReferenceName;
+        mMateReferenceIndex = null;
+    }
+
+    public Integer getMateReferenceIndex(final SAMFileHeader header) {
+        if (mMateReferenceIndex == null) {
+            if (mMateReferenceName == null) {
+                mMateReferenceIndex = -1;
+            } else if (NO_ALIGNMENT_REFERENCE_NAME.equals(mMateReferenceName)){
+                mMateReferenceIndex = -1;
+            } else {
+                mMateReferenceIndex = header.getSequenceIndex(mMateReferenceName);
+            }
+        }
+        return mMateReferenceIndex;
+    }
+
+    public Integer getMateReferenceIndex() {
+        return getMateReferenceIndex(mHeader);
+    }
+
+    public void setMateReferenceIndex(final int referenceIndex, final SAMFileHeader header) {
+        mMateReferenceIndex = referenceIndex;
+        if (mMateReferenceIndex == -1) {
+            mMateReferenceName = NO_ALIGNMENT_REFERENCE_NAME;
+        } else {
+            mMateReferenceName = header.getSequence(referenceIndex).getSequenceName();
+        }
+    }
+
+    public void setMateReferenceIndex(final int referenceIndex) {
+        setMateReferenceIndex(referenceIndex, mHeader);
+    }
+
+
+    public int getAlignmentStart() {
+        return mAlignmentStart;
+    }
+
+    public void setAlignmentStart(final int value) {
+        mAlignmentStart = value;
+    }
+
+    public int getAlignmentEnd() {
+        final byte[] readBases = getReadBases();
+        // The null check after Arrays.equals is safe: Arrays.equals tolerates null.
+        if (mAlignmentStart == NO_ALIGNMENT_START || Arrays.equals(NULL_SEQUENCE, readBases) || readBases == null) {
+            return -1;
+        }
+        return mAlignmentStart + getCigar().getReferenceLength() - 1;
+    }
+
+    /**
+     * Returns the alignment start adjusted for clipped bases.  For example if the read
+     * has an alignment start of 100 but the first 4 bases were clipped (hard or soft clipped)
+     * then this method will return 96.
+     */
+    public int getUnclippedStart() {
+        int pos = getAlignmentStart();
+
+        for (final CigarElement cig : getCigar().getCigarElements()) {
+            final CigarOperator op = cig.getOperator();
+            if (op == CigarOperator.SOFT_CLIP || op == CigarOperator.HARD_CLIP) {
+                pos -= cig.getLength();
+            }
+            else {
+                break;
+            }
+        }
+
+        return pos;
+    }
+
+    /**
+     * Returns the alignment end adjusted for clipped bases.  For example if the read
+     * has an alignment end of 100 but the last 7 bases were clipped (hard or soft clipped)
+     * then this method will return 107.
+     */
+    public int getUnclippedEnd() {
+        int pos = getAlignmentEnd();
+        List<CigarElement> cigs = getCigar().getCigarElements();
+        for (int i=cigs.size() - 1; i>=0; --i) {
+            final CigarElement cig = cigs.get(i);
+            final CigarOperator op = cig.getOperator();
+
+            if (op == CigarOperator.SOFT_CLIP || op == CigarOperator.HARD_CLIP) {
+                pos += cig.getLength();
+            }
+            else {
+                break;
+            }
+        }
+
+        return pos;
+    }
+
+    public void setAlignmentEnd(final int value) {
+        throw new UnsupportedOperationException("Not supported: setAlignmentEnd");
+    }
+
+    public int getMateAlignmentStart() {
+        return mMateAlignmentStart;
+    }
+
+    public void setMateAlignmentStart(final int mateAlignmentStart) {
+        this.mMateAlignmentStart = mateAlignmentStart;
+    }
+
+    public int getInferredInsertSize() {
+        return mInferredInsertSize;
+    }
+
+    public void setInferredInsertSize(final int inferredInsertSize) {
+        this.mInferredInsertSize = inferredInsertSize;
+    }
+
+    public int getMappingQuality() {
+        return mMappingQuality;
+    }
+
+    public void setMappingQuality(final int value) {
+        mMappingQuality = value;
+    }
+
+    public String getCigarString() {
+        if (mCigarString == null && getCigar() != null) {
+            mCigarString = TextCigarCodec.getSingleton().encode(getCigar());
+        }
+        return mCigarString;
+    }
+
+    public void setCigarString(final String value) {
+        mCigarString = value;
+        mCigar = null;
+    }
+
+    public Cigar getCigar() {
+        if (mCigar == null && mCigarString != null) {
+            mCigar = TextCigarCodec.getSingleton().decode(mCigarString);
+        }
+        return mCigar;
+    }
+
+    /**
+     * This method is preferred over getCigar().getNumElements(), because for BAMRecord it may be faster.
+     * @return number of cigar elements (number + operator) in the cigar string
+     */
+    public int getCigarLength() {
+        return getCigar().numCigarElements();
+    }
+
+    public void setCigar(final Cigar cigar) {
+        this.mCigar = cigar;
+        mCigarString = null;
+    }
+
+    public int getFlags() {
+        return mFlags;
+    }
+
+    public void setFlags(final int value) {
+        mFlags = value;
+    }
+
+    /**
+     * the read is paired in sequencing, no matter whether it is mapped in a pair
+     */
+    public boolean getReadPairedFlag() {
+        return (mFlags & READ_PAIRED_FLAG) != 0;
+    }
+
+    private void requireReadPaired() {
+        if (!getReadPairedFlag()) {
+            throw new IllegalStateException("Inappropriate call if not paired read");
+        }
+    }
+
+    /**
+     * the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment)
+     */
+    public boolean getProperPairFlag() {
+        requireReadPaired();
+        return (mFlags & PROPER_PAIR_FLAG) != 0;
+    }
+
+    /**
+     * the query sequence itself is unmapped
+     */
+    public boolean getReadUnmappedFlag() {
+        return (mFlags & READ_UNMAPPED_FLAG) != 0;
+    }
+
+    /**
+     * the mate is unmapped
+     */
+    public boolean getMateUnmappedFlag() {
+        requireReadPaired();
+        return (mFlags & MATE_UNMAPPED_FLAG) != 0;
+    }
+
+    /**
+     * strand of the query (false for forward; true for reverse strand)
+     */
+    public boolean getReadNegativeStrandFlag() {
+        return (mFlags & READ_STRAND_FLAG) != 0;
+    }
+
+    /**
+     * strand of the mate (false for forward; true for reverse strand)
+     */
+    public boolean getMateNegativeStrandFlag() {
+        requireReadPaired();
+        return (mFlags & MATE_STRAND_FLAG) != 0;
+    }
+
+    /**
+     * the read is the first read in a pair
+     */
+    public boolean getFirstOfPairFlag() {
+        requireReadPaired();
+        return (mFlags & FIRST_OF_PAIR_FLAG) != 0;
+    }
+
+    /**
+     * the read is the second read in a pair
+     */
+    public boolean getSecondOfPairFlag() {
+        requireReadPaired();
+        return (mFlags & SECOND_OF_PAIR_FLAG) != 0;
+    }
+
+    /**
+     * the alignment is not primary (a read having split hits may have multiple primary alignment records)
+     */
+    public boolean getNotPrimaryAlignmentFlag() {
+        return (mFlags & NOT_PRIMARY_ALIGNMENT_FLAG) != 0;
+    }
+
+    /**
+     * the read fails platform/vendor quality checks
+     */
+    public boolean getReadFailsVendorQualityCheckFlag() {
+        return (mFlags & READ_FAILS_VENDOR_QUALITY_CHECK_FLAG) != 0;
+    }
+
+    /**
+     * the read is either a PCR duplicate or an optical duplicate
+     */
+    public boolean getDuplicateReadFlag() {
+        return (mFlags & DUPLICATE_READ_FLAG) != 0;
+    }
+
+    /**
+     * the read is paired in sequencing, no matter whether it is mapped in a pair
+     */
+    public void setReadPairedFlag(final boolean flag) {
+        setFlag(flag, READ_PAIRED_FLAG);
+    }
+
+    /**
+     * the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment)
+     */
+    public void setProperPairFlag(final boolean flag) {
+        setFlag(flag, PROPER_PAIR_FLAG);
+    }
+
+    /**
+     * the query sequence itself is unmapped
+     */
+    public void setReadUmappedFlag(final boolean flag) {
+        setFlag(flag, READ_UNMAPPED_FLAG);
+    }
+
+    /**
+     * the mate is unmapped
+     */
+    public void setMateUnmappedFlag(final boolean flag) {
+        setFlag(flag, MATE_UNMAPPED_FLAG);
+    }
+
+    /**
+     * strand of the query (false for forward; true for reverse strand)
+     */
+    public void setReadNegativeStrandFlag(final boolean flag) {
+        setFlag(flag, READ_STRAND_FLAG);
+    }
+
+    /**
+     * strand of the mate (false for forward; true for reverse strand)
+     */
+    public void setMateNegativeStrandFlag(final boolean flag) {
+        setFlag(flag, MATE_STRAND_FLAG);
+    }
+
+    /**
+     * the read is the first read in a pair
+     */
+    public void setFirstOfPairFlag(final boolean flag) {
+        setFlag(flag, FIRST_OF_PAIR_FLAG);
+    }
+
+    /**
+     * the read is the second read in a pair
+     */
+    public void setSecondOfPairFlag(final boolean flag) {
+        setFlag(flag, SECOND_OF_PAIR_FLAG);
+    }
+
+    /**
+     * the alignment is not primary (a read having split hits may have multiple primary alignment records)
+     */
+    public void setNotPrimaryAlignmentFlag(final boolean flag) {
+        setFlag(flag, NOT_PRIMARY_ALIGNMENT_FLAG);
+    }
+
+    /**
+     * the read fails platform/vendor quality checks
+     */
+    public void setReadFailsVendorQualityCheckFlag(final boolean flag) {
+        setFlag(flag, READ_FAILS_VENDOR_QUALITY_CHECK_FLAG);
+    }
+
+    /**
+     * the read is either a PCR duplicate or an optical duplicate
+     */
+    public void setDuplicateReadFlag(final boolean flag) {
+        setFlag(flag, DUPLICATE_READ_FLAG);
+    }
+
+    private void setFlag(final boolean flag, final int bit) {
+        if (flag) {
+            mFlags |= bit;
+        } else {
+            mFlags &= ~bit;
+        }
+    }
+
+    public Object getAttribute(final String key) {
+        if (mAttributes == null) {
+            return null;
+        }
+        return mAttributes.get(key);
+    }
+
+    public void setAttribute(final String key, final Object value) {
+        if (mAttributes == null) {
+            // LinkedHashMap preserves insertion order for text output.
+            mAttributes = new LinkedHashMap<String, Object>();
+        }
+        mAttributes.put(key, value);
+    }
+
+    public Set<Map.Entry<String, Object>> getAttributes() {
+        if (mAttributes == null) {
+            return null;
+        }
+        return mAttributes.entrySet();
+    }
+
+    public Integer getIndexingBin() {
+        return mIndexingBin;
+    }
+
+    public void setIndexingBin(final Integer mIndexingBin) {
+        this.mIndexingBin = mIndexingBin;
+    }
+
+    public SAMFileHeader getHeader() {
+        return mHeader;
+    }
+
+    public void setHeader(final SAMFileHeader mHeader) {
+        this.mHeader = mHeader;
+    }
+
+    /**
+     * If this record has a valid binary representation of the variable-length portion of a binary record stored,
+     * return that byte array, otherwise return null.  This will never be true for SAMRecords.  It will be true
+     * for BAMRecords that have not been eagerDecoded(), and for which none of the data in the variable-length
+     * portion has been changed.
+     */
+    public byte[] getVariableBinaryRepresentation() {
+        return null;
+    }
+
+    /**
+     * Depending on the concrete implementation, the binary file size of attributes may be known without
+     * computing them all.
+     * @return binary file size of attribute, if known, else -1
+     */
+    public int getAttributesBinarySize() {
+        return -1;
+    }
+
+    public String format() {
+        final StringBuilder buffer = new StringBuilder();
+        addField(buffer, getReadName(), null, null);
+        addField(buffer, getFlags(), null, null);
+        addField(buffer, getReferenceName(), null, "*");
+        addField(buffer, getAlignmentStart(), 0, "*");
+        addField(buffer, getMappingQuality(), 0, "0");
+        addField(buffer, getCigarString(), null, "*");
+        addField(buffer, getMateReferenceName(), null, "*");
+        addField(buffer, getMateAlignmentStart(), 0, "*");
+        addField(buffer, getInferredInsertSize(), 0, "*");
+        addField(buffer, getReadString(), null, "*");
+        addField(buffer, getBaseQualityString(), null, "*");
+        if (mAttributes != null) {
+            for (final Map.Entry<String, Object> entry : getAttributes()) {
+                addField(buffer, formatTagValue(entry.getKey(), entry.getValue()));
+            }
+        }
+        return buffer.toString();
+    }
+
+    private void addField(final StringBuilder buffer, final Object value, final Object defaultValue, final String defaultString) {
+        if (safeEquals(value, defaultValue)) {
+            addField(buffer, defaultString);
+        } else if (value == null) {
+            addField(buffer, "");
+        } else {
+            addField(buffer, value.toString());
+        }
+    }
+
+    private void addField(final StringBuilder buffer, final String field) {
+        if (buffer.length() > 0) {
+            buffer.append('\t');
+        }
+        buffer.append(field);
+    }
+
+    private String formatTagValue(final String key, final Object value) {
+        if (value == null || value instanceof String) {
+            return key + ":Z:" + value;
+        } else if (value instanceof Integer) {
+            return key + ":i:" + value;
+        } else if (value instanceof Character) {
+            return key + ":A:" + value;
+        } else if (value instanceof Float) {
+            return key + ":f:" + value;
+        } else if (value instanceof byte[]) {
+            return key + ":H:" + SAMUtils.bytesToHexString((byte[]) value);
+        } else {
+            throw new RuntimeException("Unexpected value type for key " + key +
+                                       ": " + value);
+        }
+    }
+
+    private boolean safeEquals(final Object o1, final Object o2) {
+        if (o1 == o2) {
+            return true;
+        } else if (o1 == null || o2 == null) {
+            return false;
+        } else {
+            return o1.equals(o2);
+        }
+    }
+
+    /**
+     * Force all lazily-initialized data members to be initialized.  If a subclass overrides this method,
+     * typically it should also call  super method.
+     */
+    protected void eagerDecode() {
+        getCigar();
+        getCigarString();
+    }
+
+    /**
+     * Returns blocks of the read sequence that have been aligned directly to the
+     * reference sequence. Note that clipped portions of the read and inserted and
+     * deleted bases (vs. the reference) are not represented in the alignment blocks.
+     */
+    public List<AlignmentBlock> getAlignmentBlocks() {
+        if (this.mAlignmentBlocks != null) return this.mAlignmentBlocks;
+
+        final Cigar cigar = getCigar();
+        if (cigar == null) return Collections.emptyList();
+
+
+        this.mAlignmentBlocks = new ArrayList<AlignmentBlock>();
+        int readBase = 1;
+        int refBase  = getAlignmentStart();
+
+        for (final CigarElement e : cigar.getCigarElements()) {
+            switch (e.getOperator()) {
+                case H : break; // ignore hard clips
+                case P : break; // ignore pads
+                case S : readBase += e.getLength(); break; // soft clip read bases
+                case N : refBase += e.getLength(); break;  // reference skip
+                case D : refBase += e.getLength(); break;
+                case I : readBase += e.getLength(); break;
+                case M :
+                    final int length = e.getLength();
+                    this.mAlignmentBlocks.add(new AlignmentBlock(readBase, refBase, length));
+                    readBase += length;
+                    refBase  += length;
+                    break;
+                default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator());
+            }
+        }
+
+        return this.mAlignmentBlocks;
+    }
+
+    @Override
+    public boolean equals(final Object o) {
+        if (this == o) return true;
+        if (!(o instanceof SAMRecord)) return false;
+
+        final SAMRecord samRecord = (SAMRecord) o;
+        eagerDecode();
+        samRecord.eagerDecode();
+
+        if (mAlignmentStart != samRecord.mAlignmentStart) return false;
+        if (mFlags != samRecord.mFlags) return false;
+        if (mInferredInsertSize != samRecord.mInferredInsertSize) return false;
+        if (mMappingQuality != samRecord.mMappingQuality) return false;
+        if (mMateAlignmentStart != samRecord.mMateAlignmentStart) return false;
+        if (mAttributes != null ? !mAttributes.equals(samRecord.mAttributes) : samRecord.mAttributes != null)
+            return false;
+        if (!Arrays.equals(mBaseQualities, samRecord.mBaseQualities)) return false;
+        if (mCigar != null ? !mCigar.equals(samRecord.mCigar) : samRecord.mCigar != null)
+            return false;
+        if (mIndexingBin != null ? !mIndexingBin.equals(samRecord.mIndexingBin) : samRecord.mIndexingBin != null)
+            return false;
+        if (mMateReferenceIndex != null ? !mMateReferenceIndex.equals(samRecord.mMateReferenceIndex) : samRecord.mMateReferenceIndex != null)
+            return false;
+        if (mMateReferenceName != null ? !mMateReferenceName.equals(samRecord.mMateReferenceName) : samRecord.mMateReferenceName != null)
+            return false;
+        if (!Arrays.equals(mReadBases, samRecord.mReadBases)) return false;
+        if (mReadName != null ? !mReadName.equals(samRecord.mReadName) : samRecord.mReadName != null) return false;
+        if (mReferenceIndex != null ? !mReferenceIndex.equals(samRecord.mReferenceIndex) : samRecord.mReferenceIndex != null)
+            return false;
+        if (mReferenceName != null ? !mReferenceName.equals(samRecord.mReferenceName) : samRecord.mReferenceName != null)
+            return false;
+
+        return true;
+    }
+
+    @Override
+    public int hashCode() {
+        eagerDecode();
+        int result = mReadName != null ? mReadName.hashCode() : 0;
+        result = 31 * result + (mReadBases != null ? Arrays.hashCode(mReadBases) : 0);
+        result = 31 * result + (mBaseQualities != null ? Arrays.hashCode(mBaseQualities) : 0);
+        result = 31 * result + (mReferenceName != null ? mReferenceName.hashCode() : 0);
+        result = 31 * result + mAlignmentStart;
+        result = 31 * result + mMappingQuality;
+        result = 31 * result + (mCigarString != null ? mCigarString.hashCode() : 0);
+        result = 31 * result + mFlags;
+        result = 31 * result + (mMateReferenceName != null ? mMateReferenceName.hashCode() : 0);
+        result = 31 * result + mMateAlignmentStart;
+        result = 31 * result + mInferredInsertSize;
+        result = 31 * result + (mAttributes != null ? mAttributes.hashCode() : 0);
+        result = 31 * result + (mReferenceIndex != null ? mReferenceIndex.hashCode() : 0);
+        result = 31 * result + (mMateReferenceIndex != null ? mMateReferenceIndex.hashCode() : 0);
+        result = 31 * result + (mIndexingBin != null ? mIndexingBin.hashCode() : 0);
+        return result;
+    }
+}
+
diff --git a/lib/edu/mit/broad/sam/SAMRecordComparator.java b/lib/edu/mit/broad/sam/SAMRecordComparator.java
new file mode 100644
index 0000000000..0a2afd8389
--- /dev/null
+++ b/lib/edu/mit/broad/sam/SAMRecordComparator.java
@@ -0,0 +1,23 @@
+/*
+* The Broad Institute
+* SOFTWARE COPYRIGHT NOTICE AGREEMENT
+* This software and its documentation are copyright 2009 by the
+* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+*
+* This software is supplied without any warranty or guaranteed support whatsoever. Neither
+* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+*/
+package edu.mit.broad.sam;
+
+import java.util.Comparator;
+
+public interface SAMRecordComparator extends Comparator<SAMRecord> {
+
+    /**
+     * Less stringent compare method than the regular compare.  If the two records
+     * are equal enough that their ordering in a sorted SAM file would be arbitrary,
+     * this method returns 0.
+     * @return
+     */
+    public int fileOrderCompare(SAMRecord samRecord1, SAMRecord samRecord2);
+}
diff --git a/lib/edu/mit/broad/sam/SAMRecordCoordinateComparator.java b/lib/edu/mit/broad/sam/SAMRecordCoordinateComparator.java
new file mode 100644
index 0000000000..e195d97088
--- /dev/null
+++ b/lib/edu/mit/broad/sam/SAMRecordCoordinateComparator.java
@@ -0,0 +1,58 @@
+/*
+* The Broad Institute
+* SOFTWARE COPYRIGHT NOTICE AGREEMENT
+* This software and its documentation are copyright 2009 by the
+* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+*
+* This software is supplied without any warranty or guaranteed support whatsoever. Neither
+* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+*/
+package edu.mit.broad.sam;
+
+/**
+ * Comparator for sorting SAMRecords by coordinate.  Note that the header is required because
+ * the order of sequences in the header defines the major sort order.
+ */
+public class SAMRecordCoordinateComparator implements SAMRecordComparator {
+    private final SAMFileHeader header;
+
+    public SAMRecordCoordinateComparator(final SAMFileHeader header) {
+        this.header = header;
+    }
+
+    public int compare(final SAMRecord samRecord1, final SAMRecord samRecord2) {
+        final int cmp = fileOrderCompare(samRecord1, samRecord2);
+        if (cmp != 0) {
+            return cmp;
+        }
+        // Tie-break ties at the same coordinate: by read name when strands match,
+        // otherwise forward-strand records sort first.
+        if (samRecord1.getReadNegativeStrandFlag() == samRecord2.getReadNegativeStrandFlag()) {
+            return samRecord1.getReadName().compareTo(samRecord2.getReadName());
+        }
+        else {
+            return (samRecord1.getReadNegativeStrandFlag()? 1: -1);
+        }
+    }
+
+    /**
+     * Less stringent compare method than the regular compare.  If the two records
+     * are equal enough that their ordering in a sorted SAM file would be arbitrary,
+     * this method returns 0.  If read is paired and unmapped, use the mate mapping to sort.
+     *
+     * @return
+     */
+    public int fileOrderCompare(final SAMRecord samRecord1, final SAMRecord samRecord2) {
+        int refIndex1 = samRecord1.getReferenceIndex(header);
+        int refIndex2 = samRecord2.getReferenceIndex(header);
+        // Unmapped (-1) records sort after all mapped records.
+        if (refIndex1 == -1) {
+            return (refIndex2 == -1? 0: 1);
+        } else if (refIndex2 == -1) {
+            return -1;
+        }
+        int cmp = refIndex1 - refIndex2;
+        if (cmp != 0) {
+            return cmp;
+        }
+        return samRecord1.getAlignmentStart() - samRecord2.getAlignmentStart();
+    }
+}
diff --git a/lib/edu/mit/broad/sam/SAMRecordQueryNameComparator.java b/lib/edu/mit/broad/sam/SAMRecordQueryNameComparator.java
new file mode 100644
index 0000000000..3318488b13
--- /dev/null
+++ b/lib/edu/mit/broad/sam/SAMRecordQueryNameComparator.java
@@ -0,0 +1,38 @@
+/*
+* The Broad Institute
+* SOFTWARE COPYRIGHT NOTICE AGREEMENT
+* This software and its documentation are copyright 2009 by the
+* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+*
+* This software is supplied without any warranty or guaranteed support whatsoever. Neither
+* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+*/
+package edu.mit.broad.sam;
+
+/**
+ * For "queryname" ordering of SAMRecords
+ */
+public class SAMRecordQueryNameComparator implements SAMRecordComparator {
+
+    public int compare(final SAMRecord samRecord1, final SAMRecord samRecord2) {
+        final int cmp = fileOrderCompare(samRecord1, samRecord2);
+        if (cmp != 0) {
+            return cmp;
+        }
+        if (samRecord1.getReadNegativeStrandFlag() == samRecord2.getReadNegativeStrandFlag()) {
+            return 0;
+        }
+        return (samRecord1.getReadNegativeStrandFlag()? 1: -1);
+    }
+
+    /**
+     * Less stringent compare method than the regular compare.  If the two records
+     * are equal enough that their ordering in a sorted SAM file would be arbitrary,
+     * this method returns 0.
+     *
+     * @return
+     */
+    public int fileOrderCompare(final SAMRecord samRecord1, final SAMRecord samRecord2) {
+        return samRecord1.getReadName().compareTo(samRecord2.getReadName());
+    }
+}
diff --git a/lib/edu/mit/broad/sam/SAMRecordSetBuilder.java b/lib/edu/mit/broad/sam/SAMRecordSetBuilder.java
new file mode 100644
index 0000000000..6e6e2714fd
--- /dev/null
+++ b/lib/edu/mit/broad/sam/SAMRecordSetBuilder.java
@@ -0,0 +1,274 @@
+package edu.mit.broad.sam;
+
+import edu.mit.broad.sam.util.CloseableIterator;
+import edu.mit.broad.sam.util.CoordMath;
+import edu.mit.broad.sam.util.RuntimeIOException;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.*;
+
+/**
+ * Factory class for creating SAMRecords for testing purposes. Various methods can be called
+ * to add new SAM records (or pairs of records) to a list which can then be returned at
+ * any point. The records must reference human chromosomes (excluding randoms etc.).
+ *
+ * Although this is a class for testing, it is in the src tree because it is included in the sam jarfile.
+ * + * @author Tim Fennell + */ +public class SAMRecordSetBuilder implements Iterable { + private static final String[] chroms = { + "chrM", "chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", + "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", + "chr21", "chr22", "chrX", "chrY" + }; + private static final byte[] BASES = {'A','C','G','T'}; + private static final String READ_GROUP_ID = "1"; + private static final String SAMPLE = "FREE_SAMPLE"; + private final Random random = new Random(); + + private SAMFileHeader header; + private Collection records; + + private final int readLength = 36 ; + + private SAMProgramRecord programRecord = null; + + + /** + * Constructs a new SAMRecordSetBuilder with all the data needed to keep the records + * sorted in coordinate order. + */ + public SAMRecordSetBuilder() { + this(true, SAMFileHeader.SortOrder.coordinate); + } + + public SAMRecordSetBuilder(final boolean sortForMe, final SAMFileHeader.SortOrder sortOrder) { + final List sequences = new ArrayList(); + for (final String chrom : chroms) { + sequences.add(new SAMSequenceRecord(chrom)); + } + + this.header = new SAMFileHeader(); + this.header.setSequences(sequences); + this.header.setSortOrder(sortOrder); + if (sortForMe) { + final SAMRecordComparator comparator; + if (sortOrder == SAMFileHeader.SortOrder.queryname) { + comparator = new SAMRecordQueryNameComparator(); + } else { + comparator = new SAMRecordCoordinateComparator(header); + } + this.records = new TreeSet(comparator); + } else { + this.records = new ArrayList(); + } + final SAMReadGroupRecord readGroupRecord = new SAMReadGroupRecord(READ_GROUP_ID); + readGroupRecord.setSample(SAMPLE); + final List readGroups = new ArrayList(); + readGroups.add(readGroupRecord); + this.header.setReadGroups(readGroups); + } + + /** + * Adds the given program record to the header, and assigns the PG tag to any SAMRecords + * created after it has been added. 
May be called multiple times in order to assign different + * PG IDs to different SAMRecords. programRecord may be null to stop assignment of PG tag. + * It is up to the caller to ensure that program record IDs do not collide. + */ + public void setProgramRecord(SAMProgramRecord programRecord) { + this.programRecord = programRecord; + if (programRecord != null) { + this.header.addProgramRecord(programRecord); + } + } + + /** Returns the accumulated list of sam records. */ + public Collection getRecords() { return this.records; } + + /** Returns a CloseableIterator over the collection of SAMRecords. */ + public CloseableIterator iterator() { + return new CloseableIterator() { + private final Iterator iterator = records.iterator(); + public void close() { /** Do nothing. */ } + public boolean hasNext() { return this.iterator.hasNext(); } + public SAMRecord next() { return this.iterator.next(); } + public void remove() { this.iterator.remove(); } + }; + } + + /** + * Adds a skeletal fragment (non-PE) record to the set using the provided + * contig start and strand information. + */ + public void addFrag(final String name, final int contig, final int start, final boolean negativeStrand) { + final SAMRecord rec = new SAMRecord(); + rec.setReadName(name); + rec.setReferenceIndex(contig, this.header); + rec.setReferenceName(chroms[contig]); + rec.setAlignmentStart(start); + rec.setReadNegativeStrandFlag(negativeStrand); + rec.setCigarString(readLength + "M"); + rec.setMappingQuality(255); + rec.setAttribute(SAMTag.RG.name(), READ_GROUP_ID); + if (programRecord != null) { + rec.setAttribute(SAMTag.PG.name(), programRecord.getProgramGroupId()); + } + + fillInBasesAndQualities(rec); + this.records.add(rec); + } + + /** Adds an unmapped fragment read to the builder. 
*/ + public void addUnmappedFragment(final String name) { + final SAMRecord rec = new SAMRecord(); + rec.setReadName(name); + rec.setReadUmappedFlag(true); + rec.setAttribute(SAMTag.RG.name(), READ_GROUP_ID); + if (programRecord != null) { + rec.setAttribute(SAMTag.PG.name(), programRecord.getProgramGroupId()); + } + fillInBasesAndQualities(rec); + this.records.add(rec); + } + + /** + * Adds a skeletal fragment (non-PE) record to the set using the provided + * contig start and strand information. The pair is assumed to be a well + * formed pair sitting on a single contig. + */ + public void addPair(final String name, final int contig, final int start1, final int start2) { + final SAMRecord end1 = new SAMRecord(); + final SAMRecord end2 = new SAMRecord(); + final boolean end1IsFirstOfPair = this.random.nextBoolean(); + + end1.setReadName(name); + end1.setReferenceIndex(contig, this.header); + end1.setAlignmentStart(start1); + end1.setReadNegativeStrandFlag(false); + end1.setCigarString(readLength + "M"); + end1.setMappingQuality(255); + end1.setReadPairedFlag(true); + end1.setProperPairFlag(true); + end1.setMateReferenceIndex(contig, this.header); + end1.setMateAlignmentStart(start2); + end1.setMateNegativeStrandFlag(true); + end1.setFirstOfPairFlag(end1IsFirstOfPair); + end1.setSecondOfPairFlag(!end1IsFirstOfPair); + end1.setInferredInsertSize((int) CoordMath.getLength(start1, CoordMath.getEnd(start2, this.readLength))); + end1.setAttribute(SAMTag.RG.name(), READ_GROUP_ID); + if (programRecord != null) { + end1.setAttribute(SAMTag.PG.name(), programRecord.getProgramGroupId()); + } + fillInBasesAndQualities(end1); + + end2.setReadName(name); + end2.setReferenceIndex(contig, this.header); + end2.setAlignmentStart(start2); + end2.setReadNegativeStrandFlag(true); + end2.setCigarString(readLength + "M"); + end2.setMappingQuality(255); + end2.setReadPairedFlag(true); + end2.setProperPairFlag(true); + end2.setMateReferenceIndex(contig, this.header); + 
end2.setMateAlignmentStart(start1); + end2.setMateNegativeStrandFlag(false); + end2.setFirstOfPairFlag(!end1IsFirstOfPair); + end2.setSecondOfPairFlag(end1IsFirstOfPair); + end2.setInferredInsertSize(end1.getInferredInsertSize()); + end2.setAttribute(SAMTag.RG.name(), READ_GROUP_ID); + if (programRecord != null) { + end2.setAttribute(SAMTag.PG.name(), programRecord.getProgramGroupId()); + } + fillInBasesAndQualities(end2); + + this.records.add(end1); + this.records.add(end2); + } + + /** Adds a pair with both ends unmapped to the builder. */ + public void addUnmappedPair(final String name) { + final SAMRecord end1 = new SAMRecord(); + final SAMRecord end2 = new SAMRecord(); + final boolean end1IsFirstOfPair = this.random.nextBoolean(); + + end1.setReadName(name); + end1.setReadPairedFlag(false); + end1.setReadUmappedFlag(true); + end1.setProperPairFlag(false); + end1.setFirstOfPairFlag(end1IsFirstOfPair); + end1.setSecondOfPairFlag(!end1IsFirstOfPair); + end1.setAttribute(SAMTag.RG.name(), READ_GROUP_ID); + if (programRecord != null) { + end1.setAttribute(SAMTag.PG.name(), programRecord.getProgramGroupId()); + } + fillInBasesAndQualities(end1); + + end2.setReadName(name); + end2.setReadPairedFlag(false); + end2.setReadUmappedFlag(true); + end2.setProperPairFlag(false); + end2.setFirstOfPairFlag(!end1IsFirstOfPair); + end2.setSecondOfPairFlag(end1IsFirstOfPair); + end2.setAttribute(SAMTag.RG.name(), READ_GROUP_ID); + if (programRecord != null) { + end2.setAttribute(SAMTag.PG.name(), programRecord.getProgramGroupId()); + } + fillInBasesAndQualities(end2); + + this.records.add(end1); + this.records.add(end2); + } + + /** + * Fills in bases and qualities with randomly generated data. + * Relies on the alignment start and end having been set to get read length. 
+ */ + private void fillInBasesAndQualities(final SAMRecord rec) { + final int length = this.readLength; + final byte[] bases = new byte[length]; + final byte[] quals = new byte[length]; + + for (int i=0; i mAttributes = null; + public static final String SEQUENCE_NAME_TAG = "SN"; + public static final String SEQUENCE_LENGTH_TAG = "LN"; + public static final String MD5_TAG = "M5"; + public static final String ASSEMBLY_TAG = "AS"; + public static final String URI_TAG = "UR"; + public static final String SPECIES_TAG = "SP"; + + public SAMSequenceRecord(final String name) { + mSequenceName = name; + } + + public String getSequenceName() { + return mSequenceName; + } + + public int getSequenceLength() { + return mSequenceLength; + } + + public void setSequenceLength(final int value) { + mSequenceLength = value; + } + + public String getAssembly() { + return (String) getAttribute("AS"); + } + + public void setAssembly(final String value) { + setAttribute("AS", value); + } + + public String getSpecies() { + return (String) getAttribute("SP"); + } + + public void setSpecies(final String value) { + setAttribute("SP", value); + } + + public Object getAttribute(final String key) { + if (mAttributes == null) { + return null; + } + return mAttributes.get(key); + } + + public void setAttribute(final String key, final Object value) { + if (mAttributes == null) { + mAttributes = new HashMap(); + } + mAttributes.put(key, value); + } + + public Set> getAttributes() { + if (mAttributes == null) { + return null; + } + return mAttributes.entrySet(); + } + + // Private state used only by SAM implementation. + int getSequenceIndex() { + return mSequenceIndex; + } + + // Private state used only by SAM implementation. + void setSequenceIndex(final int value) { + mSequenceIndex = value; + } + + /** + * Looser comparison than equals(). If one SAMSequenceRecord has an attribute that the other does not + * have, that is not considered inequality. 
However, if they both have an attribute, but have different + * values for that atttribute, then they are considered unequal. This results in an intransitive equality test, + * i.e. a.isSameSequence(b) && b.isSameSequence(c) does not necessarily imply a.isSameSequence(c) + */ + public boolean isSameSequence(final SAMSequenceRecord that) { + if (this == that) return true; + if (that == null) return false; + + if (mSequenceIndex != that.mSequenceIndex) return false; + if (mSequenceLength != that.mSequenceLength) return false; + if (mSequenceName != null ? !mSequenceName.equals(that.mSequenceName) : that.mSequenceName != null) + return false; + // If one record has an optional attribute and the other does not, that is not considered inequality. + + if (mAttributes != null) { + for (final Map.Entry entry: getAttributes()) { + final Object thatAttribute = that.getAttribute(entry.getKey()); + if (thatAttribute != null && !entry.getValue().equals(thatAttribute)) { + return false; + } + } + } + + return true; + } + + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (!(o instanceof SAMSequenceRecord)) return false; + + final SAMSequenceRecord that = (SAMSequenceRecord) o; + + if (mSequenceIndex != that.mSequenceIndex) return false; + if (mSequenceLength != that.mSequenceLength) return false; + if (mAttributes != null ? !mAttributes.equals(that.mAttributes) : that.mAttributes != null) return false; + if (mSequenceName != null ? !mSequenceName.equals(that.mSequenceName) : that.mSequenceName != null) + return false; + + return true; + } + + @Override + public int hashCode() { + int result = mSequenceName != null ? mSequenceName.hashCode() : 0; + result = 31 * result + mSequenceIndex; + result = 31 * result + mSequenceLength; + result = 31 * result + (mAttributes != null ? 
mAttributes.hashCode() : 0); + return result; + } +} + diff --git a/lib/edu/mit/broad/sam/SAMTag.java b/lib/edu/mit/broad/sam/SAMTag.java new file mode 100644 index 0000000000..5189782cc0 --- /dev/null +++ b/lib/edu/mit/broad/sam/SAMTag.java @@ -0,0 +1,16 @@ +package edu.mit.broad.sam;/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ + +/** + * The standard tags defined in the SAM spec + */ +public enum SAMTag { + RG, LB, PU, PG, AS, SQ, MQ, NM, H0, H1, H2, UQ, PQ, NH, IH, HI, MD, CS, CQ, CM, R2, Q2, S2, CC, CP, SM, AM, MF +} diff --git a/lib/edu/mit/broad/sam/SAMTextHeaderCodec.java b/lib/edu/mit/broad/sam/SAMTextHeaderCodec.java new file mode 100644 index 0000000000..202f5f5bf5 --- /dev/null +++ b/lib/edu/mit/broad/sam/SAMTextHeaderCodec.java @@ -0,0 +1,323 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ +package edu.mit.broad.sam; + +import edu.mit.broad.sam.util.LineReader; +import edu.mit.broad.sam.util.RuntimeIOException; +import edu.mit.broad.sam.util.StringUtil; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * This is actually two classes in one (not sure if that is a good idea) -- a parser + * for a SAM text header, and a generator of SAM text header. + */ +public class SAMTextHeaderCodec { + private static final String HEADER_LINE_START = "@"; + + // These attributes are populated when parsing or generating + private SAMFileHeader mFileHeader; + + // These attributes are populated when parsing text + private String mCurrentLine; + private LineReader mReader; + private File mFile; + private List sequences; + private List readGroups; + + // These attributes are populated when generating text + private BufferedWriter writer; + + private static final String TAG_KEY_VALUE_SEPARATOR = ":"; + private static final String FIELD_SEPARATOR = "\t"; + + public SAMTextHeaderCodec() { + } + + /** + * Reads text and converts to a SAMFileHeader object. Note that one line past + * the header must be read in order to determine the end of the header. This line can be + * obtained after parseTextHeader() has returned by calling getCurrentLine() + * @param reader Where to get header text from. + * @param file Name of the input file, for error messages. May be null. + * @return complete header object. 
+ */ + public SAMFileHeader decode(final LineReader reader, final File file) { + mFileHeader = new SAMFileHeader(); + mReader = reader; + mFile = file; + sequences = new ArrayList(); + readGroups = new ArrayList(); + + while (advanceLine() != null) { + if (!mCurrentLine.startsWith(HEADER_LINE_START)) { + break; + } + final ParsedHeaderLine parsedHeaderLine = new ParsedHeaderLine(mCurrentLine); + switch (parsedHeaderLine.getHeaderRecordType()) { + + case HD: + parseHDLine(parsedHeaderLine); + break; + case PG: + parsePGLine(parsedHeaderLine); + break; + case RG: + parseRGLine(parsedHeaderLine); + break; + case SQ: + parseSQLine(parsedHeaderLine); + break; + default: + throw new IllegalStateException("Unrecognized header record type: " + + parsedHeaderLine.getHeaderRecordType()); + } + } + mFileHeader.setSequences(sequences); + mFileHeader.setReadGroups(readGroups); + return mFileHeader; + } + + private String advanceLine() { + mCurrentLine = mReader.readLine(); + return mCurrentLine; + } + + private void parsePGLine(final ParsedHeaderLine parsedHeaderLine) { + assert(HeaderRecordType.PG.equals(parsedHeaderLine.getHeaderRecordType())); + parsedHeaderLine.requireTag(SAMProgramRecord.PROGRAM_GROUP_ID_TAG); + final SAMProgramRecord programRecord = new SAMProgramRecord(parsedHeaderLine.removeValue(SAMProgramRecord.PROGRAM_GROUP_ID_TAG)); + for (final Map.Entry entry : parsedHeaderLine.mKeyValuePairs.entrySet()) { + programRecord.setAttribute(entry.getKey(), entry.getValue()); + } + mFileHeader.addProgramRecord(programRecord); + } + + private void parseRGLine(final ParsedHeaderLine parsedHeaderLine) { + assert(HeaderRecordType.RG.equals(parsedHeaderLine.getHeaderRecordType())); + parsedHeaderLine.requireTag(SAMReadGroupRecord.READ_GROUP_ID_TAG); + parsedHeaderLine.requireTag(SAMReadGroupRecord.READ_GROUP_SAMPLE_TAG); + final SAMReadGroupRecord samReadGroupRecord = new SAMReadGroupRecord(parsedHeaderLine.removeValue(SAMReadGroupRecord.READ_GROUP_ID_TAG)); + for (final 
Map.Entry entry : parsedHeaderLine.mKeyValuePairs.entrySet()) { + samReadGroupRecord.setAttribute(entry.getKey(), entry.getValue()); + } + + // Convert non-String attributes to the appropriate types + final String predictedMedianInsertSize = + (String)samReadGroupRecord.getAttribute(SAMReadGroupRecord.PREDICTED_MEDIAN_INSERT_SIZE_TAG); + if (predictedMedianInsertSize != null) { + try { + samReadGroupRecord.setAttribute(SAMReadGroupRecord.PREDICTED_MEDIAN_INSERT_SIZE_TAG, + Integer.parseInt(predictedMedianInsertSize)); + } catch (NumberFormatException e) { + throw new SAMFormatException(SAMReadGroupRecord.PREDICTED_MEDIAN_INSERT_SIZE_TAG + + " is not numeric: " + predictedMedianInsertSize, e); + } + } + +/* +TODO: Need an ISO 6801 date parser + String dateRunProduced = (String)samReadGroupRecord.getAttribute(SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG); + if (dateRunProduced != null) { + try { + Date date = dateParser.parse(dateRunProduced); + samReadGroupRecord.setAttribute(SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG, date); + } catch (ParseException e) { + throw new SAMFormatException(SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG + " cannot be parsed as a date: " + + dateRunProduced, e); + } + } +*/ + + readGroups.add(samReadGroupRecord); + } + + private void parseSQLine(final ParsedHeaderLine parsedHeaderLine) { + assert(HeaderRecordType.SQ.equals(parsedHeaderLine.getHeaderRecordType())); + parsedHeaderLine.requireTag(SAMSequenceRecord.SEQUENCE_NAME_TAG); + parsedHeaderLine.requireTag(SAMSequenceRecord.SEQUENCE_LENGTH_TAG); + final SAMSequenceRecord samSequenceRecord = new SAMSequenceRecord(parsedHeaderLine.removeValue(SAMSequenceRecord.SEQUENCE_NAME_TAG)); + samSequenceRecord.setSequenceLength(Integer.parseInt(parsedHeaderLine.removeValue(SAMSequenceRecord.SEQUENCE_LENGTH_TAG))); + for (final Map.Entry entry : parsedHeaderLine.mKeyValuePairs.entrySet()) { + samSequenceRecord.setAttribute(entry.getKey(), entry.getValue()); + } + sequences.add(samSequenceRecord); + } + + 
private void parseHDLine(final ParsedHeaderLine parsedHeaderLine) { + assert(HeaderRecordType.HD.equals(parsedHeaderLine.getHeaderRecordType())); + parsedHeaderLine.requireTag(SAMFileHeader.VERSION_TAG); + for (final Map.Entry entry : parsedHeaderLine.mKeyValuePairs.entrySet()) { + mFileHeader.setAttribute(entry.getKey(), entry.getValue()); + } + } + + private RuntimeException reportErrorParsingLine(final String reason) { + String fileMessage = ""; + if (mFile != null) { + fileMessage = "File " + mFile + "; "; + } + return new SAMFormatException("Error parsing text SAM file. " + reason + "; " + fileMessage + + "Line " + mReader.getLineNumber() + "\nLine: " + mCurrentLine); + } + + private enum HeaderRecordType { + HD, SQ, RG, PG + } + + private class ParsedHeaderLine { + private final HeaderRecordType mHeaderRecordType; + private final Map mKeyValuePairs = new HashMap(); + + ParsedHeaderLine(final String line) { + assert(line.startsWith(HEADER_LINE_START)); + final String[] fields = line.split(FIELD_SEPARATOR); + try { + mHeaderRecordType = HeaderRecordType.valueOf(fields[0].substring(1)); + } catch (IllegalArgumentException e) { + throw reportErrorParsingLine("Unrecognized header record type"); + } + for (int i = 1; i < fields.length; ++i) { + final String[] keyAndValue = fields[i].split(TAG_KEY_VALUE_SEPARATOR, 2); + if (keyAndValue.length != 2) { + throw reportErrorParsingLine("Problem parsing " + HEADER_LINE_START + mHeaderRecordType + + " key:value pair"); + } + mKeyValuePairs.put(keyAndValue[0], keyAndValue[1]); + } + } + + void requireTag(final String tag) { + if (!mKeyValuePairs.containsKey(tag)) { + throw reportErrorParsingLine(HEADER_LINE_START + mHeaderRecordType + " line missing " + tag + " tag"); + } + } + + public HeaderRecordType getHeaderRecordType() { + return mHeaderRecordType; + } + + boolean containsKey(final String key) { + return mKeyValuePairs.containsKey(key); + } + + String getValue(final String key) { + return mKeyValuePairs.get(key); + } 
+ + String removeValue(final String key) { + final String ret = mKeyValuePairs.get(key); + mKeyValuePairs.remove(key); + return ret; + } + + } + + /** + * After parsing the text header, this object has gobbled one line too many. Call this to get that line. + * @return the first non-header line, or null if there isn't one. + */ + public String getCurrentLine() { + return mCurrentLine; + } + + /** + * + * @param writer where to write the header text + * @param header object to be converted to text. + */ + public void encode(final Writer writer, final SAMFileHeader header) { + mFileHeader = header; + this.writer = new BufferedWriter(writer); + writeHDLine(); + for (final SAMSequenceRecord sequenceRecord: header.getSequences()) { + writeSQLine(sequenceRecord); + } + + for (final SAMReadGroupRecord readGroup : header.getReadGroups()) { + writeRGLine(readGroup); + } + for (final SAMProgramRecord programRecord : header.getProgramRecords()) { + writePGLine(programRecord); + } + try { + this.writer.flush(); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + private void println(final String s) { + try { + writer.append(s); + writer.append("\n"); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + private void writePGLine(SAMProgramRecord programRecord) { + if (programRecord == null) { + return; + } + final String[] fields = new String[2 + programRecord.getAttributes().size()]; + fields[0] = HEADER_LINE_START + HeaderRecordType.PG; + fields[1] = SAMProgramRecord.PROGRAM_GROUP_ID_TAG + TAG_KEY_VALUE_SEPARATOR + programRecord.getProgramGroupId(); + int i = 2; + for (final Map.Entry entry: programRecord.getAttributes()) { + fields[i++] = entry.getKey() + TAG_KEY_VALUE_SEPARATOR + entry.getValue(); + } + println(StringUtil.join(FIELD_SEPARATOR, fields)); + } + + private void writeRGLine(final SAMReadGroupRecord readGroup) { + final String[] fields = new String[2 + readGroup.getAttributes().size()]; + fields[0] = HEADER_LINE_START + 
HeaderRecordType.RG; + fields[1] = SAMReadGroupRecord.READ_GROUP_ID_TAG + TAG_KEY_VALUE_SEPARATOR + readGroup.getReadGroupId(); + int i = 2; + for (final Map.Entry entry: readGroup.getAttributes()) { + fields[i++] = entry.getKey() + TAG_KEY_VALUE_SEPARATOR + entry.getValue().toString(); + } + println(StringUtil.join(FIELD_SEPARATOR, fields)); + } + + private void writeHDLine() { + final String[] fields = new String[1 + mFileHeader.getAttributes().size()]; + fields[0] = HEADER_LINE_START + HeaderRecordType.HD; + int i = 1; + for (final Map.Entry entry: mFileHeader.getAttributes()) { + fields[i++] = entry.getKey() + TAG_KEY_VALUE_SEPARATOR + entry.getValue().toString(); + } + println(StringUtil.join(FIELD_SEPARATOR, fields)); + } + + private void writeSQLine(final SAMSequenceRecord sequenceRecord) { + final int numAttributes =sequenceRecord.getAttributes() != null ? sequenceRecord.getAttributes().size() : 0; + final String[] fields = new String[3 + numAttributes]; + fields[0] = HEADER_LINE_START + HeaderRecordType.SQ; + fields[1] = SAMSequenceRecord.SEQUENCE_NAME_TAG + TAG_KEY_VALUE_SEPARATOR + sequenceRecord.getSequenceName(); + fields[2] = SAMSequenceRecord.SEQUENCE_LENGTH_TAG + TAG_KEY_VALUE_SEPARATOR + Integer.toString(sequenceRecord.getSequenceLength()); + int i = 3; + if (sequenceRecord.getAttributes() != null) { + for (final Map.Entry entry: sequenceRecord.getAttributes()) { + fields[i++] = entry.getKey() + TAG_KEY_VALUE_SEPARATOR + entry.getValue().toString(); + } + } + println(StringUtil.join(FIELD_SEPARATOR, fields)); + } + +} diff --git a/lib/edu/mit/broad/sam/SAMTextReader.java b/lib/edu/mit/broad/sam/SAMTextReader.java new file mode 100644 index 0000000000..267f704616 --- /dev/null +++ b/lib/edu/mit/broad/sam/SAMTextReader.java @@ -0,0 +1,336 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. 
All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ +package edu.mit.broad.sam; + + +import edu.mit.broad.sam.util.AsciiLineReader; +import edu.mit.broad.sam.util.CloseableIterator; +import edu.mit.broad.sam.util.StringUtil; + +import java.io.File; +import java.io.InputStream; +import java.util.Map; + +/** + * Internal class for reading SAM text files. + */ +class SAMTextReader + extends SAMFileReader.ReaderImplementation +{ + private static final int QNAME_COL = 0; + private static final int FLAG_COL = 1; + private static final int RNAME_COL = 2; + private static final int POS_COL = 3; + private static final int MAPQ_COL = 4; + private static final int CIGAR_COL = 5; + private static final int MRNM_COL = 6; + private static final int MPOS_COL = 7; + private static final int ISIZE_COL = 8; + private static final int SEQ_COL = 9; + private static final int QUAL_COL = 10; + + private static final int NUM_REQUIRED_FIELDS = 11; + + private AsciiLineReader mReader; + private SAMFileHeader mFileHeader = null; + private String mCurrentLine = null; + private RecordIterator mIterator = null; + private File mFile = null; + private final TextTagCodec tagCodec = new TextTagCodec(); + private SAMFileReader.ValidationStringency validationStringency = SAMFileReader.ValidationStringency.DEFAULT_STRINGENCY; + + SAMTextReader(final InputStream stream) { + mReader = new AsciiLineReader(stream); + readHeader(); + } + + SAMTextReader(final InputStream stream, final File file) { + this(stream); + mFile = file; + } + + void close() { + if (mReader != null) { + try { + mReader.close(); + } finally { + mReader = null; + } + } + } + + SAMFileHeader getFileHeader() { + return mFileHeader; + } + + public SAMFileReader.ValidationStringency getValidationStringency() { + return validationStringency; + } + + public void 
setValidationStringency(final SAMFileReader.ValidationStringency lenientValidation) { + this.validationStringency = lenientValidation; + } + + CloseableIterator getIterator() { + if (mReader == null) { + throw new IllegalStateException("File reader is closed"); + } + if (mIterator != null) { + throw new IllegalStateException("Iteration in progress"); + } + mIterator = new RecordIterator(); + return mIterator; + } + + CloseableIterator query(final String sequence, final int start, final int end, final boolean contained) { + throw new UnsupportedOperationException("Cannot query SAM text files"); + } + + private void readHeader() { + final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec(); + mFileHeader = headerCodec.decode(mReader, mFile); + mCurrentLine = headerCodec.getCurrentLine(); + } + + private String advanceLine() { + mCurrentLine = mReader.readLine(); + return mCurrentLine; + } + + private String makeErrorString(final String reason) { + String fileMessage = ""; + if (mFile != null) { + fileMessage = "File " + mFile + "; "; + } + return "Error parsing text SAM file. 
" + reason + "; " + fileMessage + + "Line " + mReader.getLineNumber() + "\nLine: " + mCurrentLine; + } + + private RuntimeException reportFatalErrorParsingLine(final String reason) { + return new SAMFormatException(makeErrorString(reason)); + } + + private void reportErrorParsingLine(final String reason) { + final String errorMessage = makeErrorString(reason); + + if (validationStringency == SAMFileReader.ValidationStringency.STRICT) { + throw new SAMFormatException(errorMessage); + } else if (validationStringency == SAMFileReader.ValidationStringency.LENIENT) { + System.err.println("Ignoring SAM validation error due to lenient parsing:"); + System.err.println(errorMessage); + } + } + + private void reportErrorParsingLine(final Exception e) { + final String errorMessage = makeErrorString(e.getMessage()); + if (validationStringency == SAMFileReader.ValidationStringency.STRICT) { + throw new SAMFormatException(errorMessage); + } else if (validationStringency == SAMFileReader.ValidationStringency.LENIENT) { + System.err.println("Ignoring SAM validation error due to lenient parsing:"); + System.err.println(errorMessage); + } + } + + private class RecordIterator implements CloseableIterator { + + /** + * Allocate this once rather than for every line as a performance optimization. + * The size is arbitrary -- merely large enough to handle the maximum number + * of fields we might expect from a reasonable SAM file. 
+ */ + private final String[] mFields = new String[10000]; + + private SAMRecord mCurrentRecord; + + private RecordIterator() { + assert(mReader != null); + if (mCurrentLine != null) { + parseLine(); + } + + } + + public void close() { + mCurrentRecord = null; + SAMTextReader.this.close(); + } + + public boolean hasNext() { + return mCurrentRecord != null; + } + + public SAMRecord next() { + if (!hasNext()) { + throw new IllegalStateException("Cannot call next() on exhausted iterator"); + } + final SAMRecord ret = mCurrentRecord; + mCurrentRecord = null; + advanceLine(); + if (mCurrentLine != null) { + parseLine(); + } + return ret; + } + + public void remove() { + throw new UnsupportedOperationException("Not supported: remove"); + } + + int parseInt(final String s, final String fieldName) { + final int ret; + try { + ret = Integer.parseInt(s); + } catch (NumberFormatException e) { + throw reportFatalErrorParsingLine("Non-numeric value in " + fieldName + " column"); + } + return ret; + } + + void validateReferenceName(final String rname, final String fieldName) { + if (fieldName.equals("MRNM") && rname.equals("=")) { + return; + } + if (getFileHeader().getSequences().size() != 0) { + if (getFileHeader().getSequence(rname) == null) { + reportErrorParsingLine(fieldName + " '" + rname + "' not found in any SQ record"); + } + } + } + + private void parseLine() { + final int numFields = StringUtil.split(mCurrentLine, mFields, '\t'); + if (numFields < NUM_REQUIRED_FIELDS) { + reportErrorParsingLine("Not enough fields"); + } + if (numFields == mFields.length) { + reportErrorParsingLine("Too many fields in SAM text record."); + } + for (int i = 0; i < numFields; ++i) { + if (mFields[i].length() == 0) { + reportErrorParsingLine("Empty field at position " + i + " (zero-based)"); + } + } + mCurrentRecord = new SAMRecord(); + mCurrentRecord.setReadName(mFields[QNAME_COL]); + + final int flags = parseInt(mFields[FLAG_COL], "FLAG"); + mCurrentRecord.setFlags(flags); + + final 
String rname = mFields[RNAME_COL]; + if (!rname.equals("*")) { + validateReferenceName(rname, "RNAME"); + mCurrentRecord.setReferenceName(rname); + } else if (!mCurrentRecord.getReadUnmappedFlag()) { + reportErrorParsingLine("RNAME is not specified but flags indicate mapped"); + } + + final int pos = parseInt(mFields[POS_COL], "POS"); + final int mapq = parseInt(mFields[MAPQ_COL], "MAPQ"); + final String cigar = mFields[CIGAR_COL]; + if (!SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(mCurrentRecord.getReferenceName())) { + if (pos == 0) { + reportErrorParsingLine("POS must be non-zero if RNAME is specified"); + } + if (!mCurrentRecord.getReadUnmappedFlag() && cigar.equals("*")) { + reportErrorParsingLine("CIGAR must not be '*' if RNAME is specified"); + } + } else { + if (pos != 0) { + reportErrorParsingLine("POS must be zero if RNAME is not specified"); + } + if (mapq != 0) { + reportErrorParsingLine("MAPQ must be zero if RNAME is not specified"); + } + if (!cigar.equals("*")) { + reportErrorParsingLine("CIGAR must be '*' if RNAME is not specified"); + } + } + mCurrentRecord.setAlignmentStart(pos); + mCurrentRecord.setMappingQuality(mapq); + mCurrentRecord.setCigarString(cigar); + + final String mateRName = mFields[MRNM_COL]; + if (mateRName.equals("*")) { + if (mCurrentRecord.getReadPairedFlag() && !mCurrentRecord.getMateUnmappedFlag()) { + reportErrorParsingLine("MRNM not specified but flags indicate mate mapped"); + } + } + else { + if (!mCurrentRecord.getReadPairedFlag()) { + reportErrorParsingLine("MRNM specified but flags indicate unpaired"); + } + if (mCurrentRecord.getMateUnmappedFlag()) { + reportErrorParsingLine("MRNM specified but flags indicate mate unmapped"); + } + + validateReferenceName(mateRName, "MRNM"); + if (mateRName.equals("=")) { + if (mCurrentRecord.getReferenceName() == null) { + reportErrorParsingLine("MRNM is '=', but RNAME is not set"); + } + mCurrentRecord.setMateReferenceName(mCurrentRecord.getReferenceName()); + } else { + 
mCurrentRecord.setMateReferenceName(mateRName); + } + } + + final int matePos = parseInt(mFields[MPOS_COL], "MPOS"); + final int isize = parseInt(mFields[ISIZE_COL], "ISIZE"); + if (!mCurrentRecord.getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) { + if (matePos == 0) { + reportErrorParsingLine("MPOS must be non-zero if MRNM is specified"); + } + if (isize == 0 && mCurrentRecord.getReferenceName().equals(mCurrentRecord.getMateReferenceName())) { + reportErrorParsingLine("ISIZE must be non-zero if RNAME == MRNM"); + } + } else { + if (matePos != 0) { + reportErrorParsingLine("MPOS must be zero if MRNM is not specified"); + } + if (isize != 0) { + reportErrorParsingLine("ISIZE must be zero if MRNM is not specified"); + } + } + mCurrentRecord.setMateAlignmentStart(matePos); + mCurrentRecord.setInferredInsertSize(isize); + if (!mFields[SEQ_COL].equals("*")) { + mCurrentRecord.setReadString(mFields[SEQ_COL]); + } + if (!mFields[QUAL_COL].equals("*")) { + if (mCurrentRecord.getReadString() == null) { + reportErrorParsingLine("QUAL should not be specified if SEQ is not specified"); + } + if (mCurrentRecord.getReadString().length() != mFields[QUAL_COL].length()) { + reportErrorParsingLine("length(QUAL) != length(SEQ)"); + } + mCurrentRecord.setBaseQualityString(mFields[QUAL_COL]); + } + + for (int i = NUM_REQUIRED_FIELDS; i < numFields; ++i) { + parseTag(mFields[i]); + } + + } + + private void parseTag(final String tag) { + Map.Entry entry = null; + try { + entry = tagCodec.decode(tag); + } catch (SAMFormatException e) { + reportErrorParsingLine(e); + } + if (entry != null) { + mCurrentRecord.setAttribute(entry.getKey(), entry.getValue()); + } + } + } +} + diff --git a/lib/edu/mit/broad/sam/SAMTextWriter.java b/lib/edu/mit/broad/sam/SAMTextWriter.java new file mode 100644 index 0000000000..e3e8e65727 --- /dev/null +++ b/lib/edu/mit/broad/sam/SAMTextWriter.java @@ -0,0 +1,121 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This 
software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.sam; + +import edu.mit.broad.sam.util.AsciiWriter; +import edu.mit.broad.sam.util.RuntimeIOException; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.Writer; +import java.util.Map; + +class SAMTextWriter extends SAMFileWriterImpl { + private static final String FIELD_SEPARATOR = "\t"; + + private final Writer out; + private final File file; + private final TextTagCodec tagCodec = new TextTagCodec(); + + SAMTextWriter(final File file) { + try { + this.file = file; + this.out = new AsciiWriter(new FileOutputStream(file)); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + /** + * Writes the record to disk. Sort order has been taken care of by the time + * this method is called. + * + * @param alignment + */ + protected void writeAlignment(final SAMRecord alignment) { + try { + out.write(alignment.getReadName()); + out.write(FIELD_SEPARATOR); + out.write(Integer.toString(alignment.getFlags())); + out.write(FIELD_SEPARATOR); + out.write(alignment.getReferenceName()); + out.write(FIELD_SEPARATOR); + out.write(Integer.toString(alignment.getAlignmentStart())); + out.write(FIELD_SEPARATOR); + out.write(Integer.toString(alignment.getMappingQuality())); + out.write(FIELD_SEPARATOR); + out.write(alignment.getCigarString()); + out.write(FIELD_SEPARATOR); + + // I think == is OK here. 
If not, it isn't an error, just less efficient storage + if (alignment.getReferenceName() == alignment.getMateReferenceName() && + SAMRecord.NO_ALIGNMENT_REFERENCE_NAME != alignment.getReferenceName()) { + out.write("="); + } else { + out.write(alignment.getMateReferenceName()); + } + out.write(FIELD_SEPARATOR); + out.write(Integer.toString(alignment.getMateAlignmentStart())); + out.write(FIELD_SEPARATOR); + out.write(Integer.toString(alignment.getInferredInsertSize())); + out.write(FIELD_SEPARATOR); + out.write(alignment.getReadString()); + out.write(FIELD_SEPARATOR); + out.write(alignment.getBaseQualityString()); + if (alignment.getAttributes() != null) { + for (final Map.Entry attribute : alignment.getAttributes()) { + out.write(FIELD_SEPARATOR); + out.write(tagCodec.encode(attribute.getKey(), attribute.getValue())); + } + } + out.write("\n"); + + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + /** + * Write the header to disk. Header object is available via getHeader(). + * + * @param textHeader for convenience if the implementation needs it. + */ + protected void writeHeader(final String textHeader) { + try { + out.write(textHeader); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + /** + * Do any required flushing here. + */ + protected void finish() { + try { + out.close(); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + /** + * For producing error messages. + * + * @return Output filename, or null if there isn't one. 
+ */ + protected String getFilename() { + if (file == null) { + return null; + } + return file.getAbsolutePath(); + } +} diff --git a/lib/edu/mit/broad/sam/SAMTools.java b/lib/edu/mit/broad/sam/SAMTools.java new file mode 100644 index 0000000000..0a320ba84a --- /dev/null +++ b/lib/edu/mit/broad/sam/SAMTools.java @@ -0,0 +1,106 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ +package edu.mit.broad.sam; + + +import edu.mit.broad.sam.util.CloseableIterator; +import java.io.*; + + +/** + * Command line utility for manipulating SAM/BAM files. + */ +public class SAMTools +{ + private String mCommand = null; + private File mInputFile = null; + + + public static void main(final String[] args) + throws Exception { + final int status = new SAMTools().run(args); + if (status != 0) { + System.exit(status); + } + } + + private SAMTools() { + } + + private void usage() { + System.out.println(); + System.out.println("SAMTools version 0.1.0"); + System.out.println("Tools for manipulating SAM/BAM files"); + System.out.println(); + System.out.println("Usage: SAMTools "); + System.out.println(); + System.out.println("Commands:"); + System.out.println(" help"); + System.out.println(" view "); + System.out.println(); + } + + private boolean parseArguments(final String[] args) { + if (args.length == 0) { + usage(); + return true; + } + final String command = args[0]; + final int argpos = 1; + final int argcount = args.length - argpos; + if (command.equals("help")) { + usage(); + return true; + } else if (command.equals("view")) { + if (argcount != 1) { + usage(); + return false; + } + mInputFile = new File(args[1]); + if 
(!mInputFile.exists()) { + System.out.println("Input file not found: " + mInputFile); + return false; + } + } else { + System.out.println("Unrecognized command: " + command); + System.out.println(); + usage(); + return false; + } + mCommand = command; + return true; + } + + private int run(final String[] args) + throws Exception { + if (!parseArguments(args)) { + return 1; + } + if (mCommand == null) { + return 0; + } + if (mCommand.equals("view")) { + return runView(); + } + return 1; + } + + private int runView() { + final SAMFileReader reader = new SAMFileReader(mInputFile); + final CloseableIterator iterator = reader.iterator(); + while (iterator.hasNext()) { + final SAMRecord record = iterator.next(); + System.out.println(record.format()); + } + iterator.close(); + return 0; + } +} diff --git a/lib/edu/mit/broad/sam/SAMUtils.java b/lib/edu/mit/broad/sam/SAMUtils.java new file mode 100644 index 0000000000..c17ca773cd --- /dev/null +++ b/lib/edu/mit/broad/sam/SAMUtils.java @@ -0,0 +1,269 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ +package edu.mit.broad.sam; + + +/** + * Utilty methods. 
+ */ +final class SAMUtils +{ + private static final byte COMPRESSED_EQUAL_LOW = 0; + private static final byte COMPRESSED_A_LOW = 1; + private static final byte COMPRESSED_C_LOW = 2; + private static final byte COMPRESSED_G_LOW = 4; + private static final byte COMPRESSED_T_LOW = 8; + private static final byte COMPRESSED_N_LOW = 15; + private static final byte COMPRESSED_EQUAL_HIGH = COMPRESSED_EQUAL_LOW << 4; + private static final byte COMPRESSED_A_HIGH = COMPRESSED_A_LOW << 4; + private static final byte COMPRESSED_C_HIGH = COMPRESSED_C_LOW << 4; + private static final byte COMPRESSED_G_HIGH = COMPRESSED_G_LOW << 4; + private static final byte COMPRESSED_T_HIGH = (byte)(COMPRESSED_T_LOW << 4); + private static final byte COMPRESSED_N_HIGH = (byte)(COMPRESSED_N_LOW << 4); + + private SAMUtils() { + } + + static int unpackInt16(final byte[] buffer, final int offset) { + return ((buffer[offset] & 0xFF) | + ((buffer[offset+1] & 0xFF) << 8)); + } + + static int unpackInt32(final byte[] buffer, final int offset) { + return ((buffer[offset] & 0xFF) | + ((buffer[offset+1] & 0xFF) << 8) | + ((buffer[offset+2] & 0xFF) << 16) | + ((buffer[offset+3] & 0xFF) << 24)); + } + + /** + * Convert from a byte array containing =AaCcGgTtNn, to a byte array half as long, + * with =, A, C, G, T converted to 0, 1, 2, 4, 8, 15 + * @param readBases + * @return + */ + static byte[] bytesToCompressedBases(final byte[] readBases) { + final byte[] compressedBases = new byte[(readBases.length + 1)/2]; + int i; + for (i = 1; i < readBases.length; i+=2) { + compressedBases[i/2] = (byte)(charToCompressedBaseHigh(readBases[i-1]) | + charToCompressedBaseLow(readBases[i])); + } + // Last nybble + if (i == readBases.length) { + compressedBases[i/2] = charToCompressedBaseHigh((char)readBases[i-1]); + } + return compressedBases; + } + + static byte[] compressedBasesToBytes(final int length, final byte[] compressedBases, final int compressedOffset) { + final byte[] ret = new byte[length]; + int i; + for 
(i = 1; i < length; i+=2) { + ret[i-1] = compressedBaseToByteHigh(compressedBases[i/2 + compressedOffset]); + ret[i] = compressedBaseToByteLow(compressedBases[i/2 + compressedOffset]); + } + // Last nybble + if (i == length) { + ret[i-1] = compressedBaseToByteHigh(compressedBases[i/2 + compressedOffset]); + } + return ret; + } + + /** + * + * @param base One of =AaCcGgTtNn + * @return nybble-encoded equivalent + */ + private static byte charToCompressedBaseLow(final int base) { + switch (base) { + case '=': + return COMPRESSED_EQUAL_LOW; + case 'a': + case 'A': + return COMPRESSED_A_LOW; + case 'c': + case 'C': + return COMPRESSED_C_LOW; + case 'g': + case 'G': + return COMPRESSED_G_LOW; + case 't': + case 'T': + return COMPRESSED_T_LOW; + case 'n': + case 'N': + case '.': + return COMPRESSED_N_LOW; + default: + throw new IllegalArgumentException("Bad byte passed to charToCompressedBase: " + base); + } + } + + private static byte charToCompressedBaseHigh(final int base) { + switch (base) { + case '=': + return COMPRESSED_EQUAL_HIGH; + case 'a': + case 'A': + return COMPRESSED_A_HIGH; + case 'c': + case 'C': + return COMPRESSED_C_HIGH; + case 'g': + case 'G': + return COMPRESSED_G_HIGH; + case 't': + case 'T': + return COMPRESSED_T_HIGH; + case 'n': + case 'N': + case '.': + return COMPRESSED_N_HIGH; + default: + throw new IllegalArgumentException("Bad byte passed to charToCompressedBase: " + base); + } + } + + /** + * + * @param base One of COMPRESSED_* + * @return one of ACGTN= + */ + private static byte compressedBaseToByteLow(final int base) { + switch (base & 0xf) { + case COMPRESSED_EQUAL_LOW: + return '='; + case COMPRESSED_A_LOW: + return 'A'; + case COMPRESSED_C_LOW: + return 'C'; + case COMPRESSED_G_LOW: + return 'G'; + case COMPRESSED_T_LOW: + return 'T'; + case COMPRESSED_N_LOW: + return 'N'; + default: + throw new IllegalArgumentException("Bad byte passed to charToCompressedBase: " + base); + } + } + + private static byte compressedBaseToByteHigh(final 
int base) { + switch ((byte)(base & 0xf0)) { + case COMPRESSED_EQUAL_HIGH: + return '='; + case COMPRESSED_A_HIGH: + return 'A'; + case COMPRESSED_C_HIGH: + return 'C'; + case COMPRESSED_G_HIGH: + return 'G'; + case COMPRESSED_T_HIGH: + return 'T'; + case COMPRESSED_N_HIGH: + return 'N'; + default: + throw new IllegalArgumentException("Bad byte passed to charToCompressedBase: " + base); + } + } + + static String bytesToHexString(final byte[] data) { + final char[] chars = new char[2 * data.length]; + for (int i = 0; i < data.length; i++) { + final byte b = data[i]; + chars[2*i] = toHexDigit((b >> 4) & 0xF); + chars[2*i+1] = toHexDigit(b & 0xF); + } + return new String(chars); + } + + static byte[] hexStringToBytes(final String s) throws NumberFormatException { + if (s.length() % 2 != 0) { + throw new NumberFormatException("Hex representation of byte string does not have even number of hex chars: " + s); + } + final byte[] ret = new byte[s.length() / 2]; + for (int i = 0; i < ret.length; ++i) { + ret[i] = (byte) (fromHexDigit(s.charAt(i * 2)) << 4 + fromHexDigit(s.charAt(i * 2 + 1))); + } + return ret; + } + + static String phredToFastq(final byte[] data) { + if (data == null) { + return null; + } + return phredToFastq(data, 0, data.length); + } + + static String phredToFastq(final byte[] buffer, final int offset, final int length) { + final char[] chars = new char[length]; + for (int i = 0; i < length; i++) { + chars[i] = phredToFastq(buffer[offset+i] & 0xFF); + } + return new String(chars); + } + + static char phredToFastq(final int phredScore) { + if (phredScore < 0 || phredScore > 63) { + throw new IllegalArgumentException("Cannot encode phred score: " + phredScore); + } + return (char) (33 + phredScore); + } + + static byte[] fastqToPhred(final String fastq) { + if (fastq == null) { + return null; + } + final int length = fastq.length(); + final byte[] scores = new byte[length]; + for (int i = 0; i < length; i++) { + scores[i] = (byte) 
fastqToPhred(fastq.charAt(i)); + } + return scores; + } + + static int fastqToPhred(final char ch) { + if (ch < 33 || ch > 126) { + throw new IllegalArgumentException("Invalid fastq character: " + ch); + } + return (ch - 33); + } + + private static char toHexDigit(final int value) { + return (char) ((value < 10) ? ('0' + value) : ('A' + value - 10)); + } + + private static int fromHexDigit(final char c) throws NumberFormatException { + final int ret = Character.digit(c, 16); + if (ret == -1) { + throw new NumberFormatException("Not a valid hex digit: " + c); + } + return ret; + } + + /** + * calculate the bin given an alignment in [beg,end) + * Copied from SAM spec. + */ + static int reg2bin(final int beg, int end) + { + + --end; + + if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14); + if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17); + if (beg>>20 == end>>20) return ((1<<9)-1)/7 + (beg>>20); + if (beg>>23 == end>>23) return ((1<<6)-1)/7 + (beg>>23); + if (beg>>26 == end>>26) return ((1<<3)-1)/7 + (beg>>26); + return 0; + } +} diff --git a/lib/edu/mit/broad/sam/TextCigarCodec.java b/lib/edu/mit/broad/sam/TextCigarCodec.java new file mode 100755 index 0000000000..a1abc2620b --- /dev/null +++ b/lib/edu/mit/broad/sam/TextCigarCodec.java @@ -0,0 +1,78 @@ +/* + The Broad Institute + SOFTWARE COPYRIGHT NOTICE AGREEMENT + This software and its documentation are copyright 2009 by the + Broad Institute/Massachusetts Institute of Technology. All rights are + reserved. + + This software is supplied without any warranty or guaranteed support + whatsoever. Neither the Broad Institute nor MIT can be responsible for its + use, misuse, or functionality. 
+*/ +package edu.mit.broad.sam; + +/** + * Convert between string and internal CIGAR representations + */ +public class TextCigarCodec +{ + private static final byte ZERO_BYTE = "0".getBytes()[0]; + private static final byte NINE_BYTE = "9".getBytes()[0]; + + private static final TextCigarCodec singleton = new TextCigarCodec(); + + /** + * It is not necssary to get the singleton but it is preferrable to use the same one + * over and over vs. creating a new object for each BAMRecord. + */ + static TextCigarCodec getSingleton() { + return singleton; + } + + + /** + * Convert from interal CIGAR representation to String + */ + String encode(final Cigar cigar) { + if (cigar.numCigarElements() == 0) { + return SAMRecord.NO_ALIGNMENT_CIGAR; + } + final StringBuilder ret = new StringBuilder(); + for (final CigarElement cigarElement : cigar.getCigarElements()) { + ret.append(cigarElement.getLength()); + ret.append(cigarElement.getOperator()); + } + return ret.toString(); + } + + Cigar decode(final String textCigar) { + if (SAMRecord.NO_ALIGNMENT_CIGAR.equals(textCigar)) { + return new Cigar(); + } + final Cigar ret = new Cigar(); + final byte[] cigarBytes = textCigar.getBytes(); + for (int i = 0; i < cigarBytes.length; ++i) { + if (!isDigit(cigarBytes[i])) { + throw new IllegalArgumentException("Malformed CIGAR string: " + textCigar); + } + int length = (cigarBytes[i] - ZERO_BYTE); + for (++i; isDigit(cigarBytes[i]); ++i) { + length = (length * 10) + cigarBytes[i] - ZERO_BYTE; + } + final CigarOperator operator = CigarOperator.characterToEnum(cigarBytes[i]); + ret.add(new CigarElement(length, operator)); + } + return ret; + } + + private boolean isDigit(final byte c) { + return c >= ZERO_BYTE && c <= NINE_BYTE; + } + + + +} + +/******************************************************************/ +/**************************[END OF TextCigarCodec.java]*************************/ +/******************************************************************/ diff --git 
a/lib/edu/mit/broad/sam/TextTagCodec.java b/lib/edu/mit/broad/sam/TextTagCodec.java new file mode 100644 index 0000000000..69fd53b1f7 --- /dev/null +++ b/lib/edu/mit/broad/sam/TextTagCodec.java @@ -0,0 +1,96 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.sam; + +import edu.mit.broad.sam.util.StringUtil; + +import java.util.Map; + +class TextTagCodec { + private static final int NUM_TAG_FIELDS = 3; + + /** + * This is really a local variable of decode(), but allocated here to reduce allocations. + */ + private final String[] fields = new String[NUM_TAG_FIELDS]; + + String encode(final String key, Object value) { + final StringBuilder sb = new StringBuilder(key); + sb.append(':'); + char tagType = BinaryTagCodec.getTagValueType(value); + switch (tagType) { + case 'c': + case 'C': + case 's': + case 'S': + case 'I': + tagType = 'i'; + } + if (tagType == 'H') { + value = SAMUtils.bytesToHexString((byte[])value); + } + sb.append(tagType); + sb.append(':'); + sb.append(value.toString()); + return sb.toString(); + } + + Map.Entry decode(final String tag) { + final int numFields = StringUtil.split(tag, fields, ':'); + if (numFields != TextTagCodec.NUM_TAG_FIELDS) { + throw new SAMFormatException("Not enough fields in tag '" + tag + "'"); + } + final String key = fields[0]; + final String type = fields[1]; + final String stringVal = fields[2]; + final Object val; + if (type.equals("Z")) { + val = stringVal; + } else if (type.equals("A")) { + if (stringVal.length() != 1) { + throw new SAMFormatException("Tag of type A should have a single-character value"); + } + val = stringVal.charAt(0); + } else if 
(type.equals("i")) { + try { + val = new Integer(stringVal); + } catch (NumberFormatException e) { + throw new SAMFormatException("Tag of type i should have signed decimal value"); + } + } else if (type.equals("f")) { + try { + val = new Float(stringVal); + } catch (NumberFormatException e) { + throw new SAMFormatException("Tag of type f should have single-precision floating point value"); + } + } else if (type.equals("H")) { + try { + val = SAMUtils.hexStringToBytes(stringVal); + } catch (NumberFormatException e) { + throw new SAMFormatException("Tag of type H should have valid hex string with even number of digits"); + } + } else { + throw new SAMFormatException("Unrecognized tag type: " + type); + } + return new Map.Entry() { + public String getKey() { + return key; + } + + public Object getValue() { + return val; + } + + public Object setValue(final Object o) { + throw new UnsupportedOperationException(); + } + }; + } +} diff --git a/lib/edu/mit/broad/sam/apps/AccumulateCoverage.java b/lib/edu/mit/broad/sam/apps/AccumulateCoverage.java new file mode 100644 index 0000000000..99a3917fff --- /dev/null +++ b/lib/edu/mit/broad/sam/apps/AccumulateCoverage.java @@ -0,0 +1,132 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2008 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ +package edu.mit.broad.sam.apps; + +import edu.mit.broad.sam.SAMFileReader; +import edu.mit.broad.sam.SAMLocusIterator; +import edu.mit.broad.sam.SAMFileHeader; + +import java.io.File; +import java.io.IOException; +import java.io.Writer; +import java.io.FileWriter; +import java.util.List; + +public class AccumulateCoverage { + + public static void main(final String[] argv) throws Exception { + if (argv.length != 1) { + System.err.println("ERROR: Incorrect number of arguments"); + usage(); + System.exit(1); + } + final AccumulateCoverage ac = new AccumulateCoverage(argv[0]); + } + + private static void usage() { + System.err.println("USAGE: AccumulateCoverage "); + } + + + + public AccumulateCoverage(final String samFile) throws IOException { + final long startTime = System.currentTimeMillis(); + final Writer writer = new FileWriter("/Users/kcibul/projects/sam/acccov.out"); + + final SAMFileReader samReader = new SAMFileReader(new File(samFile)); + + // ensure the file is sorted +//TODO: is the SAM reader implementation broken? + if (samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) { + System.out.println("SAM Files must be coordinate-sorted, this is " + samReader.getFileHeader().getSortOrder()); + System.exit(1); + } + + final SAMLocusIterator sli = new SAMLocusIterator(samReader.iterator()); + + for (final SAMLocusIterator.LocusInfo li : sli) { + + String chrom = li.getChrom().substring(3); + if (chrom.equals("M")) { chrom = "0"; } + if (chrom.equals("X")) { chrom = "23"; } + if (chrom.equals("Y")) { chrom = "24"; } + + final StringBuilder sb = new StringBuilder(); + sb.append(chrom) + .append(":") + .append(li.getPosition()-1) + .append(" ") + .append(li.getBases().size()) + .append("\n"); + + writer.write(sb.toString()); + //System.out.print(sb); + +// // TODO: zero based or 1 based? 
+// System.out.print(li.chrom + "\t" + (li.position-1) + "\t" + li.bases.size() + "\t"); +// +// // TODO: print and capitalize by strand (like pileup) +// System.out.print(bytesToString(li.bases)); +// System.out.print("\t"); +// System.out.print(phredToFastq(li.qualities)); +// System.out.print("\n"); + } + + + writer.flush(); + writer.close(); + final long elapsed = System.currentTimeMillis() - startTime; + + System.out.println("Completed in " + elapsed + "ms"); + } + + + static String bytesToString(final List data) { + if (data == null || data.size() == 0) { + return null; + } + + final char[] chars = new char[data.size()]; + for (int i = 0; i < data.size(); i++) { + chars[i] = (char) (data.get(i) & 0xFF); + } + return new String(chars); + } + + + static String phredToFastq(final List data) { + final byte[] arrData = new byte[data.size()]; + for(int i=0; i< data.size(); i++) { arrData[i] = data.get(i); } + return phredToFastq(arrData); + } + + static String phredToFastq(final byte[] data) { + if (data == null) { + return null; + } + return phredToFastq(data, 0, data.length); + } + + static String phredToFastq(final byte[] buffer, final int offset, final int length) { + final char[] chars = new char[length]; + for (int i = 0; i < length; i++) { + chars[i] = phredToFastq(buffer[offset+i] & 0xFF); + } + return new String(chars); + } + + static char phredToFastq(final int phredScore) { + if (phredScore < 0 || phredScore > 63) { + throw new IllegalArgumentException("Cannot encode phred score: " + phredScore); + } + return (char) (33 + phredScore); + } + +} \ No newline at end of file diff --git a/lib/edu/mit/broad/sam/apps/CompareSAMs.java b/lib/edu/mit/broad/sam/apps/CompareSAMs.java new file mode 100644 index 0000000000..8b0ca1b572 --- /dev/null +++ b/lib/edu/mit/broad/sam/apps/CompareSAMs.java @@ -0,0 +1,486 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2008 by the +* Broad 
Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.sam.apps; + +import edu.mit.broad.sam.*; + +import java.io.File; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class CompareSAMs { + public static void main(final String[] argv) { + if (argv.length != 2) { + System.err.println("ERROR: Incorrect number of arguments"); + usage(); + System.exit(1); + } + final CompareSAMs compareSAMs = new CompareSAMs(argv); + if (!compareSAMs.areEqual()) { + System.exit(1); + } + } + + private static void usage() { + System.err.println("USAGE: CompareSAMS "); + } + + private final String[] samFiles; + private final SAMFileReader[] samReaders = new SAMFileReader[2]; + private boolean sequenceDictionariesDiffer; + private int mappingsMatch = 0; + private int unmappedBoth = 0; + private int unmappedLeft = 0; + private int unmappedRight = 0; + private int mappingsDiffer = 0; + private int missingLeft = 0; + private int missingRight = 0; + private boolean areEqual; + + public CompareSAMs(final String[] samFiles) { + this.samFiles = samFiles; + for (int i = 0; i < samFiles.length; ++i) { + samReaders[i] = new SAMFileReader(new File(samFiles[i])); + } + areEqual = compareHeaders(); + areEqual = compareAlignments() && areEqual; + printReport(); + if (!areEqual) { + System.out.println("SAM files differ."); + } else { + System.out.println("SAM files match."); + } + } + + private void printReport() { + System.out.println("Match\t" + mappingsMatch); + System.out.println("Differ\t" + mappingsDiffer); + System.out.println("Unmapped_both\t" + unmappedBoth); + System.out.println("Unmapped_left\t" + unmappedLeft); + System.out.println("Unmapped_right\t" + unmappedRight); + System.out.println("Missing_left\t" + missingLeft); + 
System.out.println("Missing_right\t" + missingRight); + } + + private boolean compareAlignments() { + if (!compareValues(samReaders[0].getFileHeader().getSortOrder(), samReaders[1].getFileHeader().getSortOrder(), + "Sort Order")) { + System.out.println("Cannot compare alignments if sort orders differ."); + return false; + } + switch (samReaders[0].getFileHeader().getSortOrder()) { + case coordinate: + if (sequenceDictionariesDiffer) { + System.out.println("Cannot compare coordinate-sorted SAM files because sequence dictionaries differ."); + return false; + } + return compareCoordinateSortedAlignments(); + case queryname: + return compareQueryNameSortedAlignments(); + case unsorted: + return compareUnsortedAlignments(); + default: + // unreachable + assert(false); + return false; + } + } + + + private boolean compareCoordinateSortedAlignments() { + final NotPrimarySkippingIterator itLeft = + new NotPrimarySkippingIterator(samReaders[0].iterator()); + final NotPrimarySkippingIterator itRight = + new NotPrimarySkippingIterator(samReaders[1].iterator()); + + // Save any reads which haven't been matched during in-order scan. + final Map leftUnmatched = new HashMap(); + final Map rightUnmatched = new HashMap(); + + boolean ret = true; + + while (itLeft.hasCurrent()) { + if (!itRight.hasCurrent()) { + // Exhausted right side. See if any of the remaining left reads match + // any of the saved right reads. + for( ; itLeft.hasCurrent(); itLeft.advance()) { + final SAMRecord left = itLeft.getCurrent(); + final SAMRecord right = rightUnmatched.remove(left.getReadName()); + if (right == null) { + ++missingRight; + } else { + tallyAlignmentRecords(left, right); + } + } + break; + } + // Don't assume stability of order beyond the coordinate. Therefore grab all the + // reads from the left that has the same coordinate. 
+ final SAMRecord left = itLeft.getCurrent(); + final Map leftCurrentCoordinate = new HashMap(); + leftCurrentCoordinate.put(left.getReadName(), left); + while (itLeft.advance()) { + final SAMRecord nextLeft = itLeft.getCurrent(); + if (compareAlignmentCoordinates(left, nextLeft) == 0) { + leftCurrentCoordinate.put(nextLeft.getReadName(), nextLeft); + } else { + break; + } + } + // Advance the right iterator until it is >= the left reads that have just been grabbed + while (itRight.hasCurrent() && compareAlignmentCoordinates(left, itRight.getCurrent()) > 0) { + final SAMRecord right = itRight.getCurrent(); + rightUnmatched.put(right.getReadName(), right); + itRight.advance(); + } + // For each right read that has the same coordinate as the current left reads, + // see if there is a matching left read. If so, process and discard. If not, + // save the right read for later. + for (;itRight.hasCurrent() && compareAlignmentCoordinates(left, itRight.getCurrent()) == 0; itRight.advance()) { + final SAMRecord right = itRight.getCurrent(); + final SAMRecord matchingLeft = leftCurrentCoordinate.remove(right.getReadName()); + if (matchingLeft != null) { + ret = tallyAlignmentRecords(matchingLeft, right) && ret; + } else { + rightUnmatched.put(right.getReadName(), right); + } + } + + // Anything left in leftCurrentCoordinate has not been matched + for (final SAMRecord samRecord : leftCurrentCoordinate.values()) { + leftUnmatched.put(samRecord.getReadName(), samRecord); + } + } + // The left iterator has been exhausted. See if any of the remaining right reads + // match any of the saved left reads. + for( ; itRight.hasCurrent(); itRight.advance()) { + final SAMRecord right = itRight.getCurrent(); + final SAMRecord left = leftUnmatched.remove(right.getReadName()); + if (left != null) { + tallyAlignmentRecords(left, right); + } else { + ++missingLeft; + } + } + + // Look up reads that were unmatched from left, and see if they are in rightUnmatched. 
+ // If found, remove from rightUnmatched and tally. + for (final Map.Entry leftEntry : leftUnmatched.entrySet()) { + final String readName = leftEntry.getKey(); + final SAMRecord left = leftEntry.getValue(); + final SAMRecord right = rightUnmatched.remove(readName); + if (right == null) { + ++missingRight; + continue; + } + tallyAlignmentRecords(left, right); + } + + // Any elements remaining in rightUnmatched are guaranteed not to be in leftUnmatched. + missingLeft += rightUnmatched.size(); + + if (ret) { + if (missingLeft > 0 || missingRight > 0 || mappingsDiffer > 0 || unmappedLeft > 0 || unmappedRight > 0) { + ret = false; + } + } + return ret; + } + + private int compareAlignmentCoordinates(final SAMRecord left, final SAMRecord right) { + final String leftReferenceName = left.getReferenceName(); + final String rightReferenceName = right.getReferenceName(); + if (leftReferenceName == null && rightReferenceName == null) { + return 0; + } else if (leftReferenceName == null) { + return 1; + } else if (rightReferenceName == null) { + return -1; + } + final int leftReferenceIndex = samReaders[0].getFileHeader().getSequenceIndex(leftReferenceName); + final int rightReferenceIndex = samReaders[0].getFileHeader().getSequenceIndex(rightReferenceName); + assert(leftReferenceIndex >= 0); + assert(rightReferenceIndex >= 0); + if (leftReferenceIndex != rightReferenceIndex) { + return leftReferenceIndex - rightReferenceIndex; + } + return left.getAlignmentStart() - right.getAlignmentStart(); + } + + private boolean compareQueryNameSortedAlignments() { + final NotPrimarySkippingIterator it1 = new NotPrimarySkippingIterator(samReaders[0].iterator()); + final NotPrimarySkippingIterator it2 = new NotPrimarySkippingIterator(samReaders[1].iterator()); + + boolean ret = true; + while (it1.hasCurrent()) { + if (!it2.hasCurrent()) { + missingRight += countRemaining(it1); + return false; + } + final int cmp = it1.getCurrent().getReadName().compareTo(it2.getCurrent().getReadName()); + 
if (cmp < 0) { + ++missingRight; + it1.advance(); + ret = false; + } else if (cmp > 0) { + ++missingLeft; + it2.advance(); + ret = false; + } else { + if (!tallyAlignmentRecords(it1.getCurrent(), it2.getCurrent())) { + ret = false; + } + it1.advance(); + it2.advance(); + } + } + if (it2.hasCurrent()) { + missingLeft += countRemaining(it2); + return false; + } + return ret; + } + + private boolean compareUnsortedAlignments() { + final NotPrimarySkippingIterator it1 = new NotPrimarySkippingIterator(samReaders[0].iterator()); + final NotPrimarySkippingIterator it2 = new NotPrimarySkippingIterator(samReaders[1].iterator()); + boolean ret = true; + for (; it1.hasCurrent(); it1.advance(), it2.advance()) { + if (!it2.hasCurrent()) { + missingRight += countRemaining(it1); + return false; + } + final SAMRecord s1 = it1.getCurrent(); + final SAMRecord s2 = it2.getCurrent(); + if (!compareValues(s1.getReadName(), s2.getReadName(), "Read names")) { + System.out.println("Read names cease agreeing in unsorted SAM files . 
Comparison aborting."); + } + ret = tallyAlignmentRecords(s1, s2) && ret; + } + + if (it2.hasCurrent()) { + missingLeft += countRemaining(it2); + return false; + } + return ret; + } + + private int countRemaining(final NotPrimarySkippingIterator it) { + int i; + for (i = 0; it.hasCurrent(); ++i) { + it.advance(); + } + return i; + } + + private boolean tallyAlignmentRecords(final SAMRecord s1, final SAMRecord s2) { + assert (s1.getReadName().equals(s2.getReadName())); + if (s1.getReadUnmappedFlag() && s2.getReadUnmappedFlag()) { + ++unmappedBoth; + return true; + } + if (s1.getReadUnmappedFlag()) { + ++unmappedLeft; + return false; + } + if (s2.getReadUnmappedFlag()) { + ++unmappedRight; + return false; + } + final boolean ret = (s1.getReferenceName().equals(s2.getReferenceName()) && + s1.getAlignmentStart() == s2.getAlignmentStart() && + s1.getReadNegativeStrandFlag() == s1.getReadNegativeStrandFlag()); + if (!ret) { + ++mappingsDiffer; + } else { + ++mappingsMatch; + } + return ret; + } + + + private boolean compareHeaders() { + final SAMFileHeader h1 = samReaders[0].getFileHeader(); + final SAMFileHeader h2 = samReaders[1].getFileHeader(); + boolean ret = compareValues(h1.getVersion(), h2.getVersion(), "File format version"); + ret = compareValues(h1.getCreator(), h2.getCreator(), "File creator") && ret; + ret = compareValues(h1.getAttribute("SO"), h2.getAttribute("SO"), "Sort order") && ret; + if (!compareSequenceDictionaries(h1, h2)) { + ret = false; + sequenceDictionariesDiffer = true; + } + ret = compareReadGroups(h1, h2) && ret; + ret = compareProgramRecords(h1, h2) && ret; + return ret; + } + + private boolean compareProgramRecords(final SAMFileHeader h1, final SAMFileHeader h2) { + final List l1 = h1.getProgramRecords(); + final List l2 = h2.getProgramRecords(); + if (!compareValues(l1.size(), l2.size(), "Number of read groups")) { + return false; + } + boolean ret = true; + for (int i = 0; i < l1.size(); ++i) { + ret = compareProgramRecord(l1.get(i), 
l2.get(i)) && ret; + } + return ret; + } + + private boolean compareProgramRecord(final SAMProgramRecord programRecord1, final SAMProgramRecord programRecord2) { + if (programRecord1 == null && programRecord2 == null) { + return true; + } + if (programRecord1 == null) { + reportDifference("null", programRecord2.getProgramGroupId(), "Program Record"); + return false; + } + if (programRecord2 == null) { + reportDifference(programRecord1.getProgramGroupId(), "null", "Program Record"); + return false; + } + boolean ret = compareValues(programRecord1.getProgramGroupId(), programRecord2.getProgramGroupId(), + "Program Name"); + final String[] attributes = {"VN", "CL"}; + for (final String attribute: attributes) { + ret = compareValues(programRecord1.getAttribute(attribute), programRecord2.getAttribute(attribute), + attribute + " Program Record attribute") && ret; + } + return ret; + } + + private boolean compareReadGroups(final SAMFileHeader h1, final SAMFileHeader h2) { + final List l1 = h1.getReadGroups(); + final List l2 = h2.getReadGroups(); + if (!compareValues(l1.size(), l2.size(), "Number of read groups")) { + return false; + } + boolean ret = true; + for (int i = 0; i < l1.size(); ++i) { + ret = compareReadGroup(l1.get(i), l2.get(i)) && ret; + } + return ret; + } + + private boolean compareReadGroup(final SAMReadGroupRecord samReadGroupRecord1, final SAMReadGroupRecord samReadGroupRecord2) { + boolean ret = compareValues(samReadGroupRecord1.getReadGroupId(), samReadGroupRecord2.getReadGroupId(), + "Read Group ID"); + ret = compareValues(samReadGroupRecord1.getSample(), samReadGroupRecord2.getSample(), + "Sample for read group " + samReadGroupRecord1.getReadGroupId()) && ret; + ret = compareValues(samReadGroupRecord1.getLibrary(), samReadGroupRecord2.getLibrary(), + "Library for read group " + samReadGroupRecord1.getReadGroupId()) && ret; + final String[] attributes = {"DS", "PU", "PI", "CN", "DT", "PL"}; + for (final String attribute : attributes) { + ret = 
compareValues(samReadGroupRecord1.getAttribute(attribute), samReadGroupRecord2.getAttribute(attribute), + attribute + " for read group " + samReadGroupRecord1.getReadGroupId()) && ret; + } + return ret; + } + + private boolean compareSequenceDictionaries(final SAMFileHeader h1, final SAMFileHeader h2) { + final List s1 = h1.getSequences(); + final List s2 = h2.getSequences(); + if (s1.size() != s2.size()) { + reportDifference(s1.size(), s2.size(), "Length of sequence dictionaries"); + return false; + } + boolean ret = true; + for (int i = 0; i < s1.size(); ++i) { + ret = compareSequenceRecord(s1.get(i), s2.get(i), i+1) && ret; + } + return ret; + } + + private boolean compareSequenceRecord(final SAMSequenceRecord sequenceRecord1, final SAMSequenceRecord sequenceRecord2, final int which) { + if (!sequenceRecord1.getSequenceName().equals(sequenceRecord2.getSequenceName())) { + reportDifference(sequenceRecord1.getSequenceName(), sequenceRecord2.getSequenceName(), + "Name of sequence record " + which); + return false; + } + boolean ret = compareValues(sequenceRecord1.getSequenceLength(), sequenceRecord2.getSequenceLength(), "Length of sequence " + + sequenceRecord1.getSequenceName()); + ret = compareValues(sequenceRecord1.getSpecies(), sequenceRecord2.getSpecies(), "Species of sequence " + + sequenceRecord1.getSequenceName()) && ret; + ret = compareValues(sequenceRecord1.getAssembly(), sequenceRecord2.getAssembly(), "Assembly of sequence " + + sequenceRecord1.getSequenceName()) && ret; + ret = compareValues(sequenceRecord1.getAttribute("M5"), sequenceRecord2.getAttribute("M5"), "MD5 of sequence " + + sequenceRecord1.getSequenceName()) && ret; + ret = compareValues(sequenceRecord1.getAttribute("UR"), sequenceRecord2.getAttribute("UR"), "URI of sequence " + + sequenceRecord1.getSequenceName()) && ret; + return ret; + } + + private boolean compareValues(final T v1, final T v2, final String label) { + if (v1 == null) { + if (v2 == null) { + return true; + } + 
reportDifference(v1, v2, label); + return false; + } + if (v2 == null) { + reportDifference(v1, v2, label); + return false; + } + if (!v1.equals(v2)) { + reportDifference(v1, v2, label); + return false; + } + return true; + } + + private void reportDifference(final String s1, final String s2, final String label) { + System.out.println(label + " differs."); + System.out.println(samFiles[0] + ": " + s1); + System.out.println(samFiles[1] + ": " + s2); + } + private void reportDifference(Object o1, Object o2, final String label) { + if (o1 == null) { + o1 = "null"; + } + if (o2 == null) { + o2 = "null"; + } + reportDifference(o1.toString(), o2.toString(), label); + } + + public int getMappingsMatch() { + return mappingsMatch; + } + + public int getUnmappedBoth() { + return unmappedBoth; + } + + public int getUnmappedLeft() { + return unmappedLeft; + } + + public int getUnmappedRight() { + return unmappedRight; + } + + public int getMappingsDiffer() { + return mappingsDiffer; + } + + public int getMissingLeft() { + return missingLeft; + } + + public int getMissingRight() { + return missingRight; + } + + public boolean areEqual() { + return areEqual; + } +} diff --git a/lib/edu/mit/broad/sam/apps/allelecaller/AbstractAlleleCaller.java b/lib/edu/mit/broad/sam/apps/allelecaller/AbstractAlleleCaller.java new file mode 100644 index 0000000000..9265d539c4 --- /dev/null +++ b/lib/edu/mit/broad/sam/apps/allelecaller/AbstractAlleleCaller.java @@ -0,0 +1,166 @@ +package edu.mit.broad.sam.apps.allelecaller; + +import edu.mit.broad.sam.SAMLocusIterator; +import edu.mit.broad.arachne.FastbReader; + +import java.io.IOException; +import java.io.BufferedWriter; +import java.io.File; +import java.util.SortedSet; +import java.util.List; + +/** + * Base class for AlleleCallers. 
Handles efficient access to the reference, output of data to a + * standard file format, and application of priors + */ +public abstract class AbstractAlleleCaller { + // writer for output + private final BufferedWriter writer; + + // for providing access to reference data + // TODO: replace with standard mechanism when defined/implemented + private final FastbReader fastbReader; + private String cachedChromName; + private String cachedChrom; + + public AbstractAlleleCaller(final File fastbReference, final BufferedWriter writer) throws IOException { + this.writer = writer; + this.fastbReader = new FastbReader(fastbReference); + } + + + /** + * emit allele calls to the writer specified in the constructor + * + * @param li Locus to call + */ + public void callAlleles(final SAMLocusIterator.LocusInfo li) throws IOException { + + // TODO: replace with standard mechanism when defined/implemented (making use of SAM Header) + // make sure we have access to reference chrom information + if (!li.getChrom().equals(cachedChromName)) { + final int contig = translateChromToContig(li.getChrom()); + cachedChrom = null; // CRITICAL -- to allow for GC + cachedChrom = fastbReader.readSequence(contig); + cachedChromName = li.getChrom(); + } + + final char ref = cachedChrom.charAt(li.getPosition() - 1); + + + // delegate to the specific implementation + final SortedSet likelihoods = call(ref, li.getBasesAsString(), li.getQualities()); + + + final GenotypeTheory bestTheory = likelihoods.first(); + GenotypeTheory nextBestTheory = null; + GenotypeTheory refTheory = null; + final String refString = new String(new char[]{ref,ref}); + final DiploidGenotype refGenotype = DiploidGenotype.valueOf(refString); + + + final StringBuilder theoryString = new StringBuilder(); + int k=0; + for(final GenotypeTheory t : likelihoods) { + if (k == 1) { nextBestTheory = t; } + if (t.getGenotype() == refGenotype) { refTheory = t; } + + theoryString.append(t.getGenotype()) + .append(":") + 
.append(String.format("%.2f",t.getLikelihood())) + .append(" "); + k++; + } + + final double btnb = bestTheory.getLikelihood() - nextBestTheory.getLikelihood(); + final double btr = bestTheory.getLikelihood() - refTheory.getLikelihood(); + + final DiploidGenotype gt = likelihoods.first().getGenotype(); + + final String type; + if (!gt.isHet() && gt.getAllele1() == ref) { + type = "homozygous"; + } else if (!gt.isHet() && gt.getAllele1() != ref) { + type = "homozygous-SNP"; + } else { + type = "heterozygous-SNP"; + } + + final String bases = li.getBasesAsString(); + int a = 0,c = 0,g = 0,t = 0; + for(int i=0; i call(char ref, String bases, List quals); + + + /** + * Apply a general population-based prior to the likelihood: + *
      + *
    • ref is .999
    • + *
    • het is 10^-3
    • + *
    • homozygous, non-reference is 10^-5
    • + * + * @param ref reference allele + * @param allele1 first allele of the genotype + * @param allele2 second allele of the genotype + * @return prior, given the reference and genotype alleles + */ + protected double getPrior(final char ref, final DiploidGenotype gt) { + final double prior; + if (gt.isHom() && gt.getAllele1() == ref) { + prior = 0.999; // reference + } else { + if (gt.getAllele1() != ref && gt.getAllele2() != ref) { + prior = 0.00001; // neither base is reference + } else { + prior = 0.001; // het, one base is reference + } + } + return prior; + } + + // -------------------------------------------------------------------------------------------- + // Helper methods below this point... + // -------------------------------------------------------------------------------------------- + + + private final String[] chroms = new String[]{"chrM","chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chrX","chrY","chr1_random","chr2_random","chr3_random","chr4_random","chr5_random","chr6_random","chr7_random","chr8_random","chr9_random","chr10_random","chr11_random","chr13_random","chr15_random","chr16_random","chr17_random","chr18_random","chr19_random","chr21_random","chr22_random","chrX_random"}; + private int translateChromToContig(final String chrom) { + for(int i=0; i "); + } + + + private SAMFileReader getSamReader(final File samFile) { + final SAMFileReader samReader = new SAMFileReader(samFile); + + // ensure the file is sorted + if (samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) { + System.out.println("SAM Files must be coordinate-sorted, this is " + samReader.getFileHeader().getSortOrder()); + System.exit(1); + } + + return samReader; + } + +} \ No newline at end of file diff --git a/lib/edu/mit/broad/sam/apps/allelecaller/DiploidGenotype.java 
b/lib/edu/mit/broad/sam/apps/allelecaller/DiploidGenotype.java new file mode 100644 index 0000000000..d259a60752 --- /dev/null +++ b/lib/edu/mit/broad/sam/apps/allelecaller/DiploidGenotype.java @@ -0,0 +1,27 @@ +package edu.mit.broad.sam.apps.allelecaller; + +public enum DiploidGenotype { + AA('A','A'), + AC('A','C'), + AG('A','G'), + AT('A','T'), + CC('C','C'), + CG('C','G'), + CT('C','T'), + GG('G','G'), + GT('G','T'), + TT('T','T'); + + private final char allele1; + private final char allele2; + + private DiploidGenotype(final char allele1, final char allele2) { + this.allele1 = allele1; + this.allele2 = allele2; + } + + public char getAllele1() { return allele1; } + public char getAllele2() { return allele2; } + public boolean isHet() { return this.allele1 != this.allele2; } + public boolean isHom() { return this.allele1 == this.allele2; } +} diff --git a/lib/edu/mit/broad/sam/apps/allelecaller/FlatQualityAlleleCaller.java b/lib/edu/mit/broad/sam/apps/allelecaller/FlatQualityAlleleCaller.java new file mode 100644 index 0000000000..7a77d4524d --- /dev/null +++ b/lib/edu/mit/broad/sam/apps/allelecaller/FlatQualityAlleleCaller.java @@ -0,0 +1,74 @@ +package edu.mit.broad.sam.apps.allelecaller; + +import java.io.IOException; +import java.io.BufferedWriter; +import java.io.File; +import java.util.*; +import static java.lang.Math.*; + + +/** + * Bayesian-based allele caller using flat qualities and a 1e-3 error rate, based on CRD algorithm + */ +public class FlatQualityAlleleCaller extends AbstractAlleleCaller { + + public FlatQualityAlleleCaller(final File fastbReference, final BufferedWriter writer) throws IOException { + super(fastbReference, writer); + } + + + protected SortedSet call(final char ref, final String bases, final List quals) { + final float eps = 1e-3f; + + // count up the base by nucleotide and put them into a map + final int depth = bases.length(); + int a = 0,c = 0,g = 0,t = 0; + for(int i=0; i< bases.length(); i++) { + if (bases.charAt(i) == 'A') 
{ a++; } + else if (bases.charAt(i) == 'C') { c++; } + else if (bases.charAt(i) == 'G') { g++; } + else if (bases.charAt(i) == 'T') { t++; } + else { throw new RuntimeException("Unknown Base " + bases.charAt(i)); } + } + + final Map counts = new HashMap(); + counts.put('A', a); + counts.put('C', c); + counts.put('G', g); + counts.put('T', t); + + + // for each of the 10 theories, calculate the likelihood + final SortedSet results = new TreeSet(); + for(final DiploidGenotype theory : DiploidGenotype.values()) { + final double likelihood; + final char allele1 = theory.getAllele1(); + final char allele2 = theory.getAllele2(); + + if (!theory.isHet()) { + likelihood = log10(1-eps)*counts.get(allele1) + log10(eps)*(depth - counts.get(allele1)); + } else { + final int major_allele_counts; + final int minor_allele_counts; + if (counts.get(allele1) > counts.get(allele2)) { + major_allele_counts = counts.get(allele1); + minor_allele_counts = counts.get(allele2); + } else { + major_allele_counts = counts.get(allele2); + minor_allele_counts = counts.get(allele1); + } + + likelihood = log10(0.5 - (eps/2.0) )*major_allele_counts + + log10(0.5 - (eps/2.0) )*minor_allele_counts + + log10(eps)*(depth - major_allele_counts - minor_allele_counts); + } + + final double prior = getPrior(ref, theory); + results.add(new GenotypeTheory(theory, likelihood + log10(prior))); + } + + + return results; + + } +} diff --git a/lib/edu/mit/broad/sam/apps/allelecaller/GenotypeTheory.java b/lib/edu/mit/broad/sam/apps/allelecaller/GenotypeTheory.java new file mode 100644 index 0000000000..709e1c4397 --- /dev/null +++ b/lib/edu/mit/broad/sam/apps/allelecaller/GenotypeTheory.java @@ -0,0 +1,46 @@ +package edu.mit.broad.sam.apps.allelecaller; + +/** + * Datastructure to hold a single genotype along with a likelihood. 
+ */ +public class GenotypeTheory implements Comparable { + private DiploidGenotype genotype; + private double likelihood; + + public GenotypeTheory(final DiploidGenotype genotype, final double likelihood) { + this.genotype = genotype; + this.likelihood = likelihood; + } + + public DiploidGenotype getGenotype() { + return genotype; + } + + public void setGenotype(final DiploidGenotype genotype) { + this.genotype = genotype; + } + + public double getLikelihood() { + return likelihood; + } + + public void setLikelihood(final double likelihood) { + this.likelihood = likelihood; + } + + /** + * Genotype Theories are sorted first by descending likelihood (ie + * the GenotypeTheory with biggest likelihood comes first). Ties are + * broken by lexical sorting of the genotypes themselves + * + */ + public int compareTo(final GenotypeTheory other) { + if (this.getLikelihood() == other.getLikelihood()) { + return this.getGenotype().compareTo(other.getGenotype()); + } else if (this.getLikelihood() > other.getLikelihood()) { + return -1; + } else { + return 1; + } + } +} diff --git a/lib/edu/mit/broad/sam/apps/allelecaller/QualityScoreAlleleCaller.java b/lib/edu/mit/broad/sam/apps/allelecaller/QualityScoreAlleleCaller.java new file mode 100644 index 0000000000..23b310bd2a --- /dev/null +++ b/lib/edu/mit/broad/sam/apps/allelecaller/QualityScoreAlleleCaller.java @@ -0,0 +1,80 @@ +package edu.mit.broad.sam.apps.allelecaller; + +import java.util.*; +import static java.lang.Math.log10; +import static java.lang.Math.pow; +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.File; + +/** + * Bayesian-based allele caller using quality scores, based on CRD algorithm + */ +public class QualityScoreAlleleCaller extends AbstractAlleleCaller { + + public QualityScoreAlleleCaller(final File fastbReference, final BufferedWriter writer) throws IOException { + super(fastbReference, writer); + } + + protected SortedSet call(final char ref, final String bases, final List 
quals) { + + // for each of the 10 theories, calculate the likelihood using quality scores + final SortedSet results = new TreeSet(); + for(final DiploidGenotype theory : DiploidGenotype.values()) { + double likelihood = 0; + + for(int i=0; i 0) + { + ++lineNumber; + return StringUtil.bytesToString(lineBuffer, 0, linePosition); + } else + { + return null; + } + } + } + + + final byte b = buffer[nextChar++]; + if (b == LINEFEED || b == CARRIAGE_RETURN) + { + + if (includeTerminators) + { + lineBuffer[linePosition++] = b; + if (b == CARRIAGE_RETURN && peek() == LINEFEED) + { + lineBuffer[linePosition++] = b; + nextChar++; // <= to account for the '\n' we just ate + } + } + else { + if (b == CARRIAGE_RETURN && peek() == LINEFEED) + { + nextChar++; // <= skip the trailing \n in case of \r\n termination + } + + } + ++lineNumber; + return StringUtil.bytesToString(lineBuffer, 0, linePosition); + } else + { + // Expand line buffer size if neccessary. Reservce at least 2 characters + // for potential line-terminators in return string + + if (linePosition > (lineBuffer.length - 3)) + { + final byte[] temp = new byte[lineBuffer.length + 100]; + System.arraycopy(lineBuffer, 0, temp, 0, lineBuffer.length); + lineBuffer = temp; + } + + lineBuffer[linePosition++] = b; + } + } + } + + public int getLineNumber() { + return lineNumber; + } + + /** + * Peek ahead one character, filling from the underlying stream if neccessary. + * + * @return + * @throws java.io.IOException + */ + private byte peek(){ + // Refill buffer if neccessary + if (nextChar == nChars) + { + fill(); + if (nextChar == nChars) + { + // eof reached. 
+ return 0; + } + } + return buffer[nextChar]; + + } + + private void fill() { + try { + nChars = is.read(buffer); + nextChar = 0; + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + public void close() { + try { + is.close(); + } catch (IOException e) { + // Ignore exception + } + } +} + diff --git a/lib/edu/mit/broad/sam/util/AsciiWriter.java b/lib/edu/mit/broad/sam/util/AsciiWriter.java new file mode 100644 index 0000000000..8395cf84d2 --- /dev/null +++ b/lib/edu/mit/broad/sam/util/AsciiWriter.java @@ -0,0 +1,55 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.sam.util; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.Writer; + +/** + * Fast (I hope) Writer that converts char to byte merely by casting, rather than charset conversion. + */ +public class AsciiWriter extends Writer { + + private final OutputStream os; + // Buffer size has not been tuned. 
+ private final byte[] buffer = new byte[10000]; + private int numBytes; + + public AsciiWriter(final OutputStream os) { + this.os = os; + numBytes = 0; + } + + public void close() throws IOException { + flush(); + os.close(); + } + + public void flush() throws IOException { + os.write(buffer, 0, numBytes); + numBytes = 0; + os.flush(); + } + + public void write(final char[] chars, int offset, int length) throws IOException { + while (length > 0) { + final int charsToConvert = Math.min(length, buffer.length - numBytes); + StringUtil.charsToBytes(chars, offset, charsToConvert, buffer, numBytes); + numBytes += charsToConvert; + offset += charsToConvert; + length -= charsToConvert; + if (numBytes == buffer.length) { + os.write(buffer, 0, numBytes); + numBytes = 0; + } + } + } +} diff --git a/lib/edu/mit/broad/sam/util/BinaryCodec.java b/lib/edu/mit/broad/sam/util/BinaryCodec.java new file mode 100644 index 0000000000..18191a257d --- /dev/null +++ b/lib/edu/mit/broad/sam/util/BinaryCodec.java @@ -0,0 +1,478 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.sam.util; + +import java.io.*; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +/** + * @author Dave Tefft + */ +public class BinaryCodec { + + //Outstream to write to + private OutputStream outputStream; + //If a file or filename was given it will be stored here + private String outputFileName; + + //Input stream to read from + private InputStream inputStream; + //If a file or filename was give to read from it will be stored here + private String inputFileName; + + /* + Mode that the BinaryCodec is in. 
It is either writing to a binary file or reading from. + This is set to true if it is writing to a binary file + Right now we don't support reading and writing to the same file with the same BinaryCodec instance + */ + private boolean isWriting; + + private ByteBuffer byteBuffer; + + //Byte order used for the Picard project + private static final ByteOrder LITTLE_ENDIAN = ByteOrder.LITTLE_ENDIAN; + private static final byte NULL_BYTE[] = {0}; + + private static final long MAX_UBYTE = (Byte.MAX_VALUE + 1) * 2; + private static final long MAX_USHORT = (Short.MAX_VALUE + 1) * 2; + private static final long MAX_UINT = ((long)Integer.MAX_VALUE + 1) * 2; + + // We never serialize more than this much at a time. + private static final int MAX_BYTE_BUFFER = 8; + + ////////////////////////////////////////////////// + // Constructors // + ////////////////////////////////////////////////// + + /** + * Constructs BinaryCodec from a file and set it's mode to writing or not + * + * @param file file to be written to or read from + * @param writing whether the file is being written to + */ + public BinaryCodec(final File file, final boolean writing) { + try { + this.isWriting = writing; + if (this.isWriting) { + this.outputStream = new FileOutputStream(file); + this.outputFileName = file.getName(); + } else { + this.inputStream = new FileInputStream(file); + this.inputFileName = file.getName(); + } + } catch (FileNotFoundException e) { + throw new RuntimeIOException("File not found: " + file, e); + } + initByteBuffer(); + } + + /** + * Constructs BinaryCodec from a file name and set it's mode to writing or not + * + * @param fileName name of the file to be written to or read from + * @param writing writing whether the file is being written to + */ + public BinaryCodec(final String fileName, final boolean writing) { + this(new File(fileName), writing); + } + + /** + * Constructs BinaryCodec from an output stream + * + * @param outputStream Stream to write to, since it's an output 
stream we know that isWriting + * should be set to true + */ + public BinaryCodec(final OutputStream outputStream) { + isWriting = true; + this.outputStream = outputStream; + initByteBuffer(); + } + + /** + * Constructs BinaryCodec from an input stream + * + * @param inputStream Stream to read from, since we are reading isWriting is set to false + */ + public BinaryCodec(final InputStream inputStream) { + isWriting = false; + this.inputStream = inputStream; + initByteBuffer(); + } + + /** + * Shared among ctors + */ + private void initByteBuffer() { + byteBuffer = ByteBuffer.allocate(MAX_BYTE_BUFFER); + byteBuffer.order(LITTLE_ENDIAN); + } + + ////////////////////////////////////////////////// + // Writing methods // + ////////////////////////////////////////////////// + + + /** + * Write whatever has been put into the byte buffer + * @param numBytes -- how much to write. Note that in case of writing an unsigned value, + * more bytes were put into the ByteBuffer than will get written out. 
+ */ + private void writeByteBuffer(final int numBytes) { + assert(numBytes <= byteBuffer.limit()); + writeBytes(byteBuffer.array(), 0, numBytes); + } + + /** + * Writes a byte to the output buffer + * + * @param bite byte array to write + */ + public void writeByte(final byte bite) { + byteBuffer.clear(); + byteBuffer.put(bite); + writeByteBuffer(1); + } + + public void writeByte(final int b) { + writeByte((byte)b); + } + + /** + * Writes a byte array to the output buffer + * + * @param bytes byte array to write + */ + public void writeBytes(final byte[] bytes) { + writeBytes(bytes, 0, bytes.length); + } + + public void writeBytes(final byte[] bytes, final int startOffset, final int numBytes) { + if (!isWriting) { + throw new IllegalStateException("Calling write method on BinaryCodec open for read."); + } + try { + outputStream.write(bytes, startOffset, numBytes); + } catch (IOException e) { + throw new RuntimeIOException(constructErrorMessage("Write error"), e); + } + } + + /** + * Write an int to the output stream + * + * @param value int to write + */ + public void writeInt(final int value) { + byteBuffer.clear(); + byteBuffer.putInt(value); + writeByteBuffer(4); + } + + /** + * Write a double to the output stream + * + * @param value double to write + */ + public void writeDouble(final double value) { + byteBuffer.clear(); + byteBuffer.putDouble(value); + writeByteBuffer(8); + } + + /** + * Write a long to the output stream + * + * @param value long to write + */ + public void writeLong(final long value) { + byteBuffer.clear(); + byteBuffer.putLong(value); + writeByteBuffer(8); + } + + + public void writeShort(final short value) { + byteBuffer.clear(); + byteBuffer.putShort(value); + writeByteBuffer(2); + } + + /** + * Write a float to the output stream + * + * @param value float to write + */ + public void writeFloat(final float value) { + byteBuffer.clear(); + byteBuffer.putFloat(value); + writeByteBuffer(4); + } + + /** + * Writes a string to the buffer + * 
+ * @param value string to write to buffer + * @param writeLength prefix the string with the length as an int + * @param appendNull add a null byte to the end of the string + */ + public void writeString(final String value, final boolean writeLength, final boolean appendNull) { + if (writeLength) { + int lengthToWrite = value.length(); + if (appendNull) lengthToWrite++; + writeInt(lengthToWrite); + } + + //Actually writes the string to a buffer + writeString(value); + + if (appendNull) writeBytes(NULL_BYTE); + + } + + + /** + * Write a string to the buffer + * + * @param value string to write + */ + private void writeString(final String value) { + writeBytes(StringUtil.stringToBytes(value)); + } + + // NOTE: The unsigned methods all have little-endianness built into them. + public void writeUByte(final short val) { + if (val < 0) { + throw new IllegalArgumentException("Negative value (" + val + ") passed to unsigned writing method."); + } + if (val > MAX_UBYTE) { + throw new IllegalArgumentException("Value (" + val + ") to large to be written as ubyte."); + } + byteBuffer.clear(); + byteBuffer.putShort(val); + writeByteBuffer(1); + } + + public void writeUShort(final int val) { + if (val < 0) { + throw new IllegalArgumentException("Negative value (" + val + ") passed to unsigned writing method."); + } + if (val > MAX_USHORT) { + throw new IllegalArgumentException("Value (" + val + ") to large to be written as ushort."); + } + byteBuffer.clear(); + byteBuffer.putInt(val); + writeByteBuffer(2); + } + + public void writeUInt(final long val) { + if (val < 0) { + throw new IllegalArgumentException("Negative value (" + val + ") passed to unsigned writing method."); + } + if (val > MAX_UINT) { + throw new IllegalArgumentException("Value (" + val + ") to large to be written as uint."); + } + byteBuffer.clear(); + byteBuffer.putLong(val); + writeByteBuffer(4); + } + + ////////////////////////////////////////////////// + // Reading methods // + 
////////////////////////////////////////////////// + + /** + * Read a byte array off the input stream + * + * @return number of bytes read + */ + public void readBytes(final byte[] buffer) { + readBytes(buffer, 0, buffer.length); + } + + public void readBytes(final byte[] buffer, final int offset, final int length) { + final int numRead = readBytesOrFewer(buffer, offset, length); + if (numRead < length) { + throw new RuntimeEOFException(constructErrorMessage("Premature EOF")); + } + } + + public int readBytesOrFewer(final byte[] buffer, final int offset, final int length) { + if (isWriting) { + throw new IllegalStateException("Calling read method on BinaryCodec open for write."); + } + try { + return inputStream.read(buffer, offset, length); + } catch (IOException e) { + throw new RuntimeIOException(constructErrorMessage("Read error"), e); + } + } + + public byte readByte() { + readByteBuffer(1); + byteBuffer.flip(); + return byteBuffer.get(); + } + + /** + * Read a string off the input stream + * + * @param length length of string to read + * @return String read from stream + */ + public String readString(final int length) { + final byte[] buffer = new byte[length]; + readBytes(buffer); + + return StringUtil.bytesToString(buffer); + } + + public String readNullTerminatedString() { + return StringUtil.readNullTerminatedString(this); + } + + private void readByteBuffer(final int numBytes) { + assert(numBytes <= byteBuffer.capacity()); + readBytes(byteBuffer.array(), 0, numBytes); + byteBuffer.limit(byteBuffer.capacity()); + byteBuffer.position(numBytes); + } + + /** + * Read an int off the input stream + * + * @return int from input stream + */ + public int readInt() { + readByteBuffer(4); + byteBuffer.flip(); + return byteBuffer.getInt(); + } + + /** + * Reads a double off the input stream + * + * @return double + */ + public double readDouble() { + readByteBuffer(8); + byteBuffer.flip(); + return byteBuffer.getDouble(); + } + + /** + * Reads a long off the input 
stream + * + * @return long + */ + public long readLong() { + readByteBuffer(8); + byteBuffer.flip(); + return byteBuffer.getLong(); + } + + public short readShort() { + readByteBuffer(2); + byteBuffer.flip(); + return byteBuffer.getShort(); + } + + /** + * Reads a float off the input stream + * + * @return float + */ + public float readFloat() { + readByteBuffer(4); + byteBuffer.flip(); + return byteBuffer.getFloat(); + } + + public short readUByte() { + readByteBuffer(1); + byteBuffer.put((byte)0); + byteBuffer.flip(); + return byteBuffer.getShort(); + } + + public int readUShort() { + readByteBuffer(2); + byteBuffer.putShort((short)0); + byteBuffer.flip(); + return byteBuffer.getInt(); + } + + public long readUInt() { + readByteBuffer(4); + byteBuffer.putInt(0); + byteBuffer.flip(); + return byteBuffer.getLong(); + } + + /** + * Close the appropriate stream + */ + public void close() { + try { + if (this.isWriting) this.outputStream.close(); + else this.inputStream.close(); + } catch (IOException e) { + throw new RuntimeIOException(e.getMessage(), e); + } + } + + private String constructErrorMessage(final String msg) { + final StringBuilder sb = new StringBuilder(msg); + sb.append("; BinaryCodec in "); + sb.append(isWriting? "write": "read"); + sb.append("mode; "); + final String filename = isWriting? 
outputFileName: inputFileName; + if (filename != null) { + sb.append("file: "); + sb.append(filename); + } else { + sb.append("streamed file (filename not available)"); + } + return sb.toString(); + } + + ////////////////////////////////////////////////// + // Some getters // + ////////////////////////////////////////////////// + + + public String getInputFileName() { + return inputFileName; + } + + public String getOutputFileName() { + return outputFileName; + } + + public void setOutputFileName(final String outputFileName) { + this.outputFileName = outputFileName; + } + + public void setInputFileName(final String inputFileName) { + this.inputFileName = inputFileName; + } + + public boolean isWriting() { + return isWriting; + } + + public OutputStream getOutputStream() { + return outputStream; + } + + public InputStream getInputStream() { + return inputStream; + } +} diff --git a/lib/edu/mit/broad/sam/util/BlockCompressedInputStream.java b/lib/edu/mit/broad/sam/util/BlockCompressedInputStream.java new file mode 100755 index 0000000000..626e5c17c0 --- /dev/null +++ b/lib/edu/mit/broad/sam/util/BlockCompressedInputStream.java @@ -0,0 +1,258 @@ +/* + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2008 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. + * Neither the Broad Institute nor MIT can be responsible for its use, misuse, + * or functionality. + */ +package edu.mit.broad.sam.util; + + +import java.io.*; +import java.util.zip.GZIPInputStream; + +/* + * Utility class for reading BGZF block compressed files. 
+ */ +public class BlockCompressedInputStream + extends InputStream +{ + + private InputStream mStream = null; + private RandomAccessFile mFile = null; + private byte[] mFileBuffer = null; + private byte[] mCurrentBlock = null; + private int mCurrentOffset = 0; + private long mBlockAddress = 0; + private int mLastBlockLength = 0; + + + public BlockCompressedInputStream(final InputStream stream) { + mStream = toBufferedStream(stream); + mFile = null; + } + + public BlockCompressedInputStream(final File file) + throws IOException { + mFile = new RandomAccessFile(file, "r"); + mStream = null; + } + + public int available() + throws IOException { + if (mCurrentBlock == null || mCurrentOffset == mCurrentBlock.length) { + readBlock(); + } + if (mCurrentBlock == null) { + return 0; + } + return mCurrentBlock.length - mCurrentOffset; + } + + public void close() + throws IOException { + if (mFile != null) { + mFile.close(); + mFile = null; + } else if (mStream != null) { + mStream.close(); + mStream = null; + } + // Encourage garbage collection + mFileBuffer = null; + mCurrentBlock = null; + } + + public int read() + throws IOException { + return (available() > 0) ? 
mCurrentBlock[mCurrentOffset++] : -1; + } + + public int read(final byte[] buffer) + throws IOException { + return read(buffer, 0, buffer.length); + } + + public int read(final byte[] buffer, int offset, int length) + throws IOException { + int bytesRead = 0; + while (length > 0) { + final int available = available(); + if (available == 0) { + break; + } + final int copyLength = Math.min(length, available); + System.arraycopy(mCurrentBlock, mCurrentOffset, buffer, offset, copyLength); + mCurrentOffset += copyLength; + offset += copyLength; + length -= copyLength; + bytesRead += copyLength; + } + return bytesRead; + } + + public void seek(final long pos) + throws IOException { + // Note: pos is a special virtual file pointer, not an actual byte offset + if (mFile == null) { + throw new IOException("Cannot seek on stream based file"); + } + // Decode virtual file pointer + // Upper 48 bits is the byte offset into the compressed stream of a block. + // Lower 16 bits is the byte offset into the uncompressed stream inside the block. 
+ final long compressedOffset = pos >> 16; + final int uncompressedOffset = (int) (pos & 0xFFFF); + mFile.seek(compressedOffset); + mBlockAddress = compressedOffset; + mLastBlockLength = 0; + readBlock(); + if (uncompressedOffset >= available()) { + throw new IOException("Invalid file pointer: " + pos); + } + mCurrentOffset = uncompressedOffset; + } + + public long getFilePointer() { + return ((mBlockAddress << 16) | mCurrentOffset); + } + + public static boolean isValidFile(final InputStream stream) + throws IOException { + if (!stream.markSupported()) { + throw new RuntimeException("Cannot test non-buffered stream"); + } + stream.mark(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + final byte[] buffer = new byte[BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH]; + final int count = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + stream.reset(); + if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) { + return false; + } + return isValidBlockHeader(buffer); + } + + private static boolean isValidBlockHeader(final byte[] buffer) { + return (buffer[0] == BlockCompressedStreamConstants.GZIP_ID1 && + (buffer[1] & 0xFF) == BlockCompressedStreamConstants.GZIP_ID2 && + (buffer[3] & BlockCompressedStreamConstants.GZIP_FLG) != 0 && + buffer[10] == BlockCompressedStreamConstants.GZIP_XLEN && + buffer[12] == BlockCompressedStreamConstants.BGZF_ID1 && + buffer[13] == BlockCompressedStreamConstants.BGZF_ID2); + } + + private void readBlock() + throws IOException { + + if (mFileBuffer == null) { + mFileBuffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE]; + } + int count = readBytes(mFileBuffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); + if (count == 0) { + return; + } + if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) { + throw new IOException("Premature end of file"); + } + final int blockLength = unpackInt16(mFileBuffer, 
BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1; + if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > mFileBuffer.length) { + throw new IOException("Unexpected compressed block length: " + blockLength); + } + final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH; + count = readBytes(mFileBuffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, remaining); + if (count != remaining) { + throw new IOException("Premature end of file"); + } + inflateBlock(mFileBuffer, blockLength); + mCurrentOffset = 0; + mBlockAddress += mLastBlockLength; + mLastBlockLength = blockLength; + } + + private void inflateBlock(final byte[] compressedBlock, final int compressedLength) + throws IOException { + final int uncompressedLength = unpackInt32(compressedBlock, compressedLength-4); + byte[] buffer = mCurrentBlock; + mCurrentBlock = null; + if (buffer == null || buffer.length != uncompressedLength) { + buffer = new byte[uncompressedLength]; + } + final GZIPInputStream gzipStream = + new GZIPInputStream(new ByteArrayInputStream(compressedBlock, 0, compressedLength)); + try { + final int count = readBytes(gzipStream, buffer, 0, buffer.length); + if (count != buffer.length) { + throw new IOException("Block inflate failed"); + } + // Note: available() does not return zero here. + // The only safe way to test is to try to read a byte. 
+ if (gzipStream.read() != -1) { + throw new IOException("Block inflate failed"); + } + } finally { + gzipStream.close(); + } + mCurrentBlock = buffer; + } + + private int readBytes(final byte[] buffer, final int offset, final int length) + throws IOException { + if (mFile != null) { + return readBytes(mFile, buffer, offset, length); + } else if (mStream != null) { + return readBytes(mStream, buffer, offset, length); + } else { + return 0; + } + } + + private static int readBytes(final RandomAccessFile file, final byte[] buffer, final int offset, final int length) + throws IOException { + int bytesRead = 0; + while (bytesRead < length) { + final int count = file.read(buffer, offset + bytesRead, length - bytesRead); + if (count <= 0) { + break; + } + bytesRead += count; + } + return bytesRead; + } + + private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length) + throws IOException { + int bytesRead = 0; + while (bytesRead < length) { + final int count = stream.read(buffer, offset + bytesRead, length - bytesRead); + if (count <= 0) { + break; + } + bytesRead += count; + } + return bytesRead; + } + + private BufferedInputStream toBufferedStream(final InputStream stream) { + if (stream instanceof BufferedInputStream) { + return (BufferedInputStream) stream; + } else { + return new BufferedInputStream(stream); + } + } + + private int unpackInt16(final byte[] buffer, final int offset) { + return ((buffer[offset] & 0xFF) | + ((buffer[offset+1] & 0xFF) << 8)); + } + + private int unpackInt32(final byte[] buffer, final int offset) { + return ((buffer[offset] & 0xFF) | + ((buffer[offset+1] & 0xFF) << 8) | + ((buffer[offset+2] & 0xFF) << 16) | + ((buffer[offset+3] & 0xFF) << 24)); + } +} + + diff --git a/lib/edu/mit/broad/sam/util/BlockCompressedOutputStream.java b/lib/edu/mit/broad/sam/util/BlockCompressedOutputStream.java new file mode 100644 index 0000000000..11b775b88e --- /dev/null +++ 
b/lib/edu/mit/broad/sam/util/BlockCompressedOutputStream.java @@ -0,0 +1,177 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.sam.util; + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.util.zip.CRC32; +import java.util.zip.Deflater; + +/** + * Writer for a file that is a series of gzip blocks. The caller just treats it as an + * OutputStream, and under the covers a gzip block is written when the amount of uncompressed as-yet-unwritten + * bytes reaches a threshold. Note that the flush() method should not be called by client + * unless you know what you're doing, because it forces a gzip block to be written even if the + * number of buffered bytes has not reached threshold. close(), on the other hand, must be called + * when done writing in order to force the last gzip block to be written. 
+ */ +public class BlockCompressedOutputStream + extends OutputStream +{ + private final BinaryCodec codec; + private final byte[] uncompressedBuffer = new byte[BlockCompressedStreamConstants.DEFAULT_UNCOMPRESSED_BLOCK_SIZE]; + private int numUncompressedBytes = 0; + private final byte[] compressedBuffer = + new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE - + BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH]; + private final Deflater deflater = new Deflater(BlockCompressedStreamConstants.GZIP_CM_DEFLATE, true); + private final CRC32 crc32 = new CRC32(); + private final byte[] singleByteArray = new byte[1]; + + private int numberOfThrottleBacks = 0; + + public BlockCompressedOutputStream(final String filename) { + codec = new BinaryCodec(filename, true); + } + + public BlockCompressedOutputStream(final File file) { + codec = new BinaryCodec(file, true); + } + + @Override + public void write(final byte[] bytes) throws IOException { + write(bytes, 0, bytes.length); + } + + @Override + public void write(final byte[] bytes, int startIndex, int numBytes) throws IOException { + assert(numUncompressedBytes < uncompressedBuffer.length); + while (numBytes > 0) { + final int bytesToWrite = Math.min(uncompressedBuffer.length - numUncompressedBytes, numBytes); + System.arraycopy(bytes, startIndex, uncompressedBuffer, numUncompressedBytes, bytesToWrite); + numUncompressedBytes += bytesToWrite; + startIndex += bytesToWrite; + numBytes -= bytesToWrite; + assert(numBytes >= 0); + if (numUncompressedBytes == uncompressedBuffer.length) { + deflateBlock(); + } + } + } + + /** + * WARNING: flush() affects the output format, because it causes the current contents of uncompressedBuffer + * to be compressed and written, even if it isn't full. Unless you know what you're doing, don't call flush(). + * Instead, call close(), which will flush any unwritten data before closing the underlying stream. 
+ * + */ + @Override + public void flush() throws IOException { + while (numUncompressedBytes > 0) { + deflateBlock(); + } + codec.getOutputStream().flush(); + } + + /** + * close() must be called in order to flush any remaining buffered bytes. + * + */ + @Override + public void close() throws IOException { + flush(); + if (numberOfThrottleBacks > 0) { + System.err.println("In BlockCompressedOutputStream, had to throttle back " + numberOfThrottleBacks + + " times for file " + codec.getOutputFileName()); + } + codec.close(); + } + + public void write(final int i) throws IOException { + singleByteArray[0] = (byte)i; + write(singleByteArray); + } + + /** + * Attempt to write the data in uncompressedBuffer to the underlying file in a gzip block. + * If the entire uncompressedBuffer does not fit in the maximum allowed size, reduce the amount + * of data to be compressed, and slide the excess down in uncompressedBuffer so it can be picked + * up in the next deflate event. + * @return size of gzip block that was written. + */ + private int deflateBlock() { + if (numUncompressedBytes == 0) { + return 0; + } + int bytesToCompress = numUncompressedBytes; + while (true) { + // Compress the input + deflater.reset(); + deflater.setInput(uncompressedBuffer, 0, bytesToCompress); + deflater.finish(); + final int compressedSize = deflater.deflate(compressedBuffer, 0, compressedBuffer.length); + + // If it didn't all fit in compressedBuffer.length, reduce the amount to + // be compressed and try again. + if (deflater.getBytesRead() < bytesToCompress) { + bytesToCompress -= BlockCompressedStreamConstants.UNCOMPRESSED_THROTTLE_AMOUNT; + ++numberOfThrottleBacks; + assert(bytesToCompress > 0); + continue; + } + // Data compressed small enough, so write it out. 
+ crc32.reset(); + crc32.update(uncompressedBuffer, 0, bytesToCompress); + + final int totalBlockSize = writeGzipBlock(compressedSize, bytesToCompress, crc32.getValue()); + assert(bytesToCompress <= numUncompressedBytes); + + // Clear out from uncompressedBuffer the data that was written + if (bytesToCompress == numUncompressedBytes) { + numUncompressedBytes = 0; + } else { + System.arraycopy(uncompressedBuffer, bytesToCompress, uncompressedBuffer, 0, + numUncompressedBytes - bytesToCompress); + numUncompressedBytes -= bytesToCompress; + } + return totalBlockSize; + } + // unreachable + } + + /** + * Writes the entire gzip block, assuming the compressed data is stored in compressedBuffer + * @return size of gzip block that was written. + */ + private int writeGzipBlock(final int compressedSize, final int uncompressedSize, final long crc) { + // Init gzip header + codec.writeByte(BlockCompressedStreamConstants.GZIP_ID1); + codec.writeByte(BlockCompressedStreamConstants.GZIP_ID2); + codec.writeByte(BlockCompressedStreamConstants.GZIP_CM_DEFLATE); + codec.writeByte(BlockCompressedStreamConstants.GZIP_FLG); + codec.writeInt(0); // Modification time + codec.writeByte(BlockCompressedStreamConstants.GZIP_XFL); + codec.writeByte(BlockCompressedStreamConstants.GZIP_OS_UNKNOWN); + codec.writeShort(BlockCompressedStreamConstants.GZIP_XLEN); + codec.writeByte(BlockCompressedStreamConstants.BGZF_ID1); + codec.writeByte(BlockCompressedStreamConstants.BGZF_ID2); + codec.writeShort(BlockCompressedStreamConstants.BGZF_LEN); + final int totalBlockSize = compressedSize + BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH + + BlockCompressedStreamConstants.BLOCK_FOOTER_LENGTH; + + // I don't know why we store block size - 1, but that is what the spec says + codec.writeShort((short)(totalBlockSize - 1)); + codec.writeBytes(compressedBuffer, 0, compressedSize); + codec.writeInt((int)crc); + codec.writeInt(uncompressedSize); + return totalBlockSize; + } +} diff --git 
// ==== lib/edu/mit/broad/sam/util/BlockCompressedStreamConstants.java (package edu.mit.broad.sam.util) ====

/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/

/**
 * Constants shared by the BlockCompressedInputStream and BlockCompressedOutputStream classes.
 */
public class BlockCompressedStreamConstants {

    // -- Block layout --

    /**
     * Byte count of the gzip header preceding the deflated data.  Larger than the
     * bare gzip header because one optional subfield (BGZF) is always present here.
     */
    public static final int BLOCK_HEADER_LENGTH = 18;

    /** Offset within the block of the total-block-size field (stored as size - 1). */
    public static final int BLOCK_LENGTH_OFFSET = 16;

    /** Byte count of the footer (CRC32 + ISIZE) following the deflated data. */
    public static final int BLOCK_FOOTER_LENGTH = 8;

    /** Hard cap on a whole compressed block, header and footer included. */
    public static final int MAX_COMPRESSED_BLOCK_SIZE = 64 * 1024;

    /** A gzip block is emitted once this many uncompressed bytes have accumulated. */
    public static final int DEFAULT_UNCOMPRESSED_BLOCK_SIZE = 64 * 1024;

    /**
     * When a compressed block (with overhead) exceeds MAX_COMPRESSED_BLOCK_SIZE,
     * the input is shrunk by this many bytes and compression is retried.
     */
    public static final int UNCOMPRESSED_THROTTLE_AMOUNT = 1024;

    // -- gzip magic numbers --

    public static final byte GZIP_ID1 = 31;
    public static final int GZIP_ID2 = 139;

    /** FEXTRA flag: optional fields are present. */
    public static final int GZIP_FLG = 4;

    /** Extra flags (none). */
    public static final int GZIP_XFL = 0;

    /** Length of the extra subfield. */
    public static final short GZIP_XLEN = 6;

    /** The deflate compression method, customarily used by gzip. */
    public static final byte GZIP_CM_DEFLATE = 8;

    /** OS byte is irrelevant here (no line-terminator translation). */
    public static final int GZIP_OS_UNKNOWN = 255;

    // -- BGZF subfield --

    public static final byte BGZF_ID1 = 66;
    public static final byte BGZF_ID2 = 67;

    /** Subfield payload length in bytes. */
    public static final byte BGZF_LEN = 2;
}

// ==== new file: lib/edu/mit/broad/sam/util/CloseableIterator.java (mode 100755) ====
// /*
//  * The Broad Institute
//  * SOFTWARE COPYRIGHT NOTICE AGREEMENT
//  * This software and its documentation are copyright 2008 by the
//  * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
//  *
//  * This software is supplied without any warranty or guaranteed support whatsoever.
//  * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
//  * or functionality.
//  */
// package edu.mit.broad.sam.util;
//
// import java.util.Iterator;
//
// (interface javadoc continues on the next line: iterators with releasable resources;
//  close() must always be called, e.g. in a finally block; close() is idempotent...)
/**
 * This interface is used by iterators that use releasable resources during iteration.
 *
 * The consumer of a CloseableIterator should ensure that the close() method is always called,
 * for example by putting such a call in a finally block.  Two conventions should be followed
 * by all implementors of CloseableIterator:
 * 1) The close() method should be idempotent: calling close() twice should have no effect.
 * 2) When hasNext() returns false, the iterator implementation should automatically close itself.
 *    The latter makes it somewhat safer for consumers to use the for-loop syntax for iteration:
 *    for (Type obj : getCloseableIterator()) { ... }
 *
 * We do not inherit from java.io.Closeable because IOExceptions are a pain to deal with.
 *
 * BUGFIX(extraction): the generic type parameter was stripped by text mangling
 * ("CloseableIterator extends Iterator"); restored as CloseableIterator&lt;T&gt;.
 */
public interface CloseableIterator<T>
    extends Iterator<T> {

    /** Releases any resources held by this iterator.  Must be idempotent. */
    public void close();
}

// ==== new file: lib/edu/mit/broad/sam/util/CoordMath.java ====
// /*
// * The Broad Institute
// * SOFTWARE COPYRIGHT NOTICE AGREEMENT
// * This software and its documentation are copyright 2009 by the
// * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
// *
// * This software is supplied without any warranty or guaranteed support whatsoever. Neither
// * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
// ==== lib/edu/mit/broad/sam/util/CoordMath.java (package edu.mit.broad.sam.util) ====

/**
 * Static helpers for arithmetic on 1-based, fully-closed coordinate intervals:
 * an interval covering positions 5..7 has start 5, end 7 and length 3.
 */
public class CoordMath {

    /** Length of the closed interval [start, end]. */
    public static long getLength(final long start, final long end) {
        return (end - start) + 1;
    }

    /** Start coordinate of an interval of the given length ending at {@code end}. */
    public static long getStart(final long end, final long length) {
        return end - length + 1;
    }

    /** End coordinate of an interval of the given length starting at {@code start}. */
    public static long getEnd(final long start, final long length) {
        return start + length - 1;
    }

    /**
     * Offsets are meant to exclude the 'offset' number of bases.
     * ({@code length} is unused here, kept for signature symmetry with getEndFromOffset.)
     */
    public static long getStartFromOffset(final long offset, final long length) {
        return offset + 1;
    }

    /** End coordinate after excluding 'offset' bases from the end of a sequence of the given length. */
    public static long getEndFromOffset(final long offset, final long length) {
        return length - offset;
    }

    /** Length remaining after trimming startOffset bases from the front and endOffset from the back. */
    public static long getLengthFromOffsets(final long startOffset, final long endOffset, final long length) {
        return getLength(getStartFromOffset(startOffset, length),
                         getEndFromOffset(endOffset, length));
    }

    /**
     * Gets a sub-sequence from a java.lang.String (which is zero based) using one-based
     * sequence coordinates.  The base at the end coordinate will be included.
     *
     * @param sequence The String of base pairs
     * @param begin    The one-based start coordinate
     * @param end      The one-based end coordinate
     * @return The subsequence specified
     */
    public static String getSubsequence(final String sequence, final int begin, final int end) {
        return sequence.substring(begin - 1, end);
    }

    /**
     * Checks to see if the two sets of coordinates have any overlap.
     */
    public static boolean overlaps(final long start, final long end, final long start2, final long end2) {
        final boolean secondStartInsideFirst = start2 >= start && start2 <= end;
        final boolean secondEndInsideFirst = end2 >= start && end2 <= end;
        return secondStartInsideFirst
            || secondEndInsideFirst
            || encloses(start2, end2, start, end);
    }

    /** Returns true if the "inner" coords are totally enclosed by the "outer" coords. */
    public static boolean encloses(final long outerStart, final long outerEnd, final long innerStart, final long innerEnd) {
        return innerStart >= outerStart && innerEnd <= outerEnd;
    }

    /**
     * Determines the amount of overlap between two coordinate ranges.  Assumes that the two ranges
     * actually do overlap and therefore may produce strange results when they do not!
     */
    public static long getOverlap(final long start, final long end, final long start2, final long end2) {
        return getLength(Math.max(start, start2), Math.min(end, end2));
    }
}

// ==== new file: lib/edu/mit/broad/sam/util/LineReader.java ====
// /*
// * The Broad Institute
// * SOFTWARE COPYRIGHT NOTICE AGREEMENT
// * This software and its documentation are copyright 2009 by the
// * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
// *
// * This software is supplied without any warranty or guaranteed support whatsoever. Neither
// * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
// */
// package edu.mit.broad.sam.util;
//
// (interface javadoc continues on the next line: implementations that read lines
//  from a String, an ASCII file, or somewhere else.)
/**
 * Interface allows for implementations that read lines from a String, an ASCII file,
 * or somewhere else.
 */
public interface LineReader {

    /**
     * Read a line and remove the line terminator.
     */
    String readLine();

    /**
     * Read a line and optionally include the line terminator.
     *
     * @param includeTerminators if true, the terminator is kept on the returned line
     * @return the next line (termination behavior depends on the flag)
     */
    String readLine(boolean includeTerminators);

    /**
     * @return 1-based number of line most recently read
     */
    int getLineNumber();
}

// ==== new file: lib/edu/mit/broad/sam/util/NonDestructiveIterator.java ====
// /*
// * The Broad Institute
// * SOFTWARE COPYRIGHT NOTICE AGREEMENT
// * This software and its documentation are copyright 2008 by the
// * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
// *
// * This software is supplied without any warranty or guaranteed support whatsoever. Neither
// * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
// */
// package edu.mit.broad.sam.util;
//
// import java.util.Iterator;
//
// (class javadoc continues on the next line: "PeekIterator is a better class to use than this.")
/**
 * Holds the "current" element of an underlying iterator so it can be examined
 * repeatedly before advancing.  PeekIterator is a better class to use than this.
 *
 * @param <T> element type produced by the underlying iterator
 * @param <ITERATOR> concrete iterator type, exposed unchanged via getUnderlyingIterator()
 *
 * BUGFIX(extraction): both generic type parameters were stripped by text mangling
 * ("class NonDestructiveIterator&gt;"); reconstructed here.
 */
public class NonDestructiveIterator<T, ITERATOR extends Iterator<T>> {
    // Current element, or null once the underlying iterator is exhausted.
    private T current = null;
    private final ITERATOR underlyingIterator;

    /** Wraps the iterator and advances once so getCurrent() is immediately valid. */
    public NonDestructiveIterator(final ITERATOR underlyingIterator) {
        this.underlyingIterator = underlyingIterator;
        advance();
    }

    /** @return the element most recently advanced to, or null if exhausted */
    public T getCurrent() {
        return current;
    }

    public ITERATOR getUnderlyingIterator() {
        return underlyingIterator;
    }

    /**
     * Moves to the next element (or to null at end of input).
     * @return true if a new current element is available
     */
    public boolean advance() {
        if (this.underlyingIterator.hasNext()) {
            current = this.underlyingIterator.next();
        } else {
            current = null;
        }
        return hasCurrent();
    }

    /** NOTE: null elements in the underlying iterator are indistinguishable from end-of-input. */
    public boolean hasCurrent() {
        return getCurrent() != null;
    }
}

// ==== new file: lib/edu/mit/broad/sam/util/PeekIterator.java ====
// /*
// * The Broad Institute
// * SOFTWARE COPYRIGHT NOTICE AGREEMENT
// * This software and its documentation are copyright 2009 by the
// * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
// *
// * This software is supplied without any warranty or guaranteed support whatsoever. Neither
// * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
// ==== lib/edu/mit/broad/sam/util/PeekIterator.java (package edu.mit.broad.sam.util) ====

/**
 * Iterator wrapper that lets the caller examine the next element (peek) without consuming it.
 *
 * NOTE: null elements from the underlying iterator are not supported by peek()
 * (a peeked null is indistinguishable from "nothing peeked").
 *
 * BUGFIX(extraction): the generic type parameter was stripped by text mangling
 * ("class PeekIterator implements Iterator"); restored as PeekIterator&lt;T&gt;.
 */
public class PeekIterator<T> implements Iterator<T> {
    Iterator<T> underlyingIterator;
    // Element consumed from the underlying iterator but not yet returned by next().
    T peekedElement = null;

    public PeekIterator(final Iterator<T> underlyingIterator) {
        this.underlyingIterator = underlyingIterator;
    }

    public boolean hasNext() {
        return peekedElement != null || underlyingIterator.hasNext();
    }

    public T next() {
        if (peekedElement != null) {
            final T ret = peekedElement;
            peekedElement = null;
            return ret;
        }
        return underlyingIterator.next();
    }

    /**
     * Returns the next element without consuming it.
     * @throws java.util.NoSuchElementException if the underlying iterator is exhausted
     */
    public T peek() {
        if (peekedElement == null) {
            peekedElement = underlyingIterator.next();
        }
        return peekedElement;
    }

    /** Removal is not supported by this wrapper. */
    public void remove() {
        throw new UnsupportedOperationException();
    }

    public Iterator<T> getUnderlyingIterator() {
        return underlyingIterator;
    }
}

// ==== new file: lib/edu/mit/broad/sam/util/RuntimeEOFException.java ====
// /*
// * The Broad Institute
// * SOFTWARE COPYRIGHT NOTICE AGREEMENT
// * This software and its documentation are copyright 2009 by the
// * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
// *
// * This software is supplied without any warranty or guaranteed support whatsoever. Neither
// * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+*/ +package edu.mit.broad.sam.util; + +public class RuntimeEOFException extends RuntimeException { + public RuntimeEOFException() { + } + + public RuntimeEOFException(final String s) { + super(s); + } + + public RuntimeEOFException(final String s, final Throwable throwable) { + super(s, throwable); + } + + public RuntimeEOFException(final Throwable throwable) { + super(throwable); + } +} diff --git a/lib/edu/mit/broad/sam/util/RuntimeIOException.java b/lib/edu/mit/broad/sam/util/RuntimeIOException.java new file mode 100644 index 0000000000..b6e51bcfb7 --- /dev/null +++ b/lib/edu/mit/broad/sam/util/RuntimeIOException.java @@ -0,0 +1,27 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2008 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.sam.util; + +public class RuntimeIOException extends RuntimeException { + public RuntimeIOException() { + } + + public RuntimeIOException(final String s) { + super(s); + } + + public RuntimeIOException(final String s, final Throwable throwable) { + super(s, throwable); + } + + public RuntimeIOException(final Throwable throwable) { + super(throwable); + } +} diff --git a/lib/edu/mit/broad/sam/util/SortingCollection.java b/lib/edu/mit/broad/sam/util/SortingCollection.java new file mode 100644 index 0000000000..b501a08b3c --- /dev/null +++ b/lib/edu/mit/broad/sam/util/SortingCollection.java @@ -0,0 +1,369 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. 
Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.sam.util; + +import java.io.*; +import java.lang.reflect.Array; +import java.util.*; + +/** + * Collection to which many records can be added. After all records are added, the collection can be + * iterated, and the records will be returned in order defined by the comparator. Records may be spilled + * to a temporary directory if there are more records added than will fit in memory. As a result of this, + * the objects returned may not be identical to the objects added to the collection, but they should be + * equal as determined by the codec used to write them to disk and read them back. + */ +public class SortingCollection + implements Iterable { + + /** + * Client must implement this class, which defines the way in which records are written to and + * read from file. + */ + public interface Codec { + /** + * Where to write encoded output + * @param os + */ + void setOutputStream(OutputStream os); + + /** + * Where to read encoded input from + * @param is + */ + void setInputStream(InputStream is); + /** + * Write object to output stream + * @param val what to write + */ + void encode(T val); + + /** + * Read the next record from the input stream and convert into a java object. + * @return null if no more records. Should throw exception if EOF is encountered in the middle of + * a record. + */ + T decode(); + + /** + * Must return a cloned copy of the codec that can be used independently of + * the original instance. + */ + Codec clone(); + } + + /** + * Where files of sorted records go. 
+ */ + private final File tmpDir; + private final SortingCollection.Codec codec; + private final Comparator comparator; + private final int maxRecordsInRam; + private int numRecordsInRam = 0; + private T[] ramRecords; + private boolean iterationStarted = false; + private boolean cleanedUp = false; + + /** + * List of files in tmpDir containing sorted records + */ + private final List files = new ArrayList(); + + /** + * Prepare to accumulate records to be sorted + * @param componentType Class of the record to be sorted. Necessary because of Java generic lameness. + * @param codec For writing records to file and reading them back into RAM + * @param comparator Defines output sort order + * @param maxRecordsInRam + * @param tmpDir Where to write files of records that will not fit in RAM + */ + private SortingCollection(final Class componentType, final SortingCollection.Codec codec, + final Comparator comparator, final int maxRecordsInRam, final File tmpDir) { + if (maxRecordsInRam <= 0) { + throw new IllegalArgumentException("maxRecordsInRam must be > 0"); + } + this.tmpDir = tmpDir; + this.codec = codec; + this.comparator = comparator; + this.maxRecordsInRam = maxRecordsInRam; + this.ramRecords = (T[])Array.newInstance(componentType, maxRecordsInRam); + } + + public void add(final T rec) { + if (iterationStarted) { + throw new IllegalStateException("Cannot add after calling iterator()"); + } + if (numRecordsInRam == maxRecordsInRam) { + spillToDisk(); + } + ramRecords[numRecordsInRam++] = rec; + } + + /** + * Sort the records in memory, write them to a file, and clear the buffer of records in memory. 
+ */ + private void spillToDisk() { + try { + Arrays.sort(this.ramRecords, 0, this.numRecordsInRam, this.comparator); + final File f = File.createTempFile("sortingcollection.", ".tmp", this.tmpDir); + OutputStream os = null; + try { + os = new BufferedOutputStream(new FileOutputStream(f)); + this.codec.setOutputStream(os); + f.deleteOnExit(); + for (int i = 0; i < this.numRecordsInRam; ++i) { + this.codec.encode(ramRecords[i]); + // Facilitate GC + this.ramRecords[i] = null; + } + + os.flush(); + } + finally { + if (os != null) { + os.close(); + } + } + + this.numRecordsInRam = 0; + this.files.add(f); + + } + catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + /** + * Prepare to iterate through the records in order. This method may be called more than once, + * but add() may not be called after this method has been called. + */ + public CloseableIterator iterator() { + if (this.cleanedUp) { + throw new IllegalStateException("Cannot call iterator() after cleanup() was called."); + } + + this.iterationStarted = true; + if (this.files.isEmpty()) { + return new InMemoryIterator(); + } + + if (this.numRecordsInRam > 0) { + spillToDisk(); + } + + // Facilitate GC + this.ramRecords = null; + return new MergingIterator(); + } + + /** + * Delete any temporary files. After this method is called, iterator() may not be called. + */ + public void cleanup() { + this.iterationStarted = true; + this.cleanedUp = true; + + for (final File f : this.files) { + f.delete(); + } + } + + /** + * Syntactic sugar around the ctor, to save some typing of type parameters + * + * @param componentType Class of the record to be sorted. Necessary because of Java generic lameness. 
+ * @param codec For writing records to file and reading them back into RAM + * @param comparator Defines output sort order + * @param maxRecordsInRAM + * @param tmpDir Where to write files of records that will not fit in RAM + */ + public static SortingCollection newInstance(final Class componentType, + final SortingCollection.Codec codec, + final Comparator comparator, + final int maxRecordsInRAM, + final File tmpDir) { + return new SortingCollection(componentType, codec, comparator, maxRecordsInRAM, tmpDir); + + } + + public static SortingCollection newInstance(final Class componentType, + final SortingCollection.Codec codec, + final Comparator comparator, + final int maxRecordsInRAM) { + + final File tmpDir = new File(System.getProperty("java.io.tmpdir")); + return new SortingCollection(componentType, codec, comparator, maxRecordsInRAM, tmpDir); + } + + /** + * For iteration when number of records added is less than the threshold for spilling to disk. + */ + class InMemoryIterator implements CloseableIterator { + private int iterationIndex = 0; + + InMemoryIterator() { + Arrays.sort(SortingCollection.this.ramRecords, + 0, + SortingCollection.this.numRecordsInRam, + SortingCollection.this.comparator); + } + + public void close() { + // nothing to do + } + + public boolean hasNext() { + return this.iterationIndex < SortingCollection.this.numRecordsInRam; + } + + public T next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + return SortingCollection.this.ramRecords[iterationIndex++]; + } + + public void remove() { + throw new UnsupportedOperationException(); + } + } + + /** + * For iteration when spilling to disk has occurred. + * Each file is has records in sort order within the file. + * This iterator automatically closes when it iterates to the end, but if not iterating + * to the end it is a good idea to call close(). + * + * Algorithm: MergingIterator maintains a PriorityQueue of PeekFileRecordIterators. 
+ * Each PeekFileRecordIterator iterates through a file in which the records are sorted. + * The comparator for PeekFileRecordIterator used by the PriorityQueue peeks at the next record from + * the file, so the first element in the PriorityQueue is the file that has the next record to be emitted. + * In order to get the next record, the first PeekFileRecordIterator in the PriorityQueue is popped, + * the record is obtained from that iterator, and then if that iterator is not empty, it is pushed back into + * the PriorityQueue. Because it now has a different record as its next element, it may go into another + * location in the PriorityQueue + */ + class MergingIterator implements CloseableIterator { + private final PriorityQueue priorityQueue; + + MergingIterator() { + this.priorityQueue = new PriorityQueue(SortingCollection.this.files.size(), + new PeekFileRecordIteratorComparator()); + for (final File f : SortingCollection.this.files) { + final FileRecordIterator it = new FileRecordIterator(f); + if (it.hasNext()) { + this.priorityQueue.offer(new PeekFileRecordIterator(it)); + } + else { + it.close(); + } + } + } + + public boolean hasNext() { + return !this.priorityQueue.isEmpty(); + } + + public T next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + + final PeekFileRecordIterator fileIterator = priorityQueue.poll(); + final T ret = fileIterator.next(); + if (fileIterator.hasNext()) { + this.priorityQueue.offer(fileIterator); + } + else { + ((CloseableIterator)fileIterator.getUnderlyingIterator()).close(); + } + + return ret; + } + + public void remove() { + throw new UnsupportedOperationException(); + } + + public void close() { + while (!this.priorityQueue.isEmpty()) { + final PeekFileRecordIterator it = this.priorityQueue.poll(); + ((CloseableIterator)it.getUnderlyingIterator()).close(); + } + } + } + + /** + * Read a file of records in format defined by the codec + */ + class FileRecordIterator implements CloseableIterator { + private 
final File file; + private final FileInputStream is; + private final Codec codec; + private T currentRecord = null; + + FileRecordIterator(final File file) { + this.file = file; + try { + this.is = new FileInputStream(file); + this.codec = SortingCollection.this.codec.clone(); + this.codec.setInputStream(this.is); + advance(); + } + catch (FileNotFoundException e) { + throw new RuntimeIOException(e); + } + } + + public boolean hasNext() { + return this.currentRecord != null; + } + + public T next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + final T ret = this.currentRecord; + advance(); + return ret; + } + + public void remove() { + throw new UnsupportedOperationException(); + } + + private void advance() { + this.currentRecord = this.codec.decode(); + } + + public void close() { + try { this.is.close(); } + catch (IOException e) { } + } + } + + + /** + * Just a typedef + */ + class PeekFileRecordIterator extends PeekIterator { + PeekFileRecordIterator(final Iterator underlyingIterator) { + super(underlyingIterator); + } + } + + class PeekFileRecordIteratorComparator implements Comparator { + + public int compare(final PeekFileRecordIterator peekFileRecordIterator, final PeekFileRecordIterator peekFileRecordIterator1) { + return comparator.compare(peekFileRecordIterator.peek(), peekFileRecordIterator1.peek()); + } + } +} diff --git a/lib/edu/mit/broad/sam/util/StringLineReader.java b/lib/edu/mit/broad/sam/util/StringLineReader.java new file mode 100644 index 0000000000..8bcaf54472 --- /dev/null +++ b/lib/edu/mit/broad/sam/util/StringLineReader.java @@ -0,0 +1,65 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. 
Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.sam.util; + +/** + * Implementation of LineReader that gets its input from a String. No charset conversion + * is necessary because the String is in unicode. Handles CR, LF or CRLF line termination, + * but if asked to return the line terminator, it always comes back as LF. + */ +public class StringLineReader implements LineReader { + + private final String theString; + private int curPos = 0; + private int lineNumber = 0; + + public StringLineReader(final String s) { + // Simplify later processing by replacing crlf with just lf, and replacing solo cr with lf + this.theString = s.replaceAll("\r\n", "\n").replaceAll("\r", "\n"); + } + + /** + * Read a line and remove the line terminator + */ + public String readLine() { + return readLine(false); + } + + /** + * Read a line and optionally include the line terminator + * + * @param includeTerminators + * @return + */ + public String readLine(final boolean includeTerminators) { + if (curPos == theString.length()) { + return null; + } + final int nextLfIndex = theString.indexOf('\n', curPos); + if (nextLfIndex == -1) { + final int startPos = curPos; + curPos = theString.length(); + ++lineNumber; + return theString.substring(startPos); + } + final int startPos = curPos; + final int endPos = nextLfIndex + (includeTerminators? 
1: 0); + curPos = nextLfIndex + 1; + ++lineNumber; + return theString.substring(startPos, endPos); + } + + /** + * @return 1-based number of line most recently read + */ + public int getLineNumber() { + return lineNumber; + } +} diff --git a/lib/edu/mit/broad/sam/util/StringUtil.java b/lib/edu/mit/broad/sam/util/StringUtil.java new file mode 100644 index 0000000000..b4ab475223 --- /dev/null +++ b/lib/edu/mit/broad/sam/util/StringUtil.java @@ -0,0 +1,136 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ +package edu.mit.broad.sam.util; + +public class StringUtil { + /** + * + * @param separator String to interject between each string in strings arg + * @param strings List of strings to be joined. + * @return String that concatenates each item of strings arg, with separator btw each of them. + */ + public static String join(final String separator, final String[] strings) { + if (strings.length == 0) { + return ""; + } + final StringBuilder ret = new StringBuilder(strings[0]); + for (int i = 1; i < strings.length; ++i) { + ret.append(separator); + ret.append(strings[i]); + } + return ret.toString(); + } + + /** + * Split the string into tokesn separated by the given delimiter. Profiling has + * revealed that the standard string.split() method typically takes > 1/2 + * the total time when used for parsing ascii files. 
+ * + * @param aString the string to split + * @param tokens an array to hold the parsed tokens + * @param delim character that delimits tokens + * @return the number of tokens parsed + */ + public static int split(final String aString, final String[] tokens, final char delim) { + + final int maxTokens = tokens.length; + int nTokens = 0; + int start = 0; + int end = aString.indexOf(delim); + if(end < 0) { + tokens[nTokens++] = aString; + return nTokens; + } + while ((end > 0) && (nTokens < maxTokens)) + { + tokens[nTokens++] = aString.substring(start, end); + start = end + 1; + end = aString.indexOf(delim, start); + + } + // Add the trailing string, if there is room and if it is not empty. + if (nTokens < maxTokens) + { + final String trailingString = aString.substring(start); + if (trailingString.length() > 0) + { + tokens[nTokens++] = trailingString; + } + } + return nTokens; + } + + //////////////////////////////////////////////////////////////////// + // The following methods all convert btw bytes and Strings, without + // using the Java character set mechanism. 
+ //////////////////////////////////////////////////////////////////// + + public static String bytesToString(final byte[] data) { + if (data == null) { + return null; + } + return bytesToString(data, 0, data.length); + } + + @SuppressWarnings("deprecation") + public static String bytesToString(final byte[] buffer, final int offset, final int length) { +/* + The non-deprecated way, that requires allocating char[] + final char[] charBuffer = new char[length]; + for (int i = 0; i < length; ++i) { + charBuffer[i] = (char)buffer[i+offset]; + } + return new String(charBuffer); +*/ + return new String(buffer, 0, offset, length); + } + + @SuppressWarnings("deprecation") + public static byte[] stringToBytes(final String s) { +/* + The non-deprecated way, that requires allocating char[] + final byte[] byteBuffer = new byte[s.length()]; + final char[] charBuffer = s.toCharArray(); + for (int i = 0; i < charBuffer.length; ++i) { + byteBuffer[i] = (byte)(charBuffer[i] & 0xff); + } + return byteBuffer; +*/ + final byte[] byteBuffer = new byte[s.length()]; + s.getBytes(0, byteBuffer.length, byteBuffer, 0); + return byteBuffer; + } + + // This method might more appropriately live in BinaryCodec, but all the byte <=> char conversion + // should be in the same place. + public static String readNullTerminatedString(final BinaryCodec binaryCodec) { + final StringBuilder ret = new StringBuilder(); + for (byte b = binaryCodec.readByte(); b != 0; b = binaryCodec.readByte()) { + ret.append((char)(b & 0xff)); + } + return ret.toString(); + } + + /** + * Convert chars to bytes merely by casting + * @param chars input chars + * @param charOffset where to start converting from chars array + * @param length how many chars to convert + * @param bytes where to put the converted output + * @param byteOffset where to start writing the converted output. 
+ */ + public static void charsToBytes(final char[] chars, final int charOffset, final int length, + final byte[] bytes, final int byteOffset) { + for (int i = 0; i < length; ++i) { + bytes[byteOffset + i] = (byte)chars[charOffset + i]; + } + } + +} diff --git a/src/edu/mit/broad/sting/ValidateSAM.java b/src/edu/mit/broad/sting/ValidateSAM.java new file mode 100755 index 0000000000..237b838a6d --- /dev/null +++ b/src/edu/mit/broad/sting/ValidateSAM.java @@ -0,0 +1,105 @@ +package edu.mit.broad.sting; + +import edu.mit.broad.sam.*; +import edu.mit.broad.sam.SAMFileReader.ValidationStringency; +import edu.mit.broad.sam.util.CloseableIterator; +import edu.mit.broad.picard.cmdline.CommandLineProgram; +import edu.mit.broad.picard.cmdline.Usage; +import edu.mit.broad.picard.cmdline.Option; + +import java.io.*; + +public class ValidateSAM extends CommandLineProgram { + // Usage and parameters + @Usage(programVersion="0.1") public String USAGE = "SAM Validator\n"; + @Option(shortName="I", doc="SAM or BAM file for validation") public File INPUT_FILE; + @Option(shortName="M", doc="Maximum number of errors to detect before exiting", optional=true) public String MAX_ERRORS_ARG = "-1"; + @Option(shortName="S", doc="How strict should we be with validation", optional=true) public String STRICTNESS_ARG = "strict"; + + private long startTime = -1; + + /** Required main method implementation. 
*/ + public static void main(String[] argv) { + System.exit(new ValidateSAM().instanceMain(argv)); + } + + public void printProgress( int nRecords, int nErrors ) { + final double elapsed = (System.currentTimeMillis() - startTime) / 1000.0; + final double secsPer1MReads = (elapsed * 1000000.0) / nRecords; + System.out.printf("Read %d records containing %d errors in %.2f secs (%.2f secs per 1M reads)%n", nRecords, nErrors, elapsed, secsPer1MReads); + } + + protected int doWork() { + int MAX_ERRORS = -1; // Don't bail ever + if ( MAX_ERRORS_ARG != null ) { + MAX_ERRORS = Integer.parseInt(MAX_ERRORS_ARG); + } + + // Start the timer + startTime = System.currentTimeMillis(); + + // Initialize the sam reader + CloseableIterator iter = null; + try { + final SAMFileReader samReader = getSamReader(INPUT_FILE); + iter = samReader.iterator(); + } catch (Exception ioe) { + System.out.println("[VALIDATION FAILURE IN HEADER]: " + ioe); + ioe.printStackTrace(); + return 1; + } + + int nRecords = 0; + int nErrors = 0; + while ( iter.hasNext() ) { + nRecords++; + try { + final SAMRecord ri = iter.next(); + } catch (Exception ioe) { + nErrors++; + System.out.println("[VALIDATION FAILURE IN RECORD]: " + ioe); + ioe.printStackTrace(); + } + + if ( MAX_ERRORS > -1 && nErrors >= MAX_ERRORS ) { + System.out.println("Maximum number of errors encountered " + nErrors); + break; + } + + if ( nRecords % 100000 == 0 ) { + printProgress( nRecords, nErrors ); + } + } + + printProgress( nRecords, nErrors ); + return 0; + } + + private static void usage() { + System.err.println("USAGE: edu.mit.broad.sting.ValidateSAM "); + } + + private SAMFileReader getSamReader(final File samFile) { + + ValidationStringency strictness = SAMFileReader.ValidationStringency.STRICT; + if ( STRICTNESS_ARG == null ) { + strictness = SAMFileReader.ValidationStringency.STRICT; + } + else if ( STRICTNESS_ARG.toLowerCase().equals("lenient") ) { + strictness = SAMFileReader.ValidationStringency.LENIENT; + } + else if ( 
STRICTNESS_ARG.toLowerCase().equals("silent") ) { + strictness = SAMFileReader.ValidationStringency.SILENT; + } + else { + strictness = SAMFileReader.ValidationStringency.STRICT; + } + + System.err.println("Strictness is " + strictness); + final SAMFileReader samReader = new SAMFileReader(samFile, true); + samReader.setValidationStringency(strictness); + + return samReader; + } + +} \ No newline at end of file diff --git a/src/edu/mit/broad/sting/atk/AnalysisTK.java b/src/edu/mit/broad/sting/atk/AnalysisTK.java new file mode 100644 index 0000000000..4456d8eb79 --- /dev/null +++ b/src/edu/mit/broad/sting/atk/AnalysisTK.java @@ -0,0 +1,74 @@ +package edu.mit.broad.sting.atk; + +import edu.mit.broad.sam.SAMFileReader.ValidationStringency; +import edu.mit.broad.picard.cmdline.CommandLineProgram; +import edu.mit.broad.picard.cmdline.Usage; +import edu.mit.broad.picard.cmdline.Option; + +import edu.mit.broad.sting.atk.modules.*; + +import java.io.*; +import java.util.HashMap; + +public class AnalysisTK extends CommandLineProgram { + // Usage and parameters + @Usage(programVersion="0.1") public String USAGE = "SAM Validator\n"; + @Option(shortName="I", doc="SAM or BAM file for validation") public File INPUT_FILE; + @Option(shortName="M", doc="Maximum number of reads to process before exiting", optional=true) public String MAX_READS_ARG = "-1"; + @Option(shortName="S", doc="How strict should we be with validation", optional=true) public String STRICTNESS_ARG = "strict"; + @Option(shortName="R", doc="Reference sequence file", optional=true) public File REF_FILE_ARG = null; + @Option(shortName="B", doc="Debugging output", optional=true) public String DEBUGGING_STR = null; + @Option(shortName="L", doc="Genome region to operation on: from chr:start-end", optional=true) public String REGION_STR = null; + @Option(shortName="T", doc="Type of analysis to run") public String AnalysisName = null; + + public static HashMap MODULES = new HashMap(); + public static void addModule(final 
String name, final Object walker) { + System.out.printf("* Adding module %s%n", name); + MODULES.put(name, walker); + } + + static { + addModule("EmptyLocusWalker", new EmptyLocusWalker()); + addModule("PileupWalker", new PileupWalker()); + } + + private TraversalEngine engine = null; + private int nSkippedIndels = 0; + + public boolean DEBUGGING = false; + + /** Required main method implementation. */ + public static void main(String[] argv) { + System.exit(new AnalysisTK().instanceMain(argv)); + } + + protected int doWork() { + this.engine = new TraversalEngine(INPUT_FILE, REF_FILE_ARG); + + ValidationStringency strictness = ValidationStringency.STRICT; + if ( STRICTNESS_ARG == null ) { + strictness = ValidationStringency.STRICT; + } + else if ( STRICTNESS_ARG.toLowerCase().equals("lenient") ) { + strictness = ValidationStringency.LENIENT; + } + else if ( STRICTNESS_ARG.toLowerCase().equals("silent") ) { + strictness = ValidationStringency.SILENT; + } + else { + strictness = ValidationStringency.STRICT; + } + System.err.println("Strictness is " + strictness); + engine.setStrictness(strictness); + + engine.setDebugging(! ( DEBUGGING_STR == null || DEBUGGING_STR.toLowerCase().equals("true"))); + engine.setMaxReads(Integer.parseInt(MAX_READS_ARG)); + + //LocusWalker walker = new EmptyLocusWalker(); + LocusWalker walker = (LocusWalker)MODULES.get(AnalysisName); + engine.initialize(); + engine.traverseByLoci(walker); + + return 0; + } +} \ No newline at end of file diff --git a/src/edu/mit/broad/sting/atk/LocusContext.java b/src/edu/mit/broad/sting/atk/LocusContext.java new file mode 100755 index 0000000000..12c246365b --- /dev/null +++ b/src/edu/mit/broad/sting/atk/LocusContext.java @@ -0,0 +1,28 @@ +package edu.mit.broad.sting.atk; + +import edu.mit.broad.sam.SAMRecord; + +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 22, 2009 + * Time: 3:01:34 PM + * To change this template use File | Settings | File Templates. 
import java.util.List;

/**
 * Placeholder describing the reference/read context at a single locus. All
 * accessors are stubs at this point; the class exists so walker interfaces can
 * be written against it.
 *
 * TODO(review): parameterize the List returns once the element types are
 * settled (reads are presumably SAMRecord, offsets presumably Integer, by
 * analogy with LocusIterator -- confirm before changing).
 */
public class LocusContext {
    public LocusContext() { }

    /** @return size of the current context window; stub, currently always 1. */
    public int getLength() { return 1; }

    /** @return reference base at the current (relative) position; stub, currently 0. */
    public byte getReferenceBase() { return 0; }

    /** @return all reads within this context; stub, currently null. */
    public List getReads() { return null; }

    /** @return equivalent positions within the reads at this locus; stub, currently null. */
    public List getOffsets() { return null; }
}
constructors and other basic operations + // + // ----------------------------------------------------------------------------------------------------------------- + public LocusIterator(final CloseableIterator samIterator) { + this.it = new PushbackIterator(samIterator); + } + + public Iterator iterator() { + return this; + } + + public void close() { + //this.it.close(); + } + + public boolean hasNext() { + return it.hasNext(); + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // next() routine and associated collection operations + // + // ----------------------------------------------------------------------------------------------------------------- + public LocusIterator next() { + position += 1; + + if ( position != -1 ) { + cleanReads(); + expandReads(); + } + + if ( reads.isEmpty() ) { + // the window is empty, we need to jump to the first pos of the first read in the stream + SAMRecord read = it.next(); + pushRead(read); + contig = read.getReferenceName(); + position = read.getAlignmentStart() - 1; + return next(); + } + else { + // at this point, window contains all reads covering the pos, we need to return them + // and the offsets into each read for this loci + calcOffsetsOfWindow(position); + return this; + } + } + + private void pushRead(SAMRecord read) { + //System.out.printf(" -> Adding read %s %d-%d flags %s%n", read.getReadName(), read.getAlignmentStart(), read.getAlignmentEnd(), Utils.readFlagsAsString(read)); + reads.add(read); + } + + class KeepReadPFunc implements Predicate { + public boolean apply(SAMRecord read) { + return position >= read.getAlignmentStart() && + position < read.getAlignmentEnd() && + read.getReferenceName().equals(contig); // should be index for efficiency + } + } + Predicate KeepReadP = new LocusIterator.KeepReadPFunc(); + + private void calcOffsetsOfWindow(final int position) { + offsets.clear(); + for ( SAMRecord read : reads ) { +// def 
calcOffset( read ): +// offset = self.pos - read.start +// return offset +// +// offsets = map(calcOffset, self.window) + final int offset = position - read.getAlignmentStart(); + assert(offset < read.getReadLength() ); + offsets.add(offset); + //System.out.printf("offsets [%d] %s%n", read.getAlignmentStart(), offsets); + } + } + + private void cleanReads() { + // def keepReadP( read ): + // return read.chr == chr and pos >= read.start and pos <= read.end + // self.window = filter( keepReadP, self.window ) + reads = Utils.filter(KeepReadP, reads); + } + + private void expandReads() { +// for read in self.rs: +// #print 'read', read, pos +// if read.chr == chr and read.start <= pos and read.end >= pos: +// self.pushRead(read) +// else: +// self.rs.unget( read ) +// #self.rs = chain( [read], self.rs ) +// break + while ( it.hasNext() ) { + SAMRecord read = it.next(); + if ( KeepReadP.apply( read ) ) { + pushRead(read); + } + else { + it.pushback(read); + break; + } + } + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } +} diff --git a/src/edu/mit/broad/sting/atk/LocusWalker.java b/src/edu/mit/broad/sting/atk/LocusWalker.java new file mode 100755 index 0000000000..e65891ba7b --- /dev/null +++ b/src/edu/mit/broad/sting/atk/LocusWalker.java @@ -0,0 +1,27 @@ +package edu.mit.broad.sting.atk; + +import edu.mit.broad.sting.atk.LocusIterator; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 22, 2009 + * Time: 2:52:28 PM + * To change this template use File | Settings | File Templates. + */ +public interface LocusWalker { + void initialize(); + public String walkerType(); + + // Do we actually want to operate on the context? 
+ boolean filter(char ref, LocusIterator context); + + // Map over the edu.mit.broad.sting.atk.LocusContext + MapType map(char ref, LocusIterator context); + + // Given result of map function + ReduceType reduceInit(); + ReduceType reduce(MapType value, ReduceType sum); + + void onTraveralDone(); +} \ No newline at end of file diff --git a/src/edu/mit/broad/sting/atk/ReadWalker.java b/src/edu/mit/broad/sting/atk/ReadWalker.java new file mode 100755 index 0000000000..6f6fa915b7 --- /dev/null +++ b/src/edu/mit/broad/sting/atk/ReadWalker.java @@ -0,0 +1,28 @@ +package edu.mit.broad.sting.atk; + +import edu.mit.broad.sam.SAMRecord; +import edu.mit.broad.sting.atk.LocusContext; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 22, 2009 + * Time: 2:52:28 PM + * To change this template use File | Settings | File Templates. + */ +public interface ReadWalker { + void initialize(); + public String walkerType(); + + // Do we actually want to operate on the context? + boolean filter(LocusContext context, SAMRecord read); + + // Map over the edu.mit.broad.sting.atk.LocusContext + MapType map(LocusContext context, SAMRecord read); + + // Given result of map function + ReduceType reduceInit(); + ReduceType reduce(MapType value, ReduceType sum); + + void onTraveralDone(); +} \ No newline at end of file diff --git a/src/edu/mit/broad/sting/atk/TraversalEngine.java b/src/edu/mit/broad/sting/atk/TraversalEngine.java new file mode 100755 index 0000000000..4c1d604102 --- /dev/null +++ b/src/edu/mit/broad/sting/atk/TraversalEngine.java @@ -0,0 +1,255 @@ +package edu.mit.broad.sting.atk; + +import edu.mit.broad.sam.*; +import edu.mit.broad.sam.SAMFileReader.ValidationStringency; +import edu.mit.broad.sam.util.CloseableIterator; +import edu.mit.broad.sam.util.RuntimeIOException; +import edu.mit.broad.picard.filter.SamRecordFilter; +import edu.mit.broad.picard.filter.FilteringIterator; +import edu.mit.broad.picard.reference.ReferenceSequenceFile; +import 
edu.mit.broad.picard.reference.ReferenceSequenceFileFactory; +import edu.mit.broad.sting.utils.ReferenceIterator; + +import java.io.*; + +public class TraversalEngine { + // Usage and parameters + private File readsFile = null; + private File refFileName = null; + private String regionStr = null; + private String traversalType = null; + private ValidationStringency strictness = ValidationStringency.STRICT; + + private long startTime = -1; + private long maxReads = -1; + private long nRecords = 0; + private SAMFileReader samReader = null; + private ReferenceSequenceFile refFile = null; + private ReferenceIterator refIter = null; + private SAMFileReader readStream; + + private int nReads = 0; + private int nSkippedReads = 0; + private int nUnmappedReads = 0; + private int nNotPrimary = 0; + private int nBadAlignments = 0; + private int nSkippedIndels = 0; + + public boolean DEBUGGING = false; + + // -------------------------------------------------------------------------------------------------------------- + // + // Setting up the engine + // + // -------------------------------------------------------------------------------------------------------------- + public TraversalEngine(File reads, File ref) { + readsFile = reads; + refFileName = ref; + } + + public void setRegion(final String reg) { regionStr = regionStr; } + public void setTraversalType(final String type) { traversalType = type; } + public void setStrictness( final ValidationStringency s ) { strictness = s; } + public void setMaxReads( final int maxReads ) { this.maxReads = maxReads; } + public void setDebugging( final boolean d ) { DEBUGGING = d; } + + // -------------------------------------------------------------------------------------------------------------- + // + // functions for dealing with the reference sequence + // + // -------------------------------------------------------------------------------------------------------------- + public void printProgress(final String type) { 
printProgress( false, type ); } + + public void printProgress( boolean mustPrint, final String type ) { + final long nRecords = this.nRecords; + + if ( mustPrint || nRecords % 100000 == 0 ) { + final double elapsed = (System.currentTimeMillis() - startTime) / 1000.0; + final double secsPer1MReads = (elapsed * 1000000.0) / nRecords; + System.out.printf("Traversed %d %s %.2f secs (%.2f secs per 1M %s)%n", nRecords, type, elapsed, secsPer1MReads, type); + } + } + + // -------------------------------------------------------------------------------------------------------------- + // + // functions for dealing with the reference sequence + // + // -------------------------------------------------------------------------------------------------------------- + + protected void loadReference() { + if ( refFileName!= null ) { + this.refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFileName); + this.refIter = new ReferenceIterator(this.refFile); + } + } + + protected void testReference() { + String line = ""; + refIter.seekForward("chr20", 79); + for ( int i = 0; i < this.maxReads && refIter.hasNext(); i++ ) { + final ReferenceIterator refSite = refIter.next(); + final char refBase = refSite.getBaseAsChar(); + line += refBase; + if ( (i + 1) % 80 == 0 ) { + System.out.println(line); + line = ""; + } + //System.out.printf(" Reference: %s:%d %c%n", refSite.getCurrentContig().getName(), refSite.getPosition(), refBase); + } + System.out.println(line); + System.exit(1); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // traversal functions + // + // -------------------------------------------------------------------------------------------------------------- + protected int initialize() { + startTime = System.currentTimeMillis(); + loadReference(); + //testReference(); + //loadReference(); + readStream = initializeReadStreams(); + return 0; + } + + class locusStreamFilterFunc 
implements SamRecordFilter { + public boolean filterOut(SAMRecord rec) { + boolean result = false; + String why = ""; + if ( rec.getReadUnmappedFlag() ) { + nUnmappedReads++; + result = true; + why = "Unmapped"; + } + else if ( rec.getNotPrimaryAlignmentFlag() ) { + nNotPrimary++; + result = true; + why = "Not Primary"; + } + else if ( rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START ) { + nBadAlignments++; + result = true; + why = "No alignment start"; + } + else if ( rec.getCigar().numCigarElements() > 1 ) { + // FIXME -- deal with indels correctly! + nSkippedIndels++; + result = true; + why = "Skipping indel: " + rec.getCigarString(); + } + else { + result = false; + } + + if ( result ) { + nSkippedReads++; + //System.out.printf(" [filter] %s => %b %s%n", rec.getReadName(), result, why); + } + else { + nReads++; + } + return result; + } + } + + protected int traverseByLoci(LocusWalker walker) { + walker.initialize(); + FilteringIterator filterIter = new FilteringIterator(readStream.iterator(), new locusStreamFilterFunc()); + CloseableIterator iter = new LocusIterator(filterIter); + + T sum = walker.reduceInit(); + while ( iter.hasNext() ) { + this.nRecords++; + + // actually get the read and hand it to the walker + final LocusIterator locus = iter.next(); + final ReferenceIterator refSite = refIter.seekForward(locus.getContig(), locus.getPosition()); + final char refBase = refSite.getBaseAsChar(); + + if ( DEBUGGING ) + System.out.printf(" Reference: %s:%d %c%n", refSite.getCurrentContig().getName(), refSite.getPosition(), refBase); + + final boolean keepMeP = walker.filter(refBase, locus); + if ( keepMeP ) { + M x = walker.map(refBase, locus); + sum = walker.reduce(x, sum); + } + + if ( this.maxReads > 0 && this.nRecords > this.maxReads ) { + System.out.println("Maximum number of reads encountered, terminating traversal " + this.nRecords); + break; + } + + printProgress("loci"); + } + + printProgress( true, "loci" ); + System.out.println("Traversal 
reduce result is " + sum); + System.out.printf("Traversal skipped %d reads out of %d total (%.2f%%)%n", nSkippedReads, nReads, (nSkippedReads * 100.0) / nReads); + System.out.printf(" -> %d unmapped reads%n", nUnmappedReads ); + System.out.printf(" -> %d non-primary reads%n", nNotPrimary ); + System.out.printf(" -> %d reads with bad alignments%n", nBadAlignments ); + System.out.printf(" -> %d reads with indels%n", nSkippedIndels ); + walker.onTraveralDone(); + return 0; + } + + protected int traverseByRead(ReadWalker walker) { + walker.initialize(); + CloseableIterator iter = readStream.iterator(); + R sum = walker.reduceInit(); + while ( iter.hasNext() ) { + this.nRecords++; + + // actually get the read and hand it to the walker + final SAMRecord read = iter.next(); + final boolean keepMeP = walker.filter(null, read); + if ( keepMeP ) { + M x = walker.map(null, read); + sum = walker.reduce(x, sum); + } + + if ( this.maxReads > 0 && this.nRecords > this.maxReads ) { + System.out.println("Maximum number of reads encountered, terminating traversal " + this.nRecords); + break; + } + + printProgress("reads"); + } + + printProgress( true, "reads" ); + System.out.println("Traversal reduce result is " + sum); + walker.onTraveralDone(); + return 0; + } + + // + // + // Prepare the input streams + // + // + private SAMFileReader initializeReadStreams() { + SAMFileReader reader = getSamReader(readsFile); + return reader; + } + + private SAMFileReader getSamReader(final File samFile) { + try { + final InputStream samInputStream = new FileInputStream(samFile); + final InputStream bufferedStream= new BufferedInputStream(samInputStream); + //final InputStream bufferedStream= new BufferedInputStream(samInputStream, 10000000); + final SAMFileReader samReader = new SAMFileReader(bufferedStream, true); + samReader.setValidationStringency(strictness); + + final SAMFileHeader header = samReader.getFileHeader(); + System.err.println("Sort order is: " + header.getSortOrder()); + + 
return samReader; + } + catch (IOException e) { + throw new RuntimeIOException(e); + } + } +} \ No newline at end of file diff --git a/src/edu/mit/broad/sting/atk/modules/EmptyLocusWalker.java b/src/edu/mit/broad/sting/atk/modules/EmptyLocusWalker.java new file mode 100755 index 0000000000..6f7c248361 --- /dev/null +++ b/src/edu/mit/broad/sting/atk/modules/EmptyLocusWalker.java @@ -0,0 +1,38 @@ +package edu.mit.broad.sting.atk.modules; + +import edu.mit.broad.sting.atk.LocusWalker; +import edu.mit.broad.sting.atk.LocusIterator; +import edu.mit.broad.sam.SAMRecord; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 22, 2009 + * Time: 3:22:14 PM + * To change this template use File | Settings | File Templates. + */ +public class EmptyLocusWalker implements LocusWalker { + public void initialize() { + } + + public String walkerType() { return "ByLocus"; } + + // Do we actually want to operate on the context? + public boolean filter(char ref, LocusIterator context) { + return true; // We are keeping all the reads + } + + // Map over the edu.mit.broad.sting.atk.LocusContext + public Integer map(char ref, LocusIterator context) { + return 1; + } + + // Given result of map function + public Integer reduceInit() { return 0; } + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } + + public void onTraveralDone() { + } +} \ No newline at end of file diff --git a/src/edu/mit/broad/sting/atk/modules/PileupWalker.java b/src/edu/mit/broad/sting/atk/modules/PileupWalker.java new file mode 100644 index 0000000000..eef13ee279 --- /dev/null +++ b/src/edu/mit/broad/sting/atk/modules/PileupWalker.java @@ -0,0 +1,69 @@ +package edu.mit.broad.sting.atk.modules; + +import edu.mit.broad.sting.atk.LocusWalker; +import edu.mit.broad.sting.atk.LocusIterator; +import edu.mit.broad.sam.SAMRecord; + +import java.util.List; + +/** + * Created by IntelliJ IDEA. 
+ * User: mdepristo + * Date: Feb 22, 2009 + * Time: 3:22:14 PM + * To change this template use File | Settings | File Templates. + */ +public class PileupWalker implements LocusWalker { + public void initialize() { + } + + public String walkerType() { return "ByLocus"; } + + // Do we actually want to operate on the context? + public boolean filter(char ref, LocusIterator context) { + return true; // We are keeping all the reads + } + + // Map over the edu.mit.broad.sting.atk.LocusContext + public Integer map(char ref, LocusIterator context) { + //System.out.printf("Reads %s:%d %d%n", context.getContig(), context.getPosition(), context.getReads().size()); + //for ( SAMRecord read : context.getReads() ) { + // System.out.println(" -> " + read.getReadName()); + //} + + List reads = context.getReads(); + List offsets = context.getOffsets(); + String bases = ""; + String quals = ""; + //String offsetString = ""; + for ( int i = 0; i < reads.size(); i++ ) { + SAMRecord read = reads.get(i); + int offset = offsets.get(i); + + //if ( offset >= read.getReadString().length() ) + // System.out.printf(" [%2d] [%s] %s%n", offset, read.format(), read.getReadString()); + + bases += read.getReadString().charAt(offset); + quals += read.getBaseQualityString().charAt(offset); + //offsetString += i; + //System.out.printf(" [%2d] [%s] %s%n", offset, read.getReadString().charAt(offset), read.getReadString()); + } + + if ( context.getPosition() % 10 == 0 ) + System.out.printf("%s:%d: %s %s %s%n", context.getContig(), context.getPosition(), ref, bases, quals); + + //for ( int offset : context.getOffsets() ) { + // System.out.println(" -> " + read.getReadName()); + //} + return 1; + } + + // Given result of map function + public Integer reduceInit() { return 0; } + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } + + public void onTraveralDone() { + } +} \ No newline at end of file diff --git a/src/edu/mit/broad/sting/atk/modules/ReadWalkerTest.java 
b/src/edu/mit/broad/sting/atk/modules/ReadWalkerTest.java new file mode 100755 index 0000000000..75dd66182c --- /dev/null +++ b/src/edu/mit/broad/sting/atk/modules/ReadWalkerTest.java @@ -0,0 +1,51 @@ +package edu.mit.broad.sting.atk.modules; + +import edu.mit.broad.sam.SAMRecord; +import edu.mit.broad.sting.atk.ReadWalker; +import edu.mit.broad.sting.atk.LocusContext; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 22, 2009 + * Time: 3:22:14 PM + * To change this template use File | Settings | File Templates. + */ +public class ReadWalkerTest implements ReadWalker { + long[] qualCounts = new long[100]; + + public void initialize() { + for ( int i = 0; i < this.qualCounts.length; i++ ) { + this.qualCounts[i] = 0; + } + } + + public String walkerType() { return "ByRead"; } + + // Do we actually want to operate on the context? + public boolean filter(LocusContext context, SAMRecord read) { + return true; // We are keeping all the reads + } + + // Map over the edu.mit.broad.sting.atk.LocusContext + public Integer map(LocusContext context, SAMRecord read) { + for ( byte qual : read.getBaseQualities() ) { + //System.out.println(qual); + this.qualCounts[qual]++; + } + //System.out.println(read.getReadName()); + return 1; + } + + // Given result of map function + public Integer reduceInit() { return 0; } + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } + + public void onTraveralDone() { + for ( int i = 0; i < this.qualCounts.length; i++ ) { + System.out.printf("%3d : %10d%n", i, this.qualCounts[i]); + } + } +} diff --git a/src/edu/mit/broad/sting/utils/EndlessIterator.java b/src/edu/mit/broad/sting/utils/EndlessIterator.java new file mode 100755 index 0000000000..144473986c --- /dev/null +++ b/src/edu/mit/broad/sting/utils/EndlessIterator.java @@ -0,0 +1,30 @@ +package edu.mit.broad.sting.utils; + +import java.util.Iterator; + +/** + * Created by IntelliJ IDEA. 
+ * User: depristo + * Date: Feb 24, 2009 + * Time: 10:24:38 AM + * To change this template use File | Settings | File Templates. + */ +public class EndlessIterator implements Iterator { + private T value; + + public EndlessIterator(T value) { + this.value = value; + } + + public boolean hasNext() { + return true; + } + + public T next() { + return this.value; + } + + public void remove () { + throw new UnsupportedOperationException(); + } +} \ No newline at end of file diff --git a/src/edu/mit/broad/sting/utils/Predicate.java b/src/edu/mit/broad/sting/utils/Predicate.java new file mode 100755 index 0000000000..1d015534e3 --- /dev/null +++ b/src/edu/mit/broad/sting/utils/Predicate.java @@ -0,0 +1,13 @@ +package edu.mit.broad.sting.utils; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: Feb 24, 2009 + * Time: 10:15:19 AM + * To change this template use File | Settings | File Templates. + */ +public interface Predicate { + public boolean apply(T arg); +} + diff --git a/src/edu/mit/broad/sting/utils/PushbackIterator.java b/src/edu/mit/broad/sting/utils/PushbackIterator.java new file mode 100755 index 0000000000..37dfe6bef5 --- /dev/null +++ b/src/edu/mit/broad/sting/utils/PushbackIterator.java @@ -0,0 +1,46 @@ +/* +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2009 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+*/ +package edu.mit.broad.sting.utils; + +import java.util.Iterator; + +public class PushbackIterator implements Iterator { + Iterator underlyingIterator; + T pushedElement = null; + + public PushbackIterator(final Iterator underlyingIterator) { + this.underlyingIterator = underlyingIterator; + } + + public boolean hasNext() { + return pushedElement != null || underlyingIterator.hasNext(); + } + + public T next() { + if (pushedElement != null) { + final T ret = pushedElement; + pushedElement = null; + return ret; + } + return underlyingIterator.next(); + } + + public void pushback(T elt) { + pushedElement = elt; + } + + public void remove() { + throw new UnsupportedOperationException(); + } + + public Iterator getUnderlyingIterator() { + return underlyingIterator; + } +} \ No newline at end of file diff --git a/src/edu/mit/broad/sting/utils/ReferenceIterator.java b/src/edu/mit/broad/sting/utils/ReferenceIterator.java new file mode 100755 index 0000000000..514d3dd203 --- /dev/null +++ b/src/edu/mit/broad/sting/utils/ReferenceIterator.java @@ -0,0 +1,148 @@ +package edu.mit.broad.sting.utils; + +import edu.mit.broad.picard.reference.ReferenceSequenceFile; +import edu.mit.broad.picard.reference.ReferenceSequence; +import edu.mit.broad.sam.util.StringUtil; + +import java.util.Iterator; +import java.util.NoSuchElementException; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: Feb 24, 2009 + * Time: 10:45:01 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class ReferenceIterator implements Iterator { + + // The reference sequence file generator + private ReferenceSequenceFile refFile; + + private ReferenceSequence currentContig = null; + private ReferenceSequence nextContig = null; + private int offset = -1; + + public ReferenceIterator( ReferenceSequenceFile refFile ) { + this.refFile = refFile; + } + + // -------------------------------------------------------------------------------------------------------------- + // + // Accessing data + // + // -------------------------------------------------------------------------------------------------------------- + public byte getBaseAsByte() { return currentContig.getBases()[offset]; } + public String getBaseAsString() { return StringUtil.bytesToString(currentContig.getBases(), offset, 1); } + public char getBaseAsChar() { return getBaseAsString().charAt(0); } + public ReferenceSequence getCurrentContig() { return currentContig; } + public int getPosition() { return offset + 1; } + + // -------------------------------------------------------------------------------------------------------------- + // + // Iterator routines + // + // -------------------------------------------------------------------------------------------------------------- + public boolean hasNext() { + if ( currentContig == null || offset + 1 < currentContig.length() ) { + return true; + } + else { + return loadNextContig(); + } + } + + public ReferenceIterator next() { + if ( currentContig != null ) { + //System.out.printf(" -> %s:%d %d%n", currentContig.getName(), offset, currentContig.length()); + } + offset++; // move on to the next position + + if ( currentContig == null || offset >= currentContig.length() ) { + // We need to update the contig + //System.out.printf(" -> Updating length%n"); + if ( nextContig != null ) { + // We've already loaded the next contig, swap it in, and recursively call next + swapNextContig(); + return next(); + } + else if ( loadNextContig() ){ + // We 
sucessfully loaded the next contig, recursively call next + offset = -1; + return next(); + } + else { + throw new NoSuchElementException(); + } + } + else { + // We're good to go -- we're in the current contig + return this; + } + } + + public void remove() { + throw new UnsupportedOperationException(); + } + + + // -------------------------------------------------------------------------------------------------------------- + // + // Jumping forward + // + // -------------------------------------------------------------------------------------------------------------- + public ReferenceIterator seekForward(final String contigName, final int pos) { + return seekForwardOffset(contigName, pos - 1); + } + + private ReferenceIterator seekForwardOffset(final String contigName, final int seekOffset) { + // jumps us forward in the sequence to the contig / pos + if ( currentContig == null ) + next(); + + //System.out.printf(" -> Seeking to %s %d from %s %d%n", contigName, seekOffset, currentContig.getName(), offset); + if ( contigName.equals(currentContig.getName()) ) { + // we're somewhere on this contig + if ( seekOffset < offset || seekOffset >= currentContig.length() ) { + // bad boy -- can't go backward safely or just beyond the contig length + throw new IllegalArgumentException("Bad seek to " + seekOffset + " current: " + offset); + //return null; + } + else { + offset = seekOffset - 1; + return next(); + } + } + else { + while (true) { + // go searching through the reference + if ( ! 
loadNextContig() ) { + // never found anything + return null; + } + else if ( nextContig.getName().equals(contigName) ) { + swapNextContig(); + return seekForward(contigName, seekOffset); + } + } + } + } + + + // -------------------------------------------------------------------------------------------------------------- + // + // Interal state manipulation + // + // -------------------------------------------------------------------------------------------------------------- + protected boolean loadNextContig() { + // returns true if we had another contig to load + nextContig = refFile.nextSequence(); + return nextContig != null; + } + + protected void swapNextContig() { + currentContig = nextContig; + nextContig = null; + offset = -1; + } +} diff --git a/src/edu/mit/broad/sting/utils/Utils.java b/src/edu/mit/broad/sting/utils/Utils.java new file mode 100755 index 0000000000..094893448a --- /dev/null +++ b/src/edu/mit/broad/sting/utils/Utils.java @@ -0,0 +1,55 @@ +package edu.mit.broad.sting.utils; + +import edu.mit.broad.sam.SAMRecord; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: Feb 24, 2009 + * Time: 10:12:31 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class Utils { + public static List filter(Predicate pred, Collection c) { + List filtered = new ArrayList(); + // loop through all the elements in c + for (T obj : c) { + // if the predicate is true for the current element + if (pred.apply(obj)) { + // append it to the result list + filtered.add(obj); + } + } + return filtered; + } + + private static final Map readFlagNames + = new HashMap(); + + static { + readFlagNames.put(0x1, "Paired"); + readFlagNames.put(0x2, "Proper"); + readFlagNames.put(0x4, "Unmapped"); + readFlagNames.put(0x8, "MateUnmapped"); + readFlagNames.put(0x10, "Forward"); + //readFlagNames.put(0x20, "MateForward"); + readFlagNames.put(0x4, "FirstOfPair"); + readFlagNames.put(0x8, "SecondOfPair"); + readFlagNames.put(0x100, "NotPrimary"); + readFlagNames.put(0x200, "NON-PF"); + readFlagNames.put(0x400, "Duplicate"); + } + + public static String readFlagsAsString(SAMRecord rec) { + String flags = ""; + for ( int flag : readFlagNames.keySet() ) { + if ( ( rec.getFlags() & flag ) != 0 ) { + flags += readFlagNames.get(flag) + " "; + } + } + return flags; + } + +} diff --git a/src/scripts/TraverseTest.sh b/src/scripts/TraverseTest.sh new file mode 100755 index 0000000000..3238c40438 --- /dev/null +++ b/src/scripts/TraverseTest.sh @@ -0,0 +1 @@ +java -cp out/production/AnalysisTK:../../jars/broad.jar edu.mit.broad.sting.atk.AnalysisTK $* diff --git a/src/scripts/TraverseTestProf.sh b/src/scripts/TraverseTestProf.sh new file mode 100755 index 0000000000..ff10a50cd4 --- /dev/null +++ b/src/scripts/TraverseTestProf.sh @@ -0,0 +1 @@ +java -agentlib:hprof=cpu=samples -cp out/production/AnalysisTK:../../jars/broad.jar edu.mit.broad.sting.atk.AnalysisTK $*