|
| 1 | +package picard.sam; |
| 2 | + |
| 3 | +import htsjdk.samtools.SAMReadGroupRecord; |
| 4 | +import htsjdk.samtools.SAMRecord; |
| 5 | +import htsjdk.samtools.fastq.FastqRecord; |
| 6 | +import htsjdk.samtools.fastq.FastqWriter; |
| 7 | +import htsjdk.samtools.fastq.FastqWriterFactory; |
| 8 | +import htsjdk.samtools.util.IOUtil; |
| 9 | +import htsjdk.samtools.util.Log; |
| 10 | +import org.apache.commons.lang3.StringUtils; |
| 11 | +import org.broadinstitute.barclay.argparser.Argument; |
| 12 | +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; |
| 13 | +import picard.PicardException; |
| 14 | +import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; |
| 15 | + |
| 16 | +import java.io.File; |
| 17 | +import java.util.*; |
| 18 | +import java.util.stream.Collectors; |
| 19 | + |
| 20 | +/** |
| 21 | + * <p> Extracts read sequences and qualities from the input SAM/BAM file and SAM/BAM tags and writes them into |
| 22 | + * output files in Sanger FASTQ format. |
| 23 | + * See <a href="http://maq.sourceforge.net/fastq.shtml">MAQ FASTQ specification</a> for details. |
| 24 | + * <br /> |
| 25 | + * <h4>Usage example:</h4> |
| 26 | + * <pre> |
| 27 | + * java -jar picard.jar SamToFastqWithTags |
| 28 | + * I=input.bam |
| 29 | + * FASTQ=output.fastq |
| 30 | + * SEQUENCE_TAG_GROUP="CR" |
| 31 | + * QUALITY_TAG_GROUP="CY" |
| 32 | + * SEQUENCE_TAG_GROUP="CB,UR" |
| 33 | + * QUALITY_TAG_GROUP="CY,UY" |
| 34 | + * </pre> |
| 35 | + * <hr /> |
| 36 | + */ |
| 37 | +@CommandLineProgramProperties( |
| 38 | + summary = SamToFastqWithTags.USAGE_SUMMARY + SamToFastqWithTags.USAGE_DETAILS, |
| 39 | + oneLineSummary = SamToFastqWithTags.USAGE_SUMMARY, |
| 40 | + programGroup = ReadDataManipulationProgramGroup.class) |
| 41 | +public class SamToFastqWithTags extends SamToFastq { |
| 42 | + static final String USAGE_SUMMARY = "Converts a SAM or BAM file to FASTQ alongside FASTQs created from tags."; |
| 43 | + static final String USAGE_DETAILS = " Extracts read sequences and qualities from the input SAM/BAM file and SAM/BAM tags" + |
| 44 | + " and writes them into the output file in Sanger FASTQ format." + |
| 45 | + " See <a href=\"http://maq.sourceforge.net/fastq.shtml\">MAQ FASTQ specification</a> for details.<br /> <br />" + |
| 46 | + "The following example will create two FASTQs from tags. One will be converted with the base sequence coming from " + |
| 47 | + "the \"CR\" tag and base quality from the \"CY\" tag. The other fastq will be converted with the base sequence coming" + |
| 48 | + " from the \"CB\" and \"UR\" tags concatenated together with no separator (not specified on command line) with the base" + |
| 49 | + " qualities coming from the \"CY\" and \"UY\" tags concatenated together. The two files will be named CR.fastq" + |
| 50 | + " and CB_UR.fastq." + |
| 51 | + "<br />" + |
| 52 | + "<pre>" + |
| 53 | + "java -jar picard.jar SamToFastqWithTags <br />" + |
| 54 | + " I=input.bam<br />" + |
| 55 | + " FASTQ=output.fastq<br />" + |
| 56 | + " SEQUENCE_TAG_GROUP=CR<br />" + |
| 57 | + " QUALITY_TAG_GROUP=CY<br />" + |
| 58 | + " SEQUENCE_TAG_GROUP=\"CB,UR\"<br />" + |
| 59 | + " QUALITY_TAG_GROUP=\"CY,UY\"" + |
| 60 | + "</pre>" + |
| 61 | + "<hr />"; |
| 62 | + |
| 63 | + @Argument(shortName = "STG", doc = "List of comma separated tag values to extract from Input SAM/BAM to be used as read sequence", minElements = 1) |
| 64 | + public List<String> SEQUENCE_TAG_GROUP; |
| 65 | + |
| 66 | + @Argument(shortName = "QTG", doc = "List of comma separated tag values to extract from Input SAM/BAM to be used as read qualities", optional = true) |
| 67 | + public List<String> QUALITY_TAG_GROUP; |
| 68 | + |
| 69 | + @Argument(shortName = "SEP", doc = "List of any sequences (e.g. 'AACCTG`) to put in between each comma separated list of sequence tags in each SEQUENCE_TAG_GROUP (STG)", optional = true) |
| 70 | + public List<String> TAG_GROUP_SEPERATOR; |
| 71 | + |
| 72 | + @Argument(shortName = "GZOPTG", doc = "Compress output FASTQ files per Tag grouping using gzip and append a .gz extension to the file names.") |
| 73 | + public Boolean COMPRESS_OUTPUTS_PER_TAG_GROUP = false; |
| 74 | + |
| 75 | + private final Log log = Log.getInstance(SamToFastqWithTags.class); |
| 76 | + |
| 77 | + private static final String TAG_SPLIT_DEFAULT_SEP = ""; |
| 78 | + private static final String TAG_SPLIT_QUAL = "~"; |
| 79 | + |
| 80 | + private ArrayList<String[]> SPLIT_SEQUENCE_TAGS; |
| 81 | + private ArrayList<String[]> SPLIT_QUALITY_TAGS; |
| 82 | + private ArrayList<String> SPLIT_SEPARATOR_TAGS; |
| 83 | + |
| 84 | + @Override |
| 85 | + protected void initializeAdditionalWriters() { |
| 86 | + setupTagSplitValues(); |
| 87 | + } |
| 88 | + |
| 89 | + @Override |
| 90 | + protected void handleAdditionalRecords(SAMRecord currentRecord, Map<SAMReadGroupRecord, List<FastqWriter>> tagWriters, SAMRecord read1, SAMRecord read2) { |
| 91 | + final List<FastqWriter> rgTagWriters = tagWriters.get(currentRecord.getReadGroup()); |
| 92 | + if (currentRecord.getReadPairedFlag()) { |
| 93 | + if (read1 != null && read2 !=null) { |
| 94 | + writeTagRecords(read1, 1, rgTagWriters); |
| 95 | + writeTagRecords(read2, 2, rgTagWriters); |
| 96 | + } |
| 97 | + } else { |
| 98 | + writeTagRecords(currentRecord, null, rgTagWriters); |
| 99 | + } |
| 100 | + } |
| 101 | + |
| 102 | + @Override |
| 103 | + protected Map<SAMReadGroupRecord, List<FastqWriter>> generateAdditionalWriters(List<SAMReadGroupRecord> readGroups, FastqWriterFactory factory) { |
| 104 | + return generateTagWriters(readGroups, factory); |
| 105 | + } |
| 106 | + |
| 107 | + // generate writers |
| 108 | + private Map<SAMReadGroupRecord, List<FastqWriter>> generateTagWriters(final List<SAMReadGroupRecord> samReadGroupRecords, |
| 109 | + final FastqWriterFactory factory) { |
| 110 | + final Map<SAMReadGroupRecord, List<FastqWriter>> writerMap = new HashMap<>(); |
| 111 | + |
| 112 | + if (!OUTPUT_PER_RG) { |
| 113 | + /* Prepare tag writers based on sequence tag groups provided in command line */ |
| 114 | + |
| 115 | + final List<FastqWriter> tagFastqWriters = makeTagWriters(null, factory); |
| 116 | + |
| 117 | + writerMap.put(null, tagFastqWriters); |
| 118 | + for (final SAMReadGroupRecord rg : samReadGroupRecords) { |
| 119 | + writerMap.put(rg, tagFastqWriters); |
| 120 | + } |
| 121 | + } else { |
| 122 | + /* prepare tag writers based on readgroup names */ |
| 123 | + for (final SAMReadGroupRecord rg : samReadGroupRecords) { |
| 124 | + final List<FastqWriter> tagWriters = makeTagWriters(rg, factory); |
| 125 | + |
| 126 | + writerMap.put(rg, tagWriters); |
| 127 | + } |
| 128 | + } |
| 129 | + return writerMap; |
| 130 | + } |
| 131 | + |
| 132 | + /** |
| 133 | + * Creates fastq writers based on readgroup passed in and sequence tag groupings from command line |
| 134 | + */ |
| 135 | + private List<FastqWriter> makeTagWriters(final SAMReadGroupRecord readGroup, final FastqWriterFactory factory) { |
| 136 | + String baseFilename = null; |
| 137 | + if (readGroup != null) { |
| 138 | + if (RG_TAG.equalsIgnoreCase("PU")) { |
| 139 | + baseFilename = readGroup.getPlatformUnit() + "_"; |
| 140 | + } else if (RG_TAG.equalsIgnoreCase("ID")) { |
| 141 | + baseFilename = readGroup.getReadGroupId() + "_"; |
| 142 | + } |
| 143 | + if (baseFilename == null) { |
| 144 | + throw new PicardException("The selected RG_TAG: " + RG_TAG + " is not present in the bam header."); |
| 145 | + } |
| 146 | + } else { |
| 147 | + baseFilename = ""; |
| 148 | + } |
| 149 | + |
| 150 | + List<File> tagFiles = new ArrayList<>(); |
| 151 | + for (String tagSplit : SEQUENCE_TAG_GROUP) { |
| 152 | + String fileName = baseFilename + tagSplit.replace(",", "_"); |
| 153 | + fileName = IOUtil.makeFileNameSafe(fileName); |
| 154 | + fileName += COMPRESS_OUTPUTS_PER_TAG_GROUP ? ".fastq.gz" : ".fastq"; |
| 155 | + |
| 156 | + final File result = (OUTPUT_DIR != null) |
| 157 | + ? new File(OUTPUT_DIR, fileName) |
| 158 | + : new File(FASTQ.getParent(), fileName); |
| 159 | + IOUtil.assertFileIsWritable(result); |
| 160 | + tagFiles.add(result); |
| 161 | + } |
| 162 | + return tagFiles.stream().map(factory::newWriter).collect(Collectors.toList()); |
| 163 | + } |
| 164 | + |
| 165 | + // Sets up the Groupings of Sequence Tags, Quality Tags, and Separator Strings so we dont have to calculate them for every loop |
| 166 | + private void setupTagSplitValues() { |
| 167 | + SPLIT_SEQUENCE_TAGS = new ArrayList<>(); |
| 168 | + SPLIT_QUALITY_TAGS = new ArrayList<>(); |
| 169 | + SPLIT_SEPARATOR_TAGS = new ArrayList<>(); |
| 170 | + |
| 171 | + for (int i = 0; i < SEQUENCE_TAG_GROUP.size(); i++) { |
| 172 | + SPLIT_SEQUENCE_TAGS.add(SEQUENCE_TAG_GROUP.get(i).trim().split(",")); |
| 173 | + SPLIT_QUALITY_TAGS.add(QUALITY_TAG_GROUP.isEmpty() ? null : QUALITY_TAG_GROUP.get(i).trim().split(",")); |
| 174 | + SPLIT_SEPARATOR_TAGS.add(TAG_GROUP_SEPERATOR.isEmpty() ? TAG_SPLIT_DEFAULT_SEP : TAG_GROUP_SEPERATOR.get(i)); |
| 175 | + } |
| 176 | + } |
| 177 | + |
| 178 | + private void writeTagRecords(final SAMRecord read, final Integer mateNumber, final List<FastqWriter> tagWriters) { |
| 179 | + if (SEQUENCE_TAG_GROUP.isEmpty()) { |
| 180 | + return; |
| 181 | + } |
| 182 | + |
| 183 | + final String seqHeader = mateNumber == null ? read.getReadName() : read.getReadName() + "/" + mateNumber; |
| 184 | + |
| 185 | + for (int i = 0; i < SEQUENCE_TAG_GROUP.size(); i++) { |
| 186 | + final String tmpTagSep = SPLIT_SEPARATOR_TAGS.get(i); |
| 187 | + final String[] sequenceTagsToWrite = SPLIT_SEQUENCE_TAGS.get(i); |
| 188 | + final String newSequence = String.join(tmpTagSep, Arrays.stream(sequenceTagsToWrite) |
| 189 | + .map(tag -> assertTagExists(read, tag)) |
| 190 | + .collect(Collectors.toList())); |
| 191 | + |
| 192 | + final String tmpQualSep = StringUtils.repeat(TAG_SPLIT_QUAL, tmpTagSep.length()); |
| 193 | + final String[] qualityTagsToWrite = SPLIT_QUALITY_TAGS.get(i); |
| 194 | + final String newQual = QUALITY_TAG_GROUP.isEmpty() ? StringUtils.repeat(TAG_SPLIT_QUAL, newSequence.length()) : |
| 195 | + String.join(tmpQualSep, Arrays.stream(qualityTagsToWrite) |
| 196 | + .map(tag -> assertTagExists(read, tag)) |
| 197 | + .collect(Collectors.toList())); |
| 198 | + FastqWriter writer = tagWriters.get(i); |
| 199 | + writer.write(new FastqRecord(seqHeader, newSequence, "", newQual)); |
| 200 | + } |
| 201 | + } |
| 202 | + |
| 203 | + private String assertTagExists(final SAMRecord record, final String tag) { |
| 204 | + String value = record.getStringAttribute(tag); |
| 205 | + if (value == null) { |
| 206 | + throw new PicardException("Record: " + record.getReadName() + " does have a value for tag: " + tag); |
| 207 | + } |
| 208 | + return value; |
| 209 | + } |
| 210 | + |
| 211 | + @Override |
| 212 | + protected String[] customCommandLineValidation() { |
| 213 | + List<String> errors = new ArrayList<>(); |
| 214 | + |
| 215 | + if (!QUALITY_TAG_GROUP.isEmpty() && SEQUENCE_TAG_GROUP.size() != QUALITY_TAG_GROUP.size()) { |
| 216 | + errors.add("QUALITY_TAG_GROUP size must be equal to SEQUENCE_TAG_GROUP or not be specified at all."); |
| 217 | + } |
| 218 | + |
| 219 | + if (!TAG_GROUP_SEPERATOR.isEmpty() && SEQUENCE_TAG_GROUP.size() != TAG_GROUP_SEPERATOR.size()) { |
| 220 | + errors.add("TAG_GROUP_SEPERATOR size must be equal to SEQUENCE_TAG_GROUP or not be specified at all."); |
| 221 | + } |
| 222 | + |
| 223 | + return errors.isEmpty() ? super.customCommandLineValidation() : errors.toArray(new String[errors.size()]); |
| 224 | + } |
| 225 | +} |
0 commit comments