Skip to content

Commit 3984f99

Browse files
jacareyjsotobroad
authored andcommitted
Refactor SamToFastq and add SamToFastqWithTags (broadinstitute#1110)
1 parent 0647af3 commit 3984f99

File tree

7 files changed

+643
-174
lines changed

7 files changed

+643
-174
lines changed

src/main/java/picard/sam/SamToFastq.java

+139-116
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
package picard.sam;
2+
3+
import htsjdk.samtools.SAMReadGroupRecord;
4+
import htsjdk.samtools.SAMRecord;
5+
import htsjdk.samtools.fastq.FastqRecord;
6+
import htsjdk.samtools.fastq.FastqWriter;
7+
import htsjdk.samtools.fastq.FastqWriterFactory;
8+
import htsjdk.samtools.util.IOUtil;
9+
import htsjdk.samtools.util.Log;
10+
import org.apache.commons.lang3.StringUtils;
11+
import org.broadinstitute.barclay.argparser.Argument;
12+
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
13+
import picard.PicardException;
14+
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
15+
16+
import java.io.File;
17+
import java.util.*;
18+
import java.util.stream.Collectors;
19+
20+
/**
21+
* <p> Extracts read sequences and qualities from the input SAM/BAM file and SAM/BAM tags and writes them into
22+
* output files in Sanger FASTQ format.
23+
* See <a href="http://maq.sourceforge.net/fastq.shtml">MAQ FASTQ specification</a> for details.
24+
* <br />
25+
* <h4>Usage example:</h4>
26+
* <pre>
27+
* java -jar picard.jar SamToFastqWithTags
28+
* I=input.bam
29+
* FASTQ=output.fastq
30+
* SEQUENCE_TAG_GROUP="CR"
31+
* QUALITY_TAG_GROUP="CY"
32+
* SEQUENCE_TAG_GROUP="CB,UR"
33+
* QUALITY_TAG_GROUP="CY,UY"
34+
* </pre>
35+
* <hr />
36+
*/
37+
@CommandLineProgramProperties(
38+
summary = SamToFastqWithTags.USAGE_SUMMARY + SamToFastqWithTags.USAGE_DETAILS,
39+
oneLineSummary = SamToFastqWithTags.USAGE_SUMMARY,
40+
programGroup = ReadDataManipulationProgramGroup.class)
41+
public class SamToFastqWithTags extends SamToFastq {
42+
static final String USAGE_SUMMARY = "Converts a SAM or BAM file to FASTQ alongside FASTQs created from tags.";
43+
static final String USAGE_DETAILS = " Extracts read sequences and qualities from the input SAM/BAM file and SAM/BAM tags" +
44+
" and writes them into the output file in Sanger FASTQ format." +
45+
" See <a href=\"http://maq.sourceforge.net/fastq.shtml\">MAQ FASTQ specification</a> for details.<br /> <br />" +
46+
"The following example will create two FASTQs from tags. One will be converted with the base sequence coming from " +
47+
"the \"CR\" tag and base quality from the \"CY\" tag. The other fastq will be converted with the base sequence coming" +
48+
" from the \"CB\" and \"UR\" tags concatenated together with no separator (not specified on command line) with the base" +
49+
" qualities coming from the \"CY\" and \"UY\" tags concatenated together. The two files will be named CR.fastq" +
50+
" and CB_UR.fastq." +
51+
"<br />" +
52+
"<pre>" +
53+
"java -jar picard.jar SamToFastqWithTags <br />" +
54+
" I=input.bam<br />" +
55+
" FASTQ=output.fastq<br />" +
56+
" SEQUENCE_TAG_GROUP=CR<br />" +
57+
" QUALITY_TAG_GROUP=CY<br />" +
58+
" SEQUENCE_TAG_GROUP=\"CB,UR\"<br />" +
59+
" QUALITY_TAG_GROUP=\"CY,UY\"" +
60+
"</pre>" +
61+
"<hr />";
62+
63+
@Argument(shortName = "STG", doc = "List of comma separated tag values to extract from Input SAM/BAM to be used as read sequence", minElements = 1)
64+
public List<String> SEQUENCE_TAG_GROUP;
65+
66+
@Argument(shortName = "QTG", doc = "List of comma separated tag values to extract from Input SAM/BAM to be used as read qualities", optional = true)
67+
public List<String> QUALITY_TAG_GROUP;
68+
69+
@Argument(shortName = "SEP", doc = "List of any sequences (e.g. 'AACCTG`) to put in between each comma separated list of sequence tags in each SEQUENCE_TAG_GROUP (STG)", optional = true)
70+
public List<String> TAG_GROUP_SEPERATOR;
71+
72+
@Argument(shortName = "GZOPTG", doc = "Compress output FASTQ files per Tag grouping using gzip and append a .gz extension to the file names.")
73+
public Boolean COMPRESS_OUTPUTS_PER_TAG_GROUP = false;
74+
75+
private final Log log = Log.getInstance(SamToFastqWithTags.class);
76+
77+
private static final String TAG_SPLIT_DEFAULT_SEP = "";
78+
private static final String TAG_SPLIT_QUAL = "~";
79+
80+
private ArrayList<String[]> SPLIT_SEQUENCE_TAGS;
81+
private ArrayList<String[]> SPLIT_QUALITY_TAGS;
82+
private ArrayList<String> SPLIT_SEPARATOR_TAGS;
83+
84+
@Override
85+
protected void initializeAdditionalWriters() {
86+
setupTagSplitValues();
87+
}
88+
89+
@Override
90+
protected void handleAdditionalRecords(SAMRecord currentRecord, Map<SAMReadGroupRecord, List<FastqWriter>> tagWriters, SAMRecord read1, SAMRecord read2) {
91+
final List<FastqWriter> rgTagWriters = tagWriters.get(currentRecord.getReadGroup());
92+
if (currentRecord.getReadPairedFlag()) {
93+
if (read1 != null && read2 !=null) {
94+
writeTagRecords(read1, 1, rgTagWriters);
95+
writeTagRecords(read2, 2, rgTagWriters);
96+
}
97+
} else {
98+
writeTagRecords(currentRecord, null, rgTagWriters);
99+
}
100+
}
101+
102+
@Override
103+
protected Map<SAMReadGroupRecord, List<FastqWriter>> generateAdditionalWriters(List<SAMReadGroupRecord> readGroups, FastqWriterFactory factory) {
104+
return generateTagWriters(readGroups, factory);
105+
}
106+
107+
// generate writers
108+
private Map<SAMReadGroupRecord, List<FastqWriter>> generateTagWriters(final List<SAMReadGroupRecord> samReadGroupRecords,
109+
final FastqWriterFactory factory) {
110+
final Map<SAMReadGroupRecord, List<FastqWriter>> writerMap = new HashMap<>();
111+
112+
if (!OUTPUT_PER_RG) {
113+
/* Prepare tag writers based on sequence tag groups provided in command line */
114+
115+
final List<FastqWriter> tagFastqWriters = makeTagWriters(null, factory);
116+
117+
writerMap.put(null, tagFastqWriters);
118+
for (final SAMReadGroupRecord rg : samReadGroupRecords) {
119+
writerMap.put(rg, tagFastqWriters);
120+
}
121+
} else {
122+
/* prepare tag writers based on readgroup names */
123+
for (final SAMReadGroupRecord rg : samReadGroupRecords) {
124+
final List<FastqWriter> tagWriters = makeTagWriters(rg, factory);
125+
126+
writerMap.put(rg, tagWriters);
127+
}
128+
}
129+
return writerMap;
130+
}
131+
132+
/**
133+
* Creates fastq writers based on readgroup passed in and sequence tag groupings from command line
134+
*/
135+
private List<FastqWriter> makeTagWriters(final SAMReadGroupRecord readGroup, final FastqWriterFactory factory) {
136+
String baseFilename = null;
137+
if (readGroup != null) {
138+
if (RG_TAG.equalsIgnoreCase("PU")) {
139+
baseFilename = readGroup.getPlatformUnit() + "_";
140+
} else if (RG_TAG.equalsIgnoreCase("ID")) {
141+
baseFilename = readGroup.getReadGroupId() + "_";
142+
}
143+
if (baseFilename == null) {
144+
throw new PicardException("The selected RG_TAG: " + RG_TAG + " is not present in the bam header.");
145+
}
146+
} else {
147+
baseFilename = "";
148+
}
149+
150+
List<File> tagFiles = new ArrayList<>();
151+
for (String tagSplit : SEQUENCE_TAG_GROUP) {
152+
String fileName = baseFilename + tagSplit.replace(",", "_");
153+
fileName = IOUtil.makeFileNameSafe(fileName);
154+
fileName += COMPRESS_OUTPUTS_PER_TAG_GROUP ? ".fastq.gz" : ".fastq";
155+
156+
final File result = (OUTPUT_DIR != null)
157+
? new File(OUTPUT_DIR, fileName)
158+
: new File(FASTQ.getParent(), fileName);
159+
IOUtil.assertFileIsWritable(result);
160+
tagFiles.add(result);
161+
}
162+
return tagFiles.stream().map(factory::newWriter).collect(Collectors.toList());
163+
}
164+
165+
// Sets up the Groupings of Sequence Tags, Quality Tags, and Separator Strings so we dont have to calculate them for every loop
166+
private void setupTagSplitValues() {
167+
SPLIT_SEQUENCE_TAGS = new ArrayList<>();
168+
SPLIT_QUALITY_TAGS = new ArrayList<>();
169+
SPLIT_SEPARATOR_TAGS = new ArrayList<>();
170+
171+
for (int i = 0; i < SEQUENCE_TAG_GROUP.size(); i++) {
172+
SPLIT_SEQUENCE_TAGS.add(SEQUENCE_TAG_GROUP.get(i).trim().split(","));
173+
SPLIT_QUALITY_TAGS.add(QUALITY_TAG_GROUP.isEmpty() ? null : QUALITY_TAG_GROUP.get(i).trim().split(","));
174+
SPLIT_SEPARATOR_TAGS.add(TAG_GROUP_SEPERATOR.isEmpty() ? TAG_SPLIT_DEFAULT_SEP : TAG_GROUP_SEPERATOR.get(i));
175+
}
176+
}
177+
178+
private void writeTagRecords(final SAMRecord read, final Integer mateNumber, final List<FastqWriter> tagWriters) {
179+
if (SEQUENCE_TAG_GROUP.isEmpty()) {
180+
return;
181+
}
182+
183+
final String seqHeader = mateNumber == null ? read.getReadName() : read.getReadName() + "/" + mateNumber;
184+
185+
for (int i = 0; i < SEQUENCE_TAG_GROUP.size(); i++) {
186+
final String tmpTagSep = SPLIT_SEPARATOR_TAGS.get(i);
187+
final String[] sequenceTagsToWrite = SPLIT_SEQUENCE_TAGS.get(i);
188+
final String newSequence = String.join(tmpTagSep, Arrays.stream(sequenceTagsToWrite)
189+
.map(tag -> assertTagExists(read, tag))
190+
.collect(Collectors.toList()));
191+
192+
final String tmpQualSep = StringUtils.repeat(TAG_SPLIT_QUAL, tmpTagSep.length());
193+
final String[] qualityTagsToWrite = SPLIT_QUALITY_TAGS.get(i);
194+
final String newQual = QUALITY_TAG_GROUP.isEmpty() ? StringUtils.repeat(TAG_SPLIT_QUAL, newSequence.length()) :
195+
String.join(tmpQualSep, Arrays.stream(qualityTagsToWrite)
196+
.map(tag -> assertTagExists(read, tag))
197+
.collect(Collectors.toList()));
198+
FastqWriter writer = tagWriters.get(i);
199+
writer.write(new FastqRecord(seqHeader, newSequence, "", newQual));
200+
}
201+
}
202+
203+
private String assertTagExists(final SAMRecord record, final String tag) {
204+
String value = record.getStringAttribute(tag);
205+
if (value == null) {
206+
throw new PicardException("Record: " + record.getReadName() + " does have a value for tag: " + tag);
207+
}
208+
return value;
209+
}
210+
211+
@Override
212+
protected String[] customCommandLineValidation() {
213+
List<String> errors = new ArrayList<>();
214+
215+
if (!QUALITY_TAG_GROUP.isEmpty() && SEQUENCE_TAG_GROUP.size() != QUALITY_TAG_GROUP.size()) {
216+
errors.add("QUALITY_TAG_GROUP size must be equal to SEQUENCE_TAG_GROUP or not be specified at all.");
217+
}
218+
219+
if (!TAG_GROUP_SEPERATOR.isEmpty() && SEQUENCE_TAG_GROUP.size() != TAG_GROUP_SEPERATOR.size()) {
220+
errors.add("TAG_GROUP_SEPERATOR size must be equal to SEQUENCE_TAG_GROUP or not be specified at all.");
221+
}
222+
223+
return errors.isEmpty() ? super.customCommandLineValidation() : errors.toArray(new String[errors.size()]);
224+
}
225+
}

src/test/java/picard/sam/SamToFastqTest.java

+23-38
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,6 @@ public Object[][] okFiles() {
6868
};
6969
}
7070

71-
7271
@DataProvider(name = "badFiles")
7372
public Object[][] badFiles() {
7473
return new Object[][] {
@@ -146,7 +145,7 @@ public void testOkFile(final String samFilename) throws IOException {
146145
verifyFastq(pair1File, pair2File, samFile);
147146
}
148147

149-
@Test(dataProvider = "okFiles")
148+
@Test(dataProvider = "okFiles")
150149
public void testOkInterleavedFile(final String samFilename) throws IOException {
151150
final File samFile = new File(TEST_DATA_DIR,samFilename);
152151
final File pairFile = newTempFastqFile("pair");
@@ -169,12 +168,11 @@ public void testOkInterleavedFile(final String samFilename) throws IOException {
169168
Assert.assertNotNull(mpair.mate2);
170169
Assert.assertEquals(mpair.mate1.getReadName(),mpair.mate2.getReadName());
171170
final String readName = mpair.mate1.getReadName() ;
172-
Assert.assertTrue(outputHeaderSet.contains(readName+"/1")); // ensure mate is in correct file
173-
Assert.assertTrue(outputHeaderSet.contains(readName+"/2"));
171+
Assert.assertTrue(outputHeaderSet.contains(readName + "/1")); // ensure mate is in correct file
172+
Assert.assertTrue(outputHeaderSet.contains(readName + "/2"));
174173
}
175174
}
176175

177-
178176
@Test (dataProvider = "badFiles", expectedExceptions= SAMFormatException.class)
179177
public void testBadFile(final String samFilename) throws IOException {
180178
final File samFile = new File(TEST_DATA_DIR,samFilename);
@@ -196,7 +194,6 @@ public Object[][] okGroupedFiles() {
196194
};
197195
}
198196

199-
200197
@DataProvider(name = "badGroupedFiles")
201198
public Object[][] badGroupedFiles() {
202199
return new Object[][] {
@@ -217,29 +214,22 @@ public void testOkGroupedFiles(final String samFilename, final String [] groupFi
217214
final Map<String, Set<String>> outputSets = new HashMap<>(groupFiles.length);
218215

219216
final String tmpDir = IOUtil.getDefaultTmpDir().getAbsolutePath() + "/";
220-
final String [] args = new String[]{
217+
final String [] args = {
221218
"INPUT=" + samFile.getAbsolutePath(),
222219
"OUTPUT_PER_RG=true",
223220
"OUTPUT_DIR=" + tmpDir,
224221
};
225222
runPicardCommandLine(args);
226223

227-
File f1;
228-
File f2;
229-
String fname1;
230-
String fname2;
231-
String keyName1;
232-
String keyName2;
233224
Set<String> outputHeaderSet1;
234225
Set<String> outputHeaderSet2;
235-
for(final String groupPUName : groupFiles)
236-
{
237-
keyName1 = groupPUName + "_1";
238-
keyName2 = groupPUName + "_2";
239-
fname1 = tmpDir + "/" + keyName1 + ".fastq";
240-
fname2 = tmpDir + "/" + keyName2 + ".fastq";
241-
f1 = new File(fname1);
242-
f2 = new File(fname2);
226+
for (final String groupPUName : groupFiles) {
227+
String keyName1 = groupPUName + "_1";
228+
String keyName2 = groupPUName + "_2";
229+
String fname1 = tmpDir + "/" + keyName1 + ".fastq";
230+
String fname2 = tmpDir + "/" + keyName2 + ".fastq";
231+
File f1 = new File(fname1);
232+
File f2 = new File(fname2);
243233
f1.deleteOnExit();
244234
f2.deleteOnExit();
245235
IOUtil.assertFileIsReadable(f1);
@@ -267,8 +257,8 @@ public void testOkGroupedFiles(final String samFilename, final String [] groupFi
267257
Assert.assertNotNull(mpair.mate2);
268258
Assert.assertEquals(mpair.mate1.getReadName(),mpair.mate2.getReadName());
269259
final String readName = mpair.mate1.getReadName() ;
270-
Assert.assertTrue(outputHeaderSet1.contains(readName+"/1")); // ensure mate is in correct file
271-
Assert.assertTrue(outputHeaderSet2.contains(readName+"/2"));
260+
Assert.assertTrue(outputHeaderSet1.contains(readName + "/1")); // ensure mate is in correct file
261+
Assert.assertTrue(outputHeaderSet2.contains(readName + "/2"));
272262
}
273263
}
274264
}
@@ -284,7 +274,6 @@ public void testBadGroupedFileOutputPerRg(final String samFilename) throws IOExc
284274

285275
@Test (dataProvider = "badGroupedFiles", expectedExceptions=SAMFormatException.class)
286276
public void testBadGroupedFile(final String samFilename) throws IOException {
287-
final File samFile = new File(TEST_DATA_DIR, samFilename);
288277
final File pair1File = newTempFastqFile("pair1");
289278
final File pair2File = newTempFastqFile("pair2");
290279

@@ -359,12 +348,12 @@ public Object[][] trimmedData() {
359348
};
360349
}
361350

362-
private Set<String> createFastqReadHeaderSet(final File file) {
351+
protected static Set<String> createFastqReadHeaderSet(final File file) {
363352
final Set<String> set = new HashSet<String>();
364353
final FastqReader freader = new FastqReader(file);
365354
while (freader.hasNext()) {
366355
final FastqRecord frec = freader.next();
367-
set.add(frec.getReadHeader());
356+
set.add(frec.getReadName());
368357
}
369358
return set ;
370359
}
@@ -386,19 +375,17 @@ private Map<String,MatePair> createSamMatePairsMap(final File samFile) throws IO
386375
return map;
387376
}
388377

389-
390-
private Map<String, Map<String, MatePair>> createPUPairsMap(final File samFile) throws IOException {
378+
protected static Map<String, Map<String, MatePair>> createPUPairsMap(final File samFile) throws IOException {
391379
IOUtil.assertFileIsReadable(samFile);
392380
final SamReader reader = SamReaderFactory.makeDefault().open(samFile);
393-
final Map<String, Map<String, MatePair>> map = new LinkedHashMap<String, Map<String,MatePair>>();
381+
final Map<String, Map<String, MatePair>> map = new LinkedHashMap<>();
394382

395383
Map<String,MatePair> curFileMap;
396384
for (final SAMRecord record : reader ) {
397385
final String platformUnit = record.getReadGroup().getPlatformUnit();
398386
curFileMap = map.get(platformUnit);
399-
if(curFileMap == null)
400-
{
401-
curFileMap = new LinkedHashMap<String, MatePair>();
387+
if(curFileMap == null) {
388+
curFileMap = new LinkedHashMap<>();
402389
map.put(platformUnit, curFileMap);
403390
}
404391

@@ -430,12 +417,12 @@ private void verifyFastq(final File pair1File, final File pair2File, final File
430417
Assert.assertNotNull(mpair.mate2);
431418
Assert.assertEquals(mpair.mate1.getReadName(),mpair.mate2.getReadName());
432419
final String readName = mpair.mate1.getReadName() ;
433-
Assert.assertTrue(outputHeaderSet1.contains(readName+"/1")); // ensure mate is in correct file
434-
Assert.assertTrue(outputHeaderSet2.contains(readName+"/2"));
420+
Assert.assertTrue(outputHeaderSet1.contains(readName + "/1")); // ensure mate is in correct file
421+
Assert.assertTrue(outputHeaderSet2.contains(readName + "/2"));
435422
}
436423
}
437424

438-
class MatePair {
425+
static class MatePair {
439426
SAMRecord mate1 ;
440427
SAMRecord mate2 ;
441428
void add(final SAMRecord record) {
@@ -463,7 +450,7 @@ private File newTempFastqFile(final String filename) throws IOException {
463450
return newTempFastqFile(filename, ".fastq");
464451
}
465452

466-
@Test(dataProvider = "okFiles")
453+
@Test(dataProvider = "okFiles")
467454
public void testFileCompression(final String samFilename) throws IOException {
468455
final File samFile = new File(TEST_DATA_DIR,samFilename);
469456
final File pair1File = newTempFastqFile("pair1", ".fastq.gz");
@@ -490,6 +477,4 @@ private void verifyFileIsGzCompressed(final File file) throws IOException {
490477
fis.close();
491478
Assert.assertEquals(observedMagicNumber, expectedMagicNumber);
492479
}
493-
494-
495480
}

0 commit comments

Comments
 (0)