Skip to content

Commit 0a26d1d

Browse files
authored
Merge pull request #377 from AdamaJava/qmule_ftub
perf(qmule:fastqtosamwithheaders): minor performance inmprovements
2 parents f90a251 + 2ec11a3 commit 0a26d1d

File tree

2 files changed

+83
-80
lines changed

2 files changed

+83
-80
lines changed

qmule/src/org/qcmg/qmule/FastqToSamWithHeaders.java

+67-78
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
import java.util.Arrays;
1919
import java.util.List;
2020

21+
import static htsjdk.samtools.util.SequenceUtil.getSamReadNameFromFastqHeader;
22+
2123
/**
2224
* Converts a FASTQ file to an unaligned BAM or SAM file.
2325
* <p>
@@ -205,36 +207,6 @@ public static void main(final String[] argv) {
205207

206208
private static final SolexaQualityConverter solexaQualityConverter = SolexaQualityConverter.getSingleton();
207209

208-
/**
209-
* Looks at fastq input(s) and attempts to determine the proper quality format
210-
*
211-
* Closes the reader(s) by side effect
212-
*
213-
* @param reader1 The first fastq input
214-
* @param reader2 The second fastq input, if necessary. To not use this input, set it to null
215-
* @param expectedQuality If provided, will be used for sanity checking. If left null, autodetection will occur
216-
*/
217-
public static FastqQualityFormat determineQualityFormat(final FastqReader reader1, final FastqReader reader2, final FastqQualityFormat expectedQuality) {
218-
final QualityEncodingDetector detector = new QualityEncodingDetector();
219-
220-
if (reader2 == null) {
221-
detector.add(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, reader1);
222-
} else {
223-
detector.add(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, reader1, reader2);
224-
reader2.close();
225-
}
226-
227-
reader1.close();
228-
229-
final FastqQualityFormat qualityFormat = detector.generateBestGuess(QualityEncodingDetector.FileContext.FASTQ, expectedQuality);
230-
if (detector.isDeterminationAmbiguous()) {
231-
LOG.warn("Making ambiguous determination about fastq's quality encoding; more than one format possible based on observed qualities.");
232-
}
233-
LOG.info(String.format("Auto-detected quality format as: %s.", qualityFormat));
234-
235-
return qualityFormat;
236-
}
237-
238210

239211
/**
240212
* Get a list of FASTQs that are sequentially numbered based on the first (base) fastq.
@@ -362,7 +334,7 @@ protected int doUnpaired(final FastqReader freader, final SAMFileWriter writer)
362334
final ProgressLogger progress = new ProgressLogger(LOG);
363335
for ( ; freader.hasNext() ; readCount++) {
364336
final FastqRecord frec = freader.next();
365-
final String frecName = SequenceUtil.getSamReadNameFromFastqHeader(frec.getReadName());
337+
final String frecName = getSamReadNameFromFastqHeader(frec.getReadName());
366338
final SAMRecord srec = createSamRecord(writer.getFileHeader(), frecName , frec, false) ;
367339
srec.setReadPairedFlag(false);
368340
writer.addAlignment(srec);
@@ -380,8 +352,8 @@ protected int doPaired(final FastqReader freader1, final FastqReader freader2, f
380352
final FastqRecord frec1 = freader1.next();
381353
final FastqRecord frec2 = freader2.next();
382354

383-
final String frec1Name = SequenceUtil.getSamReadNameFromFastqHeader(frec1.getReadName());
384-
final String frec2Name = SequenceUtil.getSamReadNameFromFastqHeader(frec2.getReadName());
355+
final String frec1Name = getSamReadNameFromFastqHeader(frec1.getReadName());
356+
final String frec2Name = getSamReadNameFromFastqHeader(frec2.getReadName());
385357
final String baseName = getBaseName(frec1Name, frec2Name, freader1, freader2);
386358

387359
final SAMRecord srec1 = createSamRecord(writer.getFileHeader(), baseName, frec1, true) ;
@@ -415,8 +387,9 @@ public static SAMRecord createSamRecord(final SAMFileHeader header, final String
415387
srec.setReadString(frec.getReadString());
416388
srec.setReadUnmappedFlag(true);
417389
srec.setAttribute(ReservedTagConstants.READ_GROUP_ID, readGroupName);
418-
String additionalHeader = frec.getReadName().replace(baseName, "");
419-
if (additionalHeader.length() > 0) {
390+
// Optimize additional header handling
391+
String additionalHeader = frec.getReadName().substring(baseName.length());
392+
if ( ! additionalHeader.isEmpty()) {
420393
/*
421394
If this contains the trimmed bases flag (TB:) then put that in a separate tag
422395
*/
@@ -430,13 +403,18 @@ If this contains the trimmed bases flag (TB:) then put that in a separate tag
430403
}
431404
}
432405
}
406+
407+
// Convert and validate quality scores
433408
final byte[] quals = StringUtil.stringToBytes(frec.getBaseQualityString());
434409
convertQuality(quals, fqFormat);
435-
for (final byte qual : quals) {
436-
final int uQual = qual & 0xff;
437-
if (uQual < minQ || uQual > maxQ) {
438-
throw new PicardException("Base quality " + uQual + " is not in the range " + minQ + ".." +
439-
maxQ + " for read " + frec.getReadName());
410+
411+
if (fqFormat != FastqQualityFormat.Standard) {
412+
for (final byte qual : quals) {
413+
final int uQual = qual & 0xff;
414+
if (uQual < minQ || uQual > maxQ) {
415+
throw new PicardException("Base quality " + uQual + " is not in the range " + minQ + ".." +
416+
maxQ + " for read " + frec.getReadName());
417+
}
440418
}
441419
}
442420
srec.setBaseQualities(quals);
@@ -485,7 +463,8 @@ static void convertQuality(final byte[] quals, final FastqQualityFormat version)
485463
}
486464
}
487465

488-
/** Returns read baseName and asserts correct pair read name format:
466+
/**
467+
* Returns the read baseName and asserts correct pair read name format:
489468
* <ul>
490469
* <li> Paired reads must either have the exact same read names or they must contain at least one "/"
491470
* <li> and the First pair read name must end with "/1" and second pair read name ends with "/2"
@@ -494,54 +473,64 @@ static void convertQuality(final byte[] quals, final FastqQualityFormat version)
494473
* </ul>
495474
*/
496475
String getBaseName(final String readName1, final String readName2, final FastqReader freader1, final FastqReader freader2) {
497-
String [] toks = getReadNameTokens(readName1, 1, freader1);
498-
final String baseName1 = toks[0] ;
499-
final String num1 = toks[1] ;
500476

501-
toks = getReadNameTokens(readName2, 2, freader2);
502-
final String baseName2 = toks[0] ;
503-
final String num2 = toks[1];
504-
505-
if (!baseName1.equals(baseName2)) {
506-
throw new PicardException(String.format("In paired mode, read name 1 (%s) does not match read name 2 (%s)", baseName1,baseName2));
507-
}
508-
509-
final boolean num1Blank = StringUtil.isBlank(num1);
510-
final boolean num2Blank = StringUtil.isBlank(num2);
511-
if (num1Blank || num2Blank) {
512-
if(!num1Blank) throw new PicardException(error(freader1,"Pair 1 number is missing (" + readName1 + "). Both pair numbers must be present or neither.")); //num1 != blank and num2 == blank
513-
else if(!num2Blank) throw new PicardException(error(freader2, "Pair 2 number is missing (" + readName2 + "). Both pair numbers must be present or neither.")); //num1 == blank and num =2 != blank
514-
} else {
515-
if (!num1.equals("1")) throw new PicardException(error(freader1,"Pair 1 number must be 1 (" + readName1 + ")"));
516-
if (!num2.equals("2")) throw new PicardException(error(freader2,"Pair 2 number must be 2 (" + readName2 + ")"));
477+
/*
478+
readName1 and readName2 will not end in /1 or /2 as those have been trimmed by the time we are here.
479+
if they are the same, return, otherwise, do the check
480+
*/
481+
if ( ! readName1.isEmpty() && readName1.equals(readName2)) {
482+
return readName1;
517483
}
518484

519-
return baseName1 ;
520-
}
485+
// Retrieve base name and pair number in one operation to avoid multiple traversals
486+
String[] toks1 = breakReadName(readName1, freader1);
487+
String[] toks2 = breakReadName(readName2, freader2);
521488

522-
/** Breaks up read name into baseName and number separated by the last / */
523-
private String [] getReadNameTokens(final String readName, final int pairNum, final FastqReader freader) {
524-
if(readName.equals("")) throw new PicardException(error(freader,"Pair read name " + pairNum + " cannot be empty: "+readName));
489+
if (!toks1[0].equals(toks2[0])) {
490+
throw new PicardException(String.format("In paired mode, read name 1 (%s) does not match read name 2 (%s)", readName1, readName2));
491+
}
525492

526-
final int idx = readName.lastIndexOf('/');
527-
final String[] result = new String[2];
493+
// Check for the most common scenario and exit early if matched
494+
if ("1".equals(toks1[1]) && "2".equals(toks2[1])) {
495+
return toks1[0];
496+
}
528497

529-
if (idx == -1) {
530-
result[0] = readName;
531-
result[1] = null;
532-
} else {
533-
result[1] = readName.substring(idx + 1); // should be a 1 or 2
498+
// Handle the case where either both are null or one is null and the other is not
499+
final String num1 = toks1[1];
500+
final String num2 = toks2[1];
534501

535-
if(!result[1].equals("1") && !result[1].equals("2")) { //if not a 1 or 2 then names must be identical
536-
result[0] = readName;
537-
result[1] = null;
502+
if ((num1 == null) != (num2 == null)) {
503+
if (num1 != null) {
504+
throw new PicardException(error(freader1, "Pair 1 number is missing (" + readName1 + "). Both pair numbers must be present or neither."));
505+
} else {
506+
throw new PicardException(error(freader2, "Pair 2 number is missing (" + readName2 + "). Both pair numbers must be present or neither."));
538507
}
539-
else {
540-
result[0] = readName.substring(0, idx); // baseName
508+
} else if (num1 != null) {
509+
if (!"1".equals(num1)) {
510+
throw new PicardException(error(freader1, "Pair 1 number must be 1 (" + readName1 + ")"));
541511
}
512+
throw new PicardException(error(freader2, "Pair 2 number must be 2 (" + readName2 + ")"));
542513
}
543514

544-
return result ;
515+
return toks1[0];
516+
}
517+
518+
/**
519+
* Breaks read name into baseName and number in one traversal.
520+
* Returns an array containing the baseName (index 0) and number (index 1).
521+
*/
522+
private String[] breakReadName(final String readName, final FastqReader freader) {
523+
if (readName.isEmpty()) {
524+
throw new PicardException(error(freader, "Pair read name cannot be empty: " + readName));
525+
}
526+
final int idx = readName.lastIndexOf('/');
527+
final String[] result = { readName, null };
528+
if (idx != -1) {
529+
final String num = readName.substring(idx + 1);
530+
result[1] = ("1".equals(num) || "2".equals(num)) ? num : null;
531+
result[0] = result[1] != null ? readName.substring(0, idx) : readName;
532+
}
533+
return result;
545534
}
546535

547536
/** Little utility to give error messages corresponding to line numbers in the input files. */

qmule/test/org/qcmg/qmule/FastqToSamWithHeadersTest.java

+16-2
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import java.util.List;
2222

2323
import static htsjdk.samtools.SAMUtils.MAX_PHRED_SCORE;
24+
import static htsjdk.samtools.util.SequenceUtil.getSamReadNameFromFastqHeader;
2425
import static org.junit.Assert.*;
2526

2627
public class FastqToSamWithHeadersTest {
@@ -116,6 +117,19 @@ public void readPairNameOk() {
116117
assertEquals("/aa", fastqToSam.getBaseName("/aa", "/aa" , freader1, freader2));
117118
assertEquals("aa/", fastqToSam.getBaseName("aa/", "aa/" , freader1, freader2));
118119
assertEquals("ab/c", fastqToSam.getBaseName("ab/c", "ab/c", freader1, freader2));
120+
assertEquals("@ST-E00104:647:HFNMWALXX:4:1101:12357:1063", fastqToSam.getBaseName(getSamReadNameFromFastqHeader("@ST-E00104:647:HFNMWALXX:4:1101:12357:1063 1:N:0:NGGCTATG TB:n+#"), getSamReadNameFromFastqHeader("@ST-E00104:647:HFNMWALXX:4:1101:12357:1063 2:N:0:NGGCTATG TB:nAGA+#--A"), freader1, freader2));
121+
assertEquals("@H3CK2CCXY:2:2222:2121145:0", fastqToSam.getBaseName("@H3CK2CCXY:2:2222:2121145:0", "@H3CK2CCXY:2:2222:2121145:0", freader1, freader2));
122+
}
123+
124+
@Test
125+
public void getSamReadFromFastqHeader() {
126+
assertEquals("@ST-E00104:647:HFNMWALXX:4:1101:12357:1063", getSamReadNameFromFastqHeader("@ST-E00104:647:HFNMWALXX:4:1101:12357:1063 1:N:0:NGGCTATG TB:n+#"));
127+
assertEquals("@H3CK2CCXY:2:2222:2121145:0", getSamReadNameFromFastqHeader("@H3CK2CCXY:2:2222:2121145:0"));
128+
assertEquals("@V350038332L1C001R00400000275", getSamReadNameFromFastqHeader("@V350038332L1C001R00400000275/1"));
129+
assertEquals("@V350038332L1C001R00400000275", getSamReadNameFromFastqHeader("@V350038332L1C001R00400000275/2"));
130+
assertEquals("@V350038332L1C001R00400000275/3", getSamReadNameFromFastqHeader("@V350038332L1C001R00400000275/3"));
131+
assertEquals("@V350038332L1C001R00400000275/3", getSamReadNameFromFastqHeader("@V350038332L1C001R00400000275/3/2/1"));
132+
assertEquals("@V350038332L1C001R00400000275", getSamReadNameFromFastqHeader("@V350038332L1C001R00400000275/2/1"));
119133
}
120134

121135
@Test
@@ -141,11 +155,11 @@ public void readPairNamesBad() {
141155
Assert.fail("Should have thrown an exception");
142156
} catch (PicardException ignored) {}
143157
try {
144-
fastqToSam.getBaseName("aa/1", "aa/1" , freader1, freader2);
158+
fastqToSam.getBaseName("aa/4", "aa/5" , freader1, freader2);
145159
Assert.fail("Should have thrown an exception");
146160
} catch (PicardException ignored) {}
147161
try {
148-
fastqToSam.getBaseName("aa/2", "aa/2" , freader1, freader2);
162+
fastqToSam.getBaseName("aa/2", "aa/1" , freader1, freader2);
149163
Assert.fail("Should have thrown an exception");
150164
} catch (PicardException ignored) {}
151165
}

0 commit comments

Comments
 (0)