Skip to content

Commit 76ad6ec

Browse files
committed
This commit of samblaster 0.1.21 includes the following:
1) Fixed bug that resulted in intermittent incorrect detection of split-read mappings when using --acceptDupMarks option. 2) Merged in pull request from Aakrosh Ratan for OSX support, with additional minor mods to make Linux support more portable. 3) Updated SAMBLASTER_Supplemental.pdf to include final samblaster citation, and fixed recently broken link to the file in README.
1 parent 0e16f30 commit 76ad6ec

File tree

3 files changed

+40
-39
lines changed

3 files changed

+40
-39
lines changed

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,14 @@
66
**Please cite:**
77
[Faust, G.G. and Hall, I.M., “*SAMBLASTER*: fast duplicate marking and structural variant read extraction,” *Bioinformatics* Sept. 2014; **30**(17): 2503-2505.](http://bioinformatics.oxfordjournals.org/content/30/17/2503)
88

9-
**Also see:** [SAMBLASTER_Supplemental.pdf] (http://faculty.virginia.edu/irahall/support/samblaster/SAMBLASTER_Supplemental.pdf) for additonal discussion and statistics about the duplicates marked by *samblaster* vs. *Picard* using the NA12878 sample dataset.
9+
**Also see:** [SAMBLASTER_Supplemental.pdf] (https://www.dropbox.com/s/s7vvdtf2gmytvay/SAMBLASTER_Supplemental.pdf?dl=0) for additonal discussion and statistics about the duplicates marked by *samblaster* vs. *Picard* using the NA12878 sample dataset.
1010
Click the preceeding link or download the file from this repository.
1111

1212
---
1313

14-
**Current version:** 0.1.20
14+
**Current version:** 0.1.21
1515

16-
Current support for Linux only.
16+
Support for Linux and OSX.
1717

1818
##Summary
1919
*samblaster* is a fast and flexible program for marking duplicates in __read-id grouped<sup>1</sup>__ paired-end SAM files.

SAMBLASTER_Supplemental.pdf

321 Bytes
Binary file not shown.

samblaster.cpp

Lines changed: 37 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -18,29 +18,32 @@
1818
1919
*/
2020

21+
// This define is needed for portable definition of PRIu64
22+
#define __STDC_FORMAT_MACROS
23+
2124
#include <stdlib.h>
22-
#include <stdint.h>
25+
#include <inttypes.h>
2326
#include <stdio.h>
2427
#include <string.h>
2528
#include <errno.h>
26-
#include <inttypes.h> // support for printing UINT64
2729
#include <map>
2830
#include "sbhash.h"
2931

30-
#ifndef _GNU_SOURCE
31-
// mempcpy is a GNU extension and not available everywhere.
32-
void *mempcpy(void *dest, const void *src, size_t n)
33-
{
34-
return (char*) memcpy(dest, src, n) + n;
35-
}
36-
#endif
37-
3832
// Rename common integer types.
3933
// I like having these shorter name.
4034
typedef uint64_t UINT64;
4135
typedef uint32_t UINT32;
4236

4337
// Some helper routines.
38+
39+
// mempcpy is a GNU extension and not available everywhere.
40+
#ifndef _GNU_SOURCE
41+
inline void *mempcpy(void *dest, const void *src, size_t n)
42+
{
43+
return (char*) memcpy(dest, src, n) + n;
44+
}
45+
#endif
46+
4447
inline bool streq(char * s1, const char * s2) __attribute__((always_inline));
4548
inline bool streq(char * s1, const char * s2)
4649
{
@@ -87,7 +90,7 @@ void fprintTimeSeconds (FILE * out, double seconds, int precision)
8790
seconds -= minutes * 60;
8891
fprintf(out, "%dM", minutes);
8992
}
90-
if (hours + minutes > 0)
93+
if (hours + minutes > 0)
9194
{
9295
fprintf(out, "%.0fS", seconds);
9396
fprintf(out, "(%.*fS)", precision, totalseconds);
@@ -107,7 +110,7 @@ inline UINT64 diffTVs (struct timeval * startTV, struct timeval * endTV)
107110
#include <sys/times.h>
108111
#include <sys/resource.h>
109112
#include <time.h>
110-
#endif // TIMING
113+
#endif // TIMING
111114

112115
///////////////////////////////////////////////////////////////////////////////
113116
// Split Lines
@@ -333,7 +336,7 @@ splitLine_t * readLine(FILE * input)
333336
inline void outputString(char * str, FILE * output)
334337
{
335338
// Do the error checking here so we don't have to do it elsewhere.
336-
if (fputs(str, output) < 0)
339+
if (fputs(str, output) < 0)
337340
{
338341
fatalError("samblaster: Unable to write to output file.\n");
339342
}
@@ -395,7 +398,7 @@ inline bool isFirstRead(splitLine_t * line) { return checkFlag(line, FIRST_SEG);
395398

396399
inline bool isSecondRead(splitLine_t * line) { return checkFlag(line, SECOND_SEG); }
397400

398-
inline bool isSecondaryAlignment(splitLine_t * line)
401+
inline bool isSecondaryAlignment(splitLine_t * line)
399402
{ return (checkFlag(line, 0x100) || checkFlag(line, 0x800)); }
400403

401404
inline bool isDuplicate(splitLine_t * line) { return checkFlag(line, 0x400); }
@@ -474,7 +477,7 @@ struct state_struct
474477
char * outputFileName;
475478
FILE * outputFile;
476479
FILE * discordantFile;
477-
char * discordantFileName;
480+
char * discordantFileName;
478481
FILE * splitterFile;
479482
char * splitterFileName;
480483
FILE * unmappedClippedFile;
@@ -535,7 +538,7 @@ void deleteState(state_t * s)
535538
{
536539
free(s->splitterArray);
537540
if (s->sigs != NULL) delete[] s->sigs;
538-
for (seqMap_t::iterator iter = s->seqs.begin(); iter != s->seqs.end(); ++iter)
541+
for (seqMap_t::iterator iter = s->seqs.begin(); iter != s->seqs.end(); ++iter)
539542
{
540543
free((char *)(iter->first));
541544
}
@@ -562,7 +565,7 @@ inline int calcSigArrOff(splitLine_t * first, splitLine_t * second, seqMap_t & s
562565
int s2 = (second->seqNum * 2) + (isReverseStrand(second) ? 1 : 0);
563566
int retval = (s1 * seqs.size() * 2) + s2;
564567
#ifdef DEBUG
565-
fprintf(stderr, "1st %d %d -> %d 2nd %d %d -> %d count %lu result %d\n",
568+
fprintf(stderr, "1st %d %d -> %d 2nd %d %d -> %d count %lu result %d\n",
566569
first->seqNum, isReverseStrand(first), s1, second->seqNum, isReverseStrand(second), s2, seqs.size(), retval);
567570
#endif
568571
return retval;
@@ -632,7 +635,7 @@ void calcOffsets(splitLine_t * line)
632635
int opLen = parseNextInt(&cigar);
633636
char opCode = parseNextOpCode(&cigar);
634637
if (opCode == 'M' || opCode == '=' || opCode == 'X')
635-
{
638+
{
636639
line->raLen += opLen;
637640
line->qaLen += opLen;
638641
}
@@ -744,7 +747,7 @@ int fillSplitterArray(splitLine_t * block, state_t * state, int mask, bool flagV
744747
void markDupsDiscordants(splitLine_t * block, state_t * state)
745748
{
746749
splitLine_t * first = NULL;
747-
splitLine_t * second = NULL;
750+
splitLine_t * second = NULL;
748751
int count = 0;
749752
for (splitLine_t * line = block; line != NULL; line = line->next)
750753
{
@@ -905,7 +908,7 @@ int fillSplitterArray(splitLine_t * block, state_t * state, int mask, bool flagV
905908
if (count >= state->splitterArrayMaxSize)
906909
{
907910
state->splitterArrayMaxSize *= 2;
908-
state->splitterArray = (splitLine_t **)(realloc(state->splitterArray,
911+
state->splitterArray = (splitLine_t **)(realloc(state->splitterArray,
909912
state->splitterArrayMaxSize * sizeof(splitLine_t *)));
910913
}
911914
state->splitterArray[count] = line;
@@ -957,11 +960,6 @@ void markSplitterUnmappedClipped(splitLine_t * block, state_t * state, int mask,
957960
// Make sure the primary is mapped!
958961
if (isUnmapped(line)) return;
959962
}
960-
else
961-
{
962-
// We have a secondary, which needs a sequence number.
963-
line->seqNum = getSeqNum(line, RNAME, state);
964-
}
965963
calcOffsets(line);
966964
}
967965

@@ -985,7 +983,9 @@ void markSplitterUnmappedClipped(splitLine_t * block, state_t * state, int mask,
985983

986984
// Now check for the deserts and diagonal difference.
987985
// If they are on different chroms or strands, they pass without the other checks.
988-
if ((left->seqNum == right->seqNum) && (isReverseStrand(left) == isReverseStrand(right)))
986+
// Since we only care if the sequences are the same, we don't need seqNums.
987+
// Instead just compare the strings!
988+
if (streq(left->fields[RNAME], right->fields[RNAME]) && (isReverseStrand(left) == isReverseStrand(right)))
989989
{
990990
// The start and end diags might be different if there is an indel in the alignments.
991991
// So, we use the end on the left, and the start on the right.
@@ -1025,9 +1025,9 @@ void markSplitterUnmappedClipped(splitLine_t * block, state_t * state, int mask,
10251025
void writeUnmappedClipped(splitLine_t * line, state_t * state)
10261026
{
10271027
// Check if we are outputting fasta or fastq.
1028-
if (state->unmappedFastq == -1)
1028+
if (state->unmappedFastq == -1)
10291029
state->unmappedFastq = (streq(line->fields[QUAL], "*") ? 0 : 1);
1030-
1030+
10311031
// Print the first line.
10321032
char firstChar = (state->unmappedFastq) ? '@' : '>';
10331033
if (isPaired(line)) fprintf(state->unmappedClippedFile, "%c%s_%d\n", firstChar, line->fields[QNAME], (isFirstRead(line) ? 1 : 2));
@@ -1099,7 +1099,7 @@ void printVersionString()
10991099

11001100
void printUsageString()
11011101
{
1102-
const char* useString =
1102+
const char* useString =
11031103
"Author: Greg Faust ([email protected])\n"
11041104
"Tool to mark duplicates and optionally output split reads and/or discordant pairs.\n"
11051105
"Input sam file must contain paired end data, contain sequence header and be sorted by read ids.\n"
@@ -1130,7 +1130,7 @@ void printUsageString()
11301130
" --minNonOverlap INT Minimum non-overlaping base pairs between two alignments for a read to be included in splitter file. [20]\n"
11311131
" --minClipSize INT Minumum number of bases a mapped read must be clipped to be included in unmapped file. [20]\n"
11321132
"-q --quiet Output fewer statistics.\n";
1133-
1133+
11341134
printVersionString();
11351135
fprintf(stderr, "%s", useString);
11361136
}
@@ -1322,7 +1322,8 @@ int main (int argc, char *argv[])
13221322
{
13231323
if (strncmp(line->fields[i], "SN:", 3) == 0)
13241324
{
1325-
state->seqs[strdup(line->fields[i]+3)] = count;
1325+
// Unless we are marking dups, we don't need to use sequence numbers.
1326+
if (!state->acceptDups) state->seqs[strdup(line->fields[i]+3)] = count;
13261327
count += 1;
13271328
}
13281329
}
@@ -1335,7 +1336,7 @@ int main (int argc, char *argv[])
13351336
writeLine(line, state->splitterFile);
13361337
disposeSplitLines(line);
13371338
}
1338-
1339+
13391340
// Make sure we have a header.
13401341
if (count == 1 && !state->acceptDups)
13411342
{
@@ -1393,16 +1394,16 @@ int main (int argc, char *argv[])
13931394
if (state->unmappedClippedFile != NULL)
13941395
fprintf(stderr, "samblaster: Output %"PRIu64" unmapped/clipped reads to %s\n", unmapClipCount, state->unmappedClippedFileName);
13951396
}
1396-
1397+
13971398
// Output stats.
13981399
if (state->removeDups)
13991400
{
1400-
fprintf(stderr, "samblaster: Removed %"PRIu64" of %"PRIu64" (%4.2f%%) read ids as duplicates",
1401+
fprintf(stderr, "samblaster: Removed %"PRIu64" of %"PRIu64" (%4.2f%%) read ids as duplicates",
14011402
dupCount, idCount, ((double)100)*dupCount/idCount);
14021403
}
14031404
else
14041405
{
1405-
fprintf(stderr, "samblaster: Marked %"PRIu64" of %"PRIu64" (%4.2f%%) read ids as duplicates",
1406+
fprintf(stderr, "samblaster: Marked %"PRIu64" of %"PRIu64" (%4.2f%%) read ids as duplicates",
14061407
dupCount, idCount, ((double)100)*dupCount/idCount);
14071408
}
14081409
if ((TIMING == 0) || state->quiet)
@@ -1414,7 +1415,7 @@ int main (int argc, char *argv[])
14141415
#if TIMING
14151416
getrusage(RUSAGE_SELF, &usagebuf);
14161417
time_t endTime = time(NULL);
1417-
struct timeval endRUTime = usagebuf.ru_utime;
1418+
struct timeval endRUTime = usagebuf.ru_utime;
14181419
fprintf(stderr, " using %luk memory in ", usagebuf.ru_maxrss);
14191420
fprintTimeMicroSeconds(stderr, diffTVs(&startRUTime, &endRUTime), 3);
14201421
fprintf(stderr, " CPU seconds and ");

0 commit comments

Comments
 (0)