1818
1919*/
2020
21+ // This define is needed for portable definition of PRIu64
22+ #define __STDC_FORMAT_MACROS
23+
2124#include < stdlib.h>
22- #include < stdint .h>
25+ #include < inttypes .h>
2326#include < stdio.h>
2427#include < string.h>
2528#include < errno.h>
26- #include < inttypes.h> // support for printing UINT64
2729#include < map>
2830#include " sbhash.h"
2931
30- #ifndef _GNU_SOURCE
31- // mempcpy is a GNU extension and not available everywhere.
32- void *mempcpy (void *dest, const void *src, size_t n)
33- {
34- return (char *) memcpy (dest, src, n) + n;
35- }
36- #endif
37-
3832// Rename common integer types.
3933// I like having these shorter name.
4034typedef uint64_t UINT64;
4135typedef uint32_t UINT32;
4236
4337// Some helper routines.
38+
39+ // mempcpy is a GNU extension and not available everywhere.
40+ #ifndef _GNU_SOURCE
41+ inline void *mempcpy (void *dest, const void *src, size_t n)
42+ {
43+ return (char *) memcpy (dest, src, n) + n;
44+ }
45+ #endif
46+
4447inline bool streq (char * s1, const char * s2) __attribute__((always_inline));
4548inline bool streq (char * s1, const char * s2)
4649{
@@ -87,7 +90,7 @@ void fprintTimeSeconds (FILE * out, double seconds, int precision)
8790 seconds -= minutes * 60 ;
8891 fprintf (out, " %dM" , minutes);
8992 }
90- if (hours + minutes > 0 )
93+ if (hours + minutes > 0 )
9194 {
9295 fprintf (out, " %.0fS" , seconds);
9396 fprintf (out, " (%.*fS)" , precision, totalseconds);
@@ -107,7 +110,7 @@ inline UINT64 diffTVs (struct timeval * startTV, struct timeval * endTV)
107110#include < sys/times.h>
108111#include < sys/resource.h>
109112#include < time.h>
110- #endif // TIMING
113+ #endif // TIMING
111114
112115// /////////////////////////////////////////////////////////////////////////////
113116// Split Lines
@@ -333,7 +336,7 @@ splitLine_t * readLine(FILE * input)
333336inline void outputString (char * str, FILE * output)
334337{
335338 // Do the error checking here so we don't have to do it elsewhere.
336- if (fputs (str, output) < 0 )
339+ if (fputs (str, output) < 0 )
337340 {
338341 fatalError (" samblaster: Unable to write to output file.\n " );
339342 }
@@ -395,7 +398,7 @@ inline bool isFirstRead(splitLine_t * line) { return checkFlag(line, FIRST_SEG);
395398
396399inline bool isSecondRead (splitLine_t * line) { return checkFlag (line, SECOND_SEG); }
397400
398- inline bool isSecondaryAlignment (splitLine_t * line)
401+ inline bool isSecondaryAlignment (splitLine_t * line)
399402{ return (checkFlag (line, 0x100 ) || checkFlag (line, 0x800 )); }
400403
401404inline bool isDuplicate (splitLine_t * line) { return checkFlag (line, 0x400 ); }
@@ -474,7 +477,7 @@ struct state_struct
474477 char * outputFileName;
475478 FILE * outputFile;
476479 FILE * discordantFile;
477- char * discordantFileName;
480+ char * discordantFileName;
478481 FILE * splitterFile;
479482 char * splitterFileName;
480483 FILE * unmappedClippedFile;
@@ -535,7 +538,7 @@ void deleteState(state_t * s)
535538{
536539 free (s->splitterArray );
537540 if (s->sigs != NULL ) delete[] s->sigs ;
538- for (seqMap_t::iterator iter = s->seqs .begin (); iter != s->seqs .end (); ++iter)
541+ for (seqMap_t::iterator iter = s->seqs .begin (); iter != s->seqs .end (); ++iter)
539542 {
540543 free ((char *)(iter->first ));
541544 }
@@ -562,7 +565,7 @@ inline int calcSigArrOff(splitLine_t * first, splitLine_t * second, seqMap_t & s
562565 int s2 = (second->seqNum * 2 ) + (isReverseStrand (second) ? 1 : 0 );
563566 int retval = (s1 * seqs.size () * 2 ) + s2;
564567#ifdef DEBUG
565- fprintf (stderr, " 1st %d %d -> %d 2nd %d %d -> %d count %lu result %d\n " ,
568+ fprintf (stderr, " 1st %d %d -> %d 2nd %d %d -> %d count %lu result %d\n " ,
566569 first->seqNum , isReverseStrand (first), s1, second->seqNum , isReverseStrand (second), s2, seqs.size (), retval);
567570#endif
568571 return retval;
@@ -632,7 +635,7 @@ void calcOffsets(splitLine_t * line)
632635 int opLen = parseNextInt (&cigar);
633636 char opCode = parseNextOpCode (&cigar);
634637 if (opCode == ' M' || opCode == ' =' || opCode == ' X' )
635- {
638+ {
636639 line->raLen += opLen;
637640 line->qaLen += opLen;
638641 }
@@ -744,7 +747,7 @@ int fillSplitterArray(splitLine_t * block, state_t * state, int mask, bool flagV
744747void markDupsDiscordants (splitLine_t * block, state_t * state)
745748{
746749 splitLine_t * first = NULL ;
747- splitLine_t * second = NULL ;
750+ splitLine_t * second = NULL ;
748751 int count = 0 ;
749752 for (splitLine_t * line = block; line != NULL ; line = line->next )
750753 {
@@ -905,7 +908,7 @@ int fillSplitterArray(splitLine_t * block, state_t * state, int mask, bool flagV
905908 if (count >= state->splitterArrayMaxSize )
906909 {
907910 state->splitterArrayMaxSize *= 2 ;
908- state->splitterArray = (splitLine_t **)(realloc (state->splitterArray ,
911+ state->splitterArray = (splitLine_t **)(realloc (state->splitterArray ,
909912 state->splitterArrayMaxSize * sizeof (splitLine_t *)));
910913 }
911914 state->splitterArray [count] = line;
@@ -957,11 +960,6 @@ void markSplitterUnmappedClipped(splitLine_t * block, state_t * state, int mask,
957960 // Make sure the primary is mapped!
958961 if (isUnmapped (line)) return ;
959962 }
960- else
961- {
962- // We have a secondary, which needs a sequence number.
963- line->seqNum = getSeqNum (line, RNAME, state);
964- }
965963 calcOffsets (line);
966964 }
967965
@@ -985,7 +983,9 @@ void markSplitterUnmappedClipped(splitLine_t * block, state_t * state, int mask,
985983
986984 // Now check for the deserts and diagonal difference.
987985 // If they are on different chroms or strands, they pass without the other checks.
988- if ((left->seqNum == right->seqNum ) && (isReverseStrand (left) == isReverseStrand (right)))
986+ // Since we only care if the sequences are the same, we don't need seqNums.
987+ // Instead just compare the strings!
988+ if (streq (left->fields [RNAME], right->fields [RNAME]) && (isReverseStrand (left) == isReverseStrand (right)))
989989 {
990990 // The start and end diags might be different if there is an indel in the alignments.
991991 // So, we use the end on the left, and the start on the right.
@@ -1025,9 +1025,9 @@ void markSplitterUnmappedClipped(splitLine_t * block, state_t * state, int mask,
10251025void writeUnmappedClipped (splitLine_t * line, state_t * state)
10261026{
10271027 // Check if we are outputting fasta or fastq.
1028- if (state->unmappedFastq == -1 )
1028+ if (state->unmappedFastq == -1 )
10291029 state->unmappedFastq = (streq (line->fields [QUAL], " *" ) ? 0 : 1 );
1030-
1030+
10311031 // Print the first line.
10321032 char firstChar = (state->unmappedFastq ) ? ' @' : ' >' ;
10331033 if (isPaired (line)) fprintf (state->unmappedClippedFile , " %c%s_%d\n " , firstChar, line->fields [QNAME], (isFirstRead (line) ? 1 : 2 ));
@@ -1099,7 +1099,7 @@ void printVersionString()
10991099
11001100void printUsageString ()
11011101{
1102- const char * useString =
1102+ const char * useString =
11031103 " Author: Greg Faust ([email protected] )\n " 11041104 " Tool to mark duplicates and optionally output split reads and/or discordant pairs.\n "
11051105 " Input sam file must contain paired end data, contain sequence header and be sorted by read ids.\n "
@@ -1130,7 +1130,7 @@ void printUsageString()
11301130 " --minNonOverlap INT Minimum non-overlaping base pairs between two alignments for a read to be included in splitter file. [20]\n "
11311131 " --minClipSize INT Minumum number of bases a mapped read must be clipped to be included in unmapped file. [20]\n "
11321132 " -q --quiet Output fewer statistics.\n " ;
1133-
1133+
11341134 printVersionString ();
11351135 fprintf (stderr, " %s" , useString);
11361136}
@@ -1322,7 +1322,8 @@ int main (int argc, char *argv[])
13221322 {
13231323 if (strncmp (line->fields [i], " SN:" , 3 ) == 0 )
13241324 {
1325- state->seqs [strdup (line->fields [i]+3 )] = count;
1325+ // Unless we are marking dups, we don't need to use sequence numbers.
1326+ if (!state->acceptDups ) state->seqs [strdup (line->fields [i]+3 )] = count;
13261327 count += 1 ;
13271328 }
13281329 }
@@ -1335,7 +1336,7 @@ int main (int argc, char *argv[])
13351336 writeLine (line, state->splitterFile );
13361337 disposeSplitLines (line);
13371338 }
1338-
1339+
13391340 // Make sure we have a header.
13401341 if (count == 1 && !state->acceptDups )
13411342 {
@@ -1393,16 +1394,16 @@ int main (int argc, char *argv[])
13931394 if (state->unmappedClippedFile != NULL )
13941395 fprintf (stderr, " samblaster: Output %" PRIu64" unmapped/clipped reads to %s\n " , unmapClipCount, state->unmappedClippedFileName );
13951396 }
1396-
1397+
13971398 // Output stats.
13981399 if (state->removeDups )
13991400 {
1400- fprintf (stderr, " samblaster: Removed %" PRIu64" of %" PRIu64" (%4.2f%%) read ids as duplicates" ,
1401+ fprintf (stderr, " samblaster: Removed %" PRIu64" of %" PRIu64" (%4.2f%%) read ids as duplicates" ,
14011402 dupCount, idCount, ((double )100 )*dupCount/idCount);
14021403 }
14031404 else
14041405 {
1405- fprintf (stderr, " samblaster: Marked %" PRIu64" of %" PRIu64" (%4.2f%%) read ids as duplicates" ,
1406+ fprintf (stderr, " samblaster: Marked %" PRIu64" of %" PRIu64" (%4.2f%%) read ids as duplicates" ,
14061407 dupCount, idCount, ((double )100 )*dupCount/idCount);
14071408 }
14081409 if ((TIMING == 0 ) || state->quiet )
@@ -1414,7 +1415,7 @@ int main (int argc, char *argv[])
14141415#if TIMING
14151416 getrusage (RUSAGE_SELF, &usagebuf);
14161417 time_t endTime = time (NULL );
1417- struct timeval endRUTime = usagebuf.ru_utime ;
1418+ struct timeval endRUTime = usagebuf.ru_utime ;
14181419 fprintf (stderr, " using %luk memory in " , usagebuf.ru_maxrss );
14191420 fprintTimeMicroSeconds (stderr, diffTVs (&startRUTime, &endRUTime), 3 );
14201421 fprintf (stderr, " CPU seconds and " );
0 commit comments