@@ -287,6 +287,26 @@ void changeFieldSplitLine(splitLine_t * line, int fnum, char * newValue)
287287 memcpy (fp, newValue, newLen);
288288}
289289
290+ void addTag (splitLine_t * line, const char * header, const char * val)
291+ {
292+ int hl = strlen (header);
293+ int vl = strlen (val);
294+ // Make sure everything will fit.
295+ char * ptr = line->buffer + line->bufLen - 1 ;
296+ line->bufLen += hl + vl;
297+ if ((size_t )line->bufLen > line->maxBufLen )
298+ {
299+ fatalError (" samblaster: New buffer length exceeds maximum while adding Mate tags.\n " );
300+ }
301+ // Copy over the header and the value.
302+ ptr = (char *)mempcpy (ptr, header, hl);
303+ ptr = (char *)mempcpy (ptr, val, vl);
304+ // Add the null terminator for the field, and for the record.
305+ ptr[0 ] = 0 ;
306+ ptr[1 ] = 0 ;
307+ }
308+
309+
290310// Read a line from the file and split it.
291311splitLine_t * readLine (FILE * input)
292312{
@@ -335,12 +355,16 @@ inline void writeLine(splitLine_t * line, FILE * output)
335355#define QUAL 10
336356#define TAGS 11
337357
358+
338359// Define SAM flag accessors.
360+ #define MULTI_SEGS 0x1
361+ #define FIRST_SEG 0x40
362+ #define SECOND_SEG 0x80
339363inline bool checkFlag (splitLine_t * line, int bits) { return ((line->flag & bits) != 0 ); }
340364
341365inline void setFlag (splitLine_t * line, int bits) { line->flag |= bits; }
342366
343- inline bool isPaired (splitLine_t * line) { return checkFlag (line, 0x1 ); }
367+ inline bool isPaired (splitLine_t * line) { return checkFlag (line, MULTI_SEGS ); }
344368
345369inline bool isConcordant (splitLine_t * line) { return checkFlag (line, 0x2 ); }
346370
@@ -358,9 +382,9 @@ inline bool isReverseStrand(splitLine_t * line) { return checkFlag(line, 0x10);
358382
359383inline bool isForwardStrand (splitLine_t * line) { return !isReverseStrand (line); }
360384
361- inline bool isFirstRead (splitLine_t * line) { return checkFlag (line, 0x40 ); }
385+ inline bool isFirstRead (splitLine_t * line) { return checkFlag (line, FIRST_SEG ); }
362386
363- inline bool isSecondRead (splitLine_t * line) { return checkFlag (line, 0x80 ); }
387+ inline bool isSecondRead (splitLine_t * line) { return checkFlag (line, SECOND_SEG ); }
364388
365389inline bool isSecondaryAlignment (splitLine_t * line)
366390{ return (checkFlag (line, 0x100 ) || checkFlag (line, 0x800 )); }
@@ -460,6 +484,7 @@ struct state_struct
460484 bool acceptDups;
461485 bool excludeDups;
462486 bool removeDups;
487+ bool addMateTags;
463488 bool quiet;
464489};
465490typedef struct state_struct state_t ;
@@ -486,6 +511,7 @@ state_t * makeState ()
486511 s->acceptDups = false ;
487512 s->excludeDups = false ;
488513 s->removeDups = false ;
514+ s->addMateTags = false ;
489515 s->quiet = false ;
490516 // Start this as -1 to indicate we don't know yet.
491517 // Once we are outputting our first line, we will decide.
@@ -705,6 +731,7 @@ UINT64 splitCount = 0;
705731UINT64 unmapClipCount = 0 ;
706732
707733// This is the main workhorse that determines if lines are dups or not.
734+ int fillSplitterArray (splitLine_t * block, state_t * state, int mask, bool flagValue);
708735void markDupsDiscordants (splitLine_t * block, state_t * state)
709736{
710737 splitLine_t * first = NULL ;
@@ -748,23 +775,46 @@ void markDupsDiscordants(splitLine_t * block, state_t * state)
748775 orphan = true ;
749776 dummyFirst = true ;
750777 }
751- // Never mark pairs as dups if both sides are unmapped.
752- else if (isUnmapped (first) && isUnmapped (second))
753- {
754- return ;
755- }
756778 else
757779 {
780+ // Handle the addition of MC and MQ tags if requested.
781+ if (state->addMateTags )
782+ {
783+ int mask = (FIRST_SEG | SECOND_SEG);
784+ // Process the first of the pair.
785+ // Get the list of reads that match the second of the pair.
786+ int count = fillSplitterArray (block, state, second->flag & mask, true );
787+ for (int i=0 ; i<count; ++i)
788+ {
789+ splitLine_t * line = state->splitterArray [i];
790+ addTag (line, " MC:Z:" , first->fields [CIGAR]);
791+ addTag (line, " MQ:i:" , first->fields [MAPQ]);
792+ }
793+ // Process the second of the pair.
794+ // Get the list of reads that match the first of the pair.
795+ count = fillSplitterArray (block, state, first->flag & mask, true );
796+ for (int i=0 ; i<count; ++i)
797+ {
798+ splitLine_t * line = state->splitterArray [i];
799+ addTag (line, " MC:Z:" , second->fields [CIGAR]);
800+ addTag (line, " MQ:i:" , second->fields [MAPQ]);
801+ }
802+ }
803+
804+ // Never mark pairs as dups if both sides are unmapped.
805+ if (isUnmapped (first) && isUnmapped (second)) return ;
806+
758807 // We need to properly handle orphans to get the correct reference offsets and sequence numbers.
759808 orphan = (isUnmapped (first) || isUnmapped (second));
760809 // Orphan that needs to be swapped.
761- // We need to unmapped one in the first slot so that they won't all collide in the hash table.
810+ // We need the unmapped one in the first slot so that they won't all collide in the hash table.
762811 if (isMapped (first) && isUnmapped (second))
763812 {
764813 swapPtrs (&first, &second);
765814 }
766815 }
767816
817+ // Now look for duplicates.
768818 if (!state->acceptDups )
769819 {
770820 // Calculate and store the second position and sequence name.
@@ -807,6 +857,7 @@ void markDupsDiscordants(splitLine_t * block, state_t * state)
807857 }
808858 }
809859 }
860+
810861 // If we have a dummy first, we can't have a discordant pair.
811862 if (dummyFirst)
812863 {
@@ -988,11 +1039,11 @@ void processSAMBlock(splitLine_t * block, state_t * state)
9881039 if (state->splitterFile != NULL || state->unmappedClippedFile != NULL )
9891040 {
9901041 // Check the first read for splitter.
991- markSplitterUnmappedClipped (block, state, 0x40 , true );
1042+ markSplitterUnmappedClipped (block, state, FIRST_SEG , true );
9921043 // Check the second read for splitter.
993- markSplitterUnmappedClipped (block, state, 0x80 , true );
1044+ markSplitterUnmappedClipped (block, state, SECOND_SEG , true );
9941045 // Check for a singleton read
995- markSplitterUnmappedClipped (block, state, 0x1 , false );
1046+ markSplitterUnmappedClipped (block, state, MULTI_SEGS , false );
9961047 }
9971048
9981049 // Now do the output.
@@ -1063,6 +1114,7 @@ void printUsageString()
10631114 " -a --acceptDupMarks Accept duplicate marks already in input file instead of looking for duplicates in the input.\n "
10641115 " -e --excludeDups Exclude reads marked as duplicates from discordant, splitter, and/or unmapped file.\n "
10651116 " -r --removeDups Remove duplicates reads from all output files. (Implies --excludeDups).\n "
1117+ " --addMateTags Add MC and MQ tags to all output paired-end SAM lines.\n "
10661118 " --maxSplitCount INT Maximum number of split alignments for a read to be included in splitter file. [2]\n "
10671119 " --maxUnmappedBases INT Maximum number of un-aligned bases between two alignments to be included in splitter file. [50]\n "
10681120 " --minIndelSize INT Minimum structural variant feature size for split alignments to be included in splitter file. [50]\n "
@@ -1126,6 +1178,10 @@ int main (int argc, char *argv[])
11261178 state->inputFileName = argv[argi];
11271179 if (streq (state->inputFileName , " -" )) state->inputFileName = (char *)" stdin" ;
11281180 }
1181+ else if (streq (argv[argi]," --addMateTags" ))
1182+ {
1183+ state->addMateTags = true ;
1184+ }
11291185 else if (streq (argv[argi]," --maxSplitCount" ))
11301186 {
11311187 argi++;
0 commit comments