Skip to content

Commit 0cadfce

Browse files
committed
Initial version with --addMateTags support.
1 parent c9bbf7b commit 0cadfce

File tree

2 files changed

+69
-13
lines changed

2 files changed

+69
-13
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Determine the samblaster build number
2-
BUILDNUM = 18
2+
BUILDNUM = 19
33
# INTERNAL = TRUE
44

55
OBJS = samblaster.o sbhash.o

samblaster.cpp

Lines changed: 68 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,26 @@ void changeFieldSplitLine(splitLine_t * line, int fnum, char * newValue)
287287
memcpy(fp, newValue, newLen);
288288
}
289289

290+
void addTag(splitLine_t * line, const char * header, const char * val)
291+
{
292+
int hl = strlen(header);
293+
int vl = strlen(val);
294+
// Make sure everything will fit.
295+
char * ptr = line->buffer + line->bufLen - 1;
296+
line->bufLen += hl + vl;
297+
if ((size_t)line->bufLen > line->maxBufLen)
298+
{
299+
fatalError("samblaster: New buffer length exceeds maximum while adding Mate tags.\n");
300+
}
301+
// Copy over the header and the value.
302+
ptr = (char *)mempcpy(ptr, header, hl);
303+
ptr = (char *)mempcpy(ptr, val, vl);
304+
// Add the null terminator for the field, and for the record.
305+
ptr[0] = 0;
306+
ptr[1] = 0;
307+
}
308+
309+
290310
// Read a line from the file and split it.
291311
splitLine_t * readLine(FILE * input)
292312
{
@@ -335,12 +355,16 @@ inline void writeLine(splitLine_t * line, FILE * output)
335355
#define QUAL 10
336356
#define TAGS 11
337357

358+
338359
// Define SAM flag accessors.
360+
#define MULTI_SEGS 0x1
361+
#define FIRST_SEG 0x40
362+
#define SECOND_SEG 0x80
339363
inline bool checkFlag(splitLine_t * line, int bits) { return ((line->flag & bits) != 0); }
340364

341365
inline void setFlag(splitLine_t * line, int bits) { line->flag |= bits; }
342366

343-
inline bool isPaired(splitLine_t * line) { return checkFlag(line, 0x1); }
367+
inline bool isPaired(splitLine_t * line) { return checkFlag(line, MULTI_SEGS); }
344368

345369
inline bool isConcordant(splitLine_t * line) { return checkFlag(line, 0x2); }
346370

@@ -358,9 +382,9 @@ inline bool isReverseStrand(splitLine_t * line) { return checkFlag(line, 0x10);
358382

359383
inline bool isForwardStrand(splitLine_t * line) { return !isReverseStrand(line); }
360384

361-
inline bool isFirstRead(splitLine_t * line) { return checkFlag(line, 0x40); }
385+
inline bool isFirstRead(splitLine_t * line) { return checkFlag(line, FIRST_SEG); }
362386

363-
inline bool isSecondRead(splitLine_t * line) { return checkFlag(line, 0x80); }
387+
inline bool isSecondRead(splitLine_t * line) { return checkFlag(line, SECOND_SEG); }
364388

365389
inline bool isSecondaryAlignment(splitLine_t * line)
366390
{ return (checkFlag(line, 0x100) || checkFlag(line, 0x800)); }
@@ -460,6 +484,7 @@ struct state_struct
460484
bool acceptDups;
461485
bool excludeDups;
462486
bool removeDups;
487+
bool addMateTags;
463488
bool quiet;
464489
};
465490
typedef struct state_struct state_t;
@@ -486,6 +511,7 @@ state_t * makeState ()
486511
s->acceptDups = false;
487512
s->excludeDups = false;
488513
s->removeDups = false;
514+
s->addMateTags = false;
489515
s->quiet = false;
490516
// Start this as -1 to indicate we don't know yet.
491517
// Once we are outputting our first line, we will decide.
@@ -705,6 +731,7 @@ UINT64 splitCount = 0;
705731
UINT64 unmapClipCount = 0;
706732

707733
// This is the main workhorse that determines if lines are dups or not.
734+
int fillSplitterArray(splitLine_t * block, state_t * state, int mask, bool flagValue);
708735
void markDupsDiscordants(splitLine_t * block, state_t * state)
709736
{
710737
splitLine_t * first = NULL;
@@ -748,23 +775,46 @@ void markDupsDiscordants(splitLine_t * block, state_t * state)
748775
orphan = true;
749776
dummyFirst = true;
750777
}
751-
// Never mark pairs as dups if both sides are unmapped.
752-
else if (isUnmapped(first) && isUnmapped(second))
753-
{
754-
return;
755-
}
756778
else
757779
{
780+
// Handle the addition of MC and MQ tags if requested.
781+
if (state->addMateTags)
782+
{
783+
int mask = (FIRST_SEG | SECOND_SEG);
784+
// Process the first of the pair.
785+
// Get the list of reads that match the second of the pair.
786+
int count = fillSplitterArray(block, state, second->flag & mask, true);
787+
for (int i=0; i<count; ++i)
788+
{
789+
splitLine_t * line = state->splitterArray[i];
790+
addTag(line, " MC:Z:", first->fields[CIGAR]);
791+
addTag(line, " MQ:i:", first->fields[MAPQ]);
792+
}
793+
// Process the second of the pair.
794+
// Get the list of reads that match the first of the pair.
795+
count = fillSplitterArray(block, state, first->flag & mask, true);
796+
for (int i=0; i<count; ++i)
797+
{
798+
splitLine_t * line = state->splitterArray[i];
799+
addTag(line, " MC:Z:", second->fields[CIGAR]);
800+
addTag(line, " MQ:i:", second->fields[MAPQ]);
801+
}
802+
}
803+
804+
// Never mark pairs as dups if both sides are unmapped.
805+
if (isUnmapped(first) && isUnmapped(second)) return;
806+
758807
// We need to properly handle orphans to get the correct reference offsets and sequence numbers.
759808
orphan = (isUnmapped(first) || isUnmapped(second));
760809
// Orphan that needs to be swapped.
761-
// We need to unmapped one in the first slot so that they won't all collide in the hash table.
810+
// We need the unmapped one in the first slot so that they won't all collide in the hash table.
762811
if (isMapped(first) && isUnmapped(second))
763812
{
764813
swapPtrs(&first, &second);
765814
}
766815
}
767816

817+
// Now look for duplicates.
768818
if (!state->acceptDups)
769819
{
770820
// Calculate and store the second position and sequence name.
@@ -807,6 +857,7 @@ void markDupsDiscordants(splitLine_t * block, state_t * state)
807857
}
808858
}
809859
}
860+
810861
// If we have a dummy first, we can't have a discordant pair.
811862
if (dummyFirst)
812863
{
@@ -988,11 +1039,11 @@ void processSAMBlock(splitLine_t * block, state_t * state)
9881039
if (state->splitterFile != NULL || state->unmappedClippedFile != NULL)
9891040
{
9901041
// Check the first read for splitter.
991-
markSplitterUnmappedClipped(block, state, 0x40, true);
1042+
markSplitterUnmappedClipped(block, state, FIRST_SEG, true);
9921043
// Check the second read for splitter.
993-
markSplitterUnmappedClipped(block, state, 0x80, true);
1044+
markSplitterUnmappedClipped(block, state, SECOND_SEG, true);
9941045
// Check for a singleton read
995-
markSplitterUnmappedClipped(block, state, 0x1, false);
1046+
markSplitterUnmappedClipped(block, state, MULTI_SEGS, false);
9961047
}
9971048

9981049
// Now do the output.
@@ -1063,6 +1114,7 @@ void printUsageString()
10631114
"-a --acceptDupMarks Accept duplicate marks already in input file instead of looking for duplicates in the input.\n"
10641115
"-e --excludeDups Exclude reads marked as duplicates from discordant, splitter, and/or unmapped file.\n"
10651116
"-r --removeDups Remove duplicates reads from all output files. (Implies --excludeDups).\n"
1117+
" --addMateTags Add MC and MQ tags to all output paired-end SAM lines.\n"
10661118
" --maxSplitCount INT Maximum number of split alignments for a read to be included in splitter file. [2]\n"
10671119
" --maxUnmappedBases INT Maximum number of un-aligned bases between two alignments to be included in splitter file. [50]\n"
10681120
" --minIndelSize INT Minimum structural variant feature size for split alignments to be included in splitter file. [50]\n"
@@ -1126,6 +1178,10 @@ int main (int argc, char *argv[])
11261178
state->inputFileName = argv[argi];
11271179
if (streq(state->inputFileName, "-")) state->inputFileName = (char *)"stdin";
11281180
}
1181+
else if (streq(argv[argi],"--addMateTags"))
1182+
{
1183+
state->addMateTags = true;
1184+
}
11291185
else if (streq(argv[argi],"--maxSplitCount"))
11301186
{
11311187
argi++;

0 commit comments

Comments
 (0)