Skip to content

Commit 095e376

Browse files
committed
Fix problem with reference contig overflow and underflow due to clipping.
1 parent dadeb37 commit 095e376

File tree

3 files changed

+37
-12
lines changed

3 files changed

+37
-12
lines changed

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
# Determine the samblaster build number
2-
BUILDNUM = 23
2+
BUILDNUM = 24
33
# INTERNAL = TRUE
44

55
OBJS = samblaster.o sbhash.o
66

77
ifdef INTERNAL
88
PROG = samblaster$(BUILDNUM)
9-
CCFLAGS = -Wall -Winline -O3 -g -D BUILDNUM=$(BUILDNUM)
9+
CCFLAGS = -Wall -Winline -O0 -g -D BUILDNUM=$(BUILDNUM)
1010
else
1111
PROG = samblaster
1212
CCFLAGS = -Wall -O3 -D BUILDNUM=$(BUILDNUM)

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ Click the preceeding link or download the file from this repository.
1111

1212
---
1313

14-
**Current version:** 0.1.23
14+
**Current version:** 0.1.24
1515

1616
Support for Linux and OSX (Version 10.7 or higher).
1717

samblaster.cpp

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
// I like having these shorter name.
3434
typedef uint64_t UINT64;
3535
typedef uint32_t UINT32;
36+
typedef int32_t INT32;
3637

3738
// Some helper routines.
3839

@@ -120,8 +121,24 @@ inline UINT64 diffTVs (struct timeval * startTV, struct timeval * endTV)
120121
// They form a singly linked list so that we can form groups of them,
121122
// and also so that we can keep a freelist of them.
122123

124+
// Reference offsets (pos) can be negative or run off the end of a contig due to clipping.
125+
// Therefore, we will use a padding strategy.
126+
// The space allocated to each contig will be padded by twice the max read length.
127+
// This leaves room for both offset underflow and overflow.
128+
// And all offsets will be shifted higher by the max read length.
129+
// This will eliminate negative offsets and "center" offsets within the offset range for the contig.
130+
typedef INT32 pos_t;
131+
#define MAX_SEQUENCE_LENGTH 250 // Current illumina paired-end reads are at most 150 + 150
132+
inline int padLength(int length)
133+
{
134+
return length + (2 * MAX_SEQUENCE_LENGTH);
135+
}
136+
inline int padPos(int pos)
137+
{
138+
return pos + MAX_SEQUENCE_LENGTH;
139+
}
140+
123141
// We need to pre-define these for the SAM specific fields.
124-
typedef UINT32 pos_t; // Type for reference offsets (pos can be negative w/ softclips, but still works fine as unsigned rollover).
125142
typedef UINT64 sgn_t; // Type for signatures for offsets and lengths.
126143
// And the type itself for the next pointer.
127144
typedef struct splitLine splitLine_t;
@@ -476,9 +493,10 @@ inline int str2int (char * str)
476493
return strtol(str, NULL, 0);
477494
}
478495

496+
// Need to change this if pos is unsigned.
479497
inline pos_t str2pos (char * str)
480498
{
481-
return strtoul(str, NULL, 0);
499+
return strtol(str, NULL, 0);
482500
}
483501

484502
// Temp buffer to use to form new flag field when marking dups.
@@ -648,8 +666,8 @@ inline UINT32 calcSigArrOff(splitLine_t * first, splitLine_t * second, int binCo
648666
UINT32 s2 = (second->binNum * 2) + (isReverseStrand(second) ? 1 : 0);
649667
UINT32 retval = (s1 * binCount * 2) + s2;
650668
#ifdef DEBUG
651-
fprintf(stderr, "1st %d %d -> %d 2nd %d %d -> %d count %lu result %d\n",
652-
first->binNum, isReverseStrand(first), s1, second->binNum, isReverseStrand(second), s2, binCount, retval);
669+
fprintf(stderr, "1st %d %d -> %d 2nd %d %d -> %d count %d result %d read: %s\n",
670+
first->binNum, isReverseStrand(first), s1, second->binNum, isReverseStrand(second), s2, binCount, retval, first->fields[QNAME]);
653671
#endif
654672
return retval;
655673
}
@@ -754,6 +772,9 @@ void calcOffsets(splitLine_t * line)
754772
line->SQO = line->eclip;
755773
line->EQO = line->eclip + line->qaLen - 1;
756774
}
775+
// Need to pad the pos in case it is negative
776+
line->pos = padPos(line->pos);
777+
// Let's not calculate these again for this line.
757778
line->CIGARprocessed = true;
758779
}
759780

@@ -1451,9 +1472,11 @@ int main (int argc, char *argv[])
14511472
}
14521473

14531474
// Read in the SAM header and create the seqs structure.
1454-
state->seqs[strdup("*")] = 0;
14551475
state->seqLens = (UINT32*)calloc(1, sizeof(UINT32)); //initialize to 0
14561476
state->seqOffs = (UINT64*)calloc(1, sizeof(UINT64)); //initialize to 0
1477+
state->seqs[strdup("*")] = 0;
1478+
state->seqLens[0] = padLength(0);
1479+
state->seqOffs[0] = 0;
14571480
int count = 1;
14581481
UINT64 totalLen = 0;
14591482
splitLine_t * line;
@@ -1479,7 +1502,7 @@ int main (int argc, char *argv[])
14791502
}
14801503
else if (strncmp(line->fields[i], "LN:", 3) == 0)
14811504
{
1482-
seqLen = (UINT32)str2int(line->fields[i]+3);
1505+
seqLen = (UINT32)padLength(str2int(line->fields[i]+3));
14831506
seqOff = totalLen;
14841507
totalLen += (UINT64)(seqLen+1);
14851508
}
@@ -1533,22 +1556,24 @@ int main (int argc, char *argv[])
15331556
if (!state->acceptDups)
15341557
{
15351558
// Make sure we can handle the number of sequences.
1536-
int binCount = (totalLen >> BIN_SHIFT)+1;
1559+
int binCount = (totalLen >> BIN_SHIFT);
15371560
if (binCount >= (1 << 15))
15381561
{
15391562
fatalError("samblaster: Too many sequences in header of input sam file. Exiting.\n");
15401563
}
15411564

15421565
state->binCount = binCount;
1543-
state->sigArraySize = binCount * binCount * 4;
1566+
state->sigArraySize = (binCount * 2 + 1) * (binCount * 2 + 1)+1;
1567+
#ifdef DEBUG
1568+
fprintf(stderr, "samblaster: Signature Array Size = %d\n", state->sigArraySize);
1569+
#endif
15441570
state->sigs = (sigSet_t *) malloc(state->sigArraySize * sizeof(sigSet_t));
15451571
if (state->sigs == NULL) fatalError("samblaster: Unable to allocate signature set array.");
15461572
for (UINT32 i=0; i<state->sigArraySize; i++) hashTableInit(&(state->sigs[i]));
15471573
}
15481574

15491575
// Now start processing the alignment records.
15501576
// line already has the first such line in it unless the file only has a header.
1551-
// Initialize our signature set.
15521577
// Keep a ptr to the end of the current list.
15531578
splitLine_t * last = line;
15541579
count = 1;

0 commit comments

Comments
 (0)