3333// I like having these shorter name.
3434typedef uint64_t UINT64;
3535typedef uint32_t UINT32;
36+ typedef int32_t INT32;
3637
3738// Some helper routines.
3839
@@ -120,8 +121,24 @@ inline UINT64 diffTVs (struct timeval * startTV, struct timeval * endTV)
120121// They form a singly linked list so that we can form groups of them,
121122// and also so that we can keep a freelist of them.
122123
124+ // Reference offsets (pos) can be negative or run off the end of a contig due to clipping.
125+ // Therefore, we will use a padding strategy.
126+ // The space allocated to each contig will be padded by twice the max read length.
127+ // This leaves room for both offset underflow and overflow.
128+ // And all offsets will be shifted higher by the max read length.
129+ // This will eliminate negative offsets and "center" offsets within the offset range for the contig.
130+ typedef INT32 pos_t ;
131+ #define MAX_SEQUENCE_LENGTH 250 // Current illumina paired-end reads are at most 150 + 150
132+ inline int padLength (int length)
133+ {
134+ return length + (2 * MAX_SEQUENCE_LENGTH);
135+ }
136+ inline int padPos (int pos)
137+ {
138+ return pos + MAX_SEQUENCE_LENGTH;
139+ }
140+
123141// We need to pre-define these for the SAM specific fields.
124- typedef UINT32 pos_t ; // Type for reference offsets (pos can be negative w/ softclips, but still works fine as unsigned rollover).
125142typedef UINT64 sgn_t ; // Type for signatures for offsets and lengths.
126143// And the type itself for the next pointer.
127144typedef struct splitLine splitLine_t;
@@ -476,9 +493,10 @@ inline int str2int (char * str)
476493 return strtol (str, NULL , 0 );
477494}
478495
496+ // Need to change this if pos is unsigned.
479497inline pos_t str2pos (char * str)
480498{
481- return strtoul (str, NULL , 0 );
499+ return strtol (str, NULL , 0 );
482500}
483501
484502// Temp buffer to use to form new flag field when marking dups.
@@ -648,8 +666,8 @@ inline UINT32 calcSigArrOff(splitLine_t * first, splitLine_t * second, int binCo
648666 UINT32 s2 = (second->binNum * 2 ) + (isReverseStrand (second) ? 1 : 0 );
649667 UINT32 retval = (s1 * binCount * 2 ) + s2;
650668#ifdef DEBUG
651- fprintf (stderr, " 1st %d %d -> %d 2nd %d %d -> %d count %lu result %d\n " ,
652- first->binNum , isReverseStrand (first), s1, second->binNum , isReverseStrand (second), s2, binCount, retval);
669+ fprintf (stderr, " 1st %d %d -> %d 2nd %d %d -> %d count %d result %d read: %s \n " ,
670+ first->binNum , isReverseStrand (first), s1, second->binNum , isReverseStrand (second), s2, binCount, retval, first-> fields [QNAME] );
653671#endif
654672 return retval;
655673}
@@ -754,6 +772,9 @@ void calcOffsets(splitLine_t * line)
754772 line->SQO = line->eclip ;
755773 line->EQO = line->eclip + line->qaLen - 1 ;
756774 }
775+ // Need to pad the pos in case it is negative
776+ line->pos = padPos (line->pos );
777+ // Let's not calculate these again for this line.
757778 line->CIGARprocessed = true ;
758779}
759780
@@ -1451,9 +1472,11 @@ int main (int argc, char *argv[])
14511472 }
14521473
14531474 // Read in the SAM header and create the seqs structure.
1454- state->seqs [strdup (" *" )] = 0 ;
14551475 state->seqLens = (UINT32*)calloc (1 , sizeof (UINT32)); // initialize to 0
14561476 state->seqOffs = (UINT64*)calloc (1 , sizeof (UINT64)); // initialize to 0
1477+ state->seqs [strdup (" *" )] = 0 ;
1478+ state->seqLens [0 ] = padLength (0 );
1479+ state->seqOffs [0 ] = 0 ;
14571480 int count = 1 ;
14581481 UINT64 totalLen = 0 ;
14591482 splitLine_t * line;
@@ -1479,7 +1502,7 @@ int main (int argc, char *argv[])
14791502 }
14801503 else if (strncmp (line->fields [i], " LN:" , 3 ) == 0 )
14811504 {
1482- seqLen = (UINT32)str2int (line->fields [i]+3 );
1505+ seqLen = (UINT32)padLength ( str2int (line->fields [i]+3 ) );
14831506 seqOff = totalLen;
14841507 totalLen += (UINT64)(seqLen+1 );
14851508 }
@@ -1533,22 +1556,24 @@ int main (int argc, char *argv[])
15331556 if (!state->acceptDups )
15341557 {
15351558 // Make sure we can handle the number of sequences.
1536- int binCount = (totalLen >> BIN_SHIFT)+ 1 ;
1559+ int binCount = (totalLen >> BIN_SHIFT);
15371560 if (binCount >= (1 << 15 ))
15381561 {
15391562 fatalError (" samblaster: Too many sequences in header of input sam file. Exiting.\n " );
15401563 }
15411564
15421565 state->binCount = binCount;
1543- state->sigArraySize = binCount * binCount * 4 ;
1566+ state->sigArraySize = (binCount * 2 + 1 ) * (binCount * 2 + 1 )+1 ;
1567+ #ifdef DEBUG
1568+ fprintf (stderr, " samblaster: Signature Array Size = %d\n " , state->sigArraySize );
1569+ #endif
15441570 state->sigs = (sigSet_t *) malloc (state->sigArraySize * sizeof (sigSet_t));
15451571 if (state->sigs == NULL ) fatalError (" samblaster: Unable to allocate signature set array." );
15461572 for (UINT32 i=0 ; i<state->sigArraySize ; i++) hashTableInit (&(state->sigs [i]));
15471573 }
15481574
15491575 // Now start processing the alignment records.
15501576 // line already has the first such line in it unless the file only has a header.
1551- // Initialize our signature set.
15521577 // Keep a ptr to the end of the current list.
15531578 splitLine_t * last = line;
15541579 count = 1 ;
0 commit comments