diff --git a/SNAPLib/AffineGapVectorized.cpp b/SNAPLib/AffineGapVectorized.cpp index f33ba892..9338b8af 100644 --- a/SNAPLib/AffineGapVectorized.cpp +++ b/SNAPLib/AffineGapVectorized.cpp @@ -60,6 +60,33 @@ AffineGapVectorizedWithCigar::AffineGapVectorizedWithCigar() : } } +void +AffineGapVectorizedWithCigar::init( + int i_matchReward, + int i_subPenalty, + int i_gapOpenPenalty, + int i_gapExtendPenalty) +{ + matchReward = i_matchReward; + subPenalty = -i_subPenalty; + gapOpenPenalty = i_gapOpenPenalty + i_gapExtendPenalty; + gapExtendPenalty = i_gapExtendPenalty; + + // + // Initialize nucleotide <-> nucleotide transition matrix + // + int i, j, k = 0; + for (i = 0; i < (MAX_ALPHABET_SIZE - 1); i++) { + for (j = 0; j < (MAX_ALPHABET_SIZE - 1); j++) { + ntTransitionMatrix[k++] = (i == j) ? matchReward : subPenalty; + } + ntTransitionMatrix[k++] = -1; // FIXME: What penalty to use for N ? + } + for (i = 0; i < MAX_ALPHABET_SIZE; i++) { + ntTransitionMatrix[k++] = -1; + } +} + /*++ Write cigar to buffer, return true if it fits null-terminates buffer if it returns false (i.e. fills up buffer) @@ -170,8 +197,7 @@ int AffineGapVectorizedWithCigar::computeGlobalScore(const char* text, int textL if (k < patternLen) { uint8_t bp = BASE_VALUE[pattern[k]]; queryResult[patternIdx] = ntTransitionMatrix[i * MAX_ALPHABET_SIZE + bp]; - } - else { + } else { queryResult[patternIdx] = INT16_MIN; } patternIdx++; @@ -375,20 +401,18 @@ int AffineGapVectorizedWithCigar::computeGlobalScore(const char* text, int textL rowIdx--; colIdx--; matrixIdx = 0; - } - else if (action == D) { + } else if (action == D) { rowIdx--; matrixIdx = 1; - } - else { + } else { colIdx--; action = I; matrixIdx = 2; } + if (prevAction == action) { actionCount++; - } - else { + } else { if (prevAction != X) { res.action[n_res] = prevAction; res.count[n_res] = actionCount; @@ -398,18 +422,21 @@ int AffineGapVectorizedWithCigar::computeGlobalScore(const char* text, int textL } prevAction = action; } + // We went past either pattern or text. Dump out any action counts that we have been tracking if (prevAction == action) { res.action[n_res] = prevAction; res.count[n_res] = actionCount; n_res++; } + if (rowIdx >= 0) { actionCount = rowIdx + 1; res.action[n_res] = D; res.count[n_res] = actionCount; n_res++; } + if (colIdx >= 0) { actionCount = colIdx + 1; res.action[n_res] = I; @@ -434,11 +461,9 @@ int AffineGapVectorizedWithCigar::computeGlobalScore(const char* text, int textL if (res.action[i] == M) { rowIdx += res.count[i]; colIdx += res.count[i]; - } - else if (res.action[i] == D) { + } else if (res.action[i] == D) { rowIdx += res.count[i]; - } - else { + } else { if (i > 0 && rowIdx < textUsed && colIdx < patternLen - 1) { if ((pattern[colIdx + 1] == pattern[colIdx]) && (pattern[colIdx + 1] != text[rowIdx]) && (quality[colIdx] < 65)) { // only insert high quality bases if possible if ((i + 1 <= n_res - 1) && res.action[i + 1] == M && res.count[i - 1] > 1) { @@ -446,6 +471,7 @@ int AffineGapVectorizedWithCigar::computeGlobalScore(const char* text, int textL rowIdx++; colIdx++; } + if (res.action[i - 1] == M && res.count[i - 1] > 1) { res.count[i - 1] -= 1; } @@ -461,11 +487,9 @@ int AffineGapVectorizedWithCigar::computeGlobalScore(const char* text, int textL if (res.action[i] == M) { rowIdx += res.count[i]; colIdx += res.count[i]; - } - else if (res.action[i] == D) { + } else if (res.action[i] == D) { rowIdx += res.count[i]; - } - else { + } else { if (i > 0 && rowIdx + 1 < textUsed && colIdx + res.count[i] < patternLen - 1) { if ((pattern[colIdx + res.count[i]] == pattern[colIdx]) && (pattern[colIdx + res.count[i] + 1] != text[rowIdx + 1]) && (quality[colIdx] < 65)) { // only insert high quality bases (>= 65) if possible if ((i + 1 <= n_res - 1) && res.action[i + 1] == M && res.count[i - 1] > 2) { @@ -473,6 +497,7 @@ int AffineGapVectorizedWithCigar::computeGlobalScore(const char* text, int textL rowIdx += 2; colIdx += 2; } + if (res.action[i - 1] == M && res.count[i - 1] > 2) { res.count[i - 1] -= 2; } @@ -482,86 +507,7 @@ int AffineGapVectorizedWithCigar::computeGlobalScore(const char* text, int textL } } - rowIdx = 0; colIdx = 0; - for (int i = n_res - 1; i >= min_i; --i) { // Reverse local result to obtain CIGAR in correct order - if (useM) { - // Compute edit distance NM:i flag - if (res.action[i] == M) { - int j = 0, countM = 0; - for (int j = 0; j < res.count[i]; ++j) { - if (text[rowIdx + j] != pattern[colIdx + j]) { - nEdits++; - } - } - rowIdx += res.count[i]; - colIdx += res.count[i]; - if (!writeCigar(&cigarBuf, &cigarBufLen, res.count[i], 'M', format)) { - return -2; - } - } - else { - if (res.action[i] == D) { - rowIdx += res.count[i]; - *o_netDel += res.count[i]; - if (!writeCigar(&cigarBuf, &cigarBufLen, res.count[i], 'D', format)) { - return -2; - } - } - else if (res.action[i] == I) { - colIdx += res.count[i]; - if (!writeCigar(&cigarBuf, &cigarBufLen, res.count[i], 'I', format)) { - return -2; - } - } - nEdits += res.count[i]; - } - } - else { - if (res.action[i] == M) { - int j = 0, countM = 0; - for (int j = 0; j < res.count[i]; ++j) { - if (text[rowIdx + j] != pattern[colIdx + j]) { - // Write the matches '=' - if (countM > 0) { - if (!writeCigar(&cigarBuf, &cigarBufLen, countM, '=', format)) { - return -2; - } - } - // Write the mismatching 'X' - if (!writeCigar(&cigarBuf, &cigarBufLen, 1, 'X', format)) { - return -2; - } - countM = 0; - } - else { - countM++; - } - } - rowIdx += res.count[i]; - colIdx += res.count[i]; - } - else { - if (!writeCigar(&cigarBuf, &cigarBufLen, res.count[i], res.action[i], format)) { - return -2; - } - if (res.action[i] == D) { - rowIdx += res.count[i]; - } - else if (res.action[i] == I) { - colIdx += res.count[i]; - } - } - } - } - - if (format != BAM_CIGAR_OPS) { - *(cigarBuf - (cigarBufLen == 0 ? 1 : 0)) = '\0'; // terminate string - } - if (o_cigarBufUsed != NULL) { - *o_cigarBufUsed = (int)(cigarBuf - cigarBufStart); - } - // nEdits = (nEdits <= w) ? nEdits : -1; // return -1 if we have more edits than threshold w - return nEdits; + return computeFinalCigarString(text, textLen, pattern, patternLen, n_res, min_i, cigarBufStart, &cigarBuf, cigarBufLen, useM, format, o_cigarBufUsed, o_netDel); } // score > 0 else { @@ -632,15 +578,14 @@ int AffineGapVectorizedWithCigar::computeGlobalScoreBanded( if (idx < patternLen) { uint8_t bp = BASE_VALUE[pattern[idx]]; queryResult[patternIdx] = ntTransitionMatrix[a * MAX_ALPHABET_SIZE + bp]; - } - else { + } else { queryResult[patternIdx] = INT16_MIN; } patternIdx++; - } - } - } - } + } // k + } // j + } // i + } // a // // Define constants in their vector form @@ -715,12 +660,10 @@ int AffineGapVectorizedWithCigar::computeGlobalScoreBanded( hInit = scoreInit - (gapOpenPenalty + (i - 1) * gapExtendPenalty); } v_hInit = _mm_set1_epi16(hInit); - } - else { + } else { if (bandBeg > j * segLen) { v_hInit = v_zero; - } - else { + } else { v_hInit = _mm_load_si128(Hptr + j * numVec - 1); v_hInit = _mm_srli_si128(v_hInit, 14); } @@ -885,20 +828,18 @@ int AffineGapVectorizedWithCigar::computeGlobalScoreBanded( rowIdx--; colIdx--; matrixIdx = 0; - } - else if (action == D) { + } else if (action == D) { rowIdx--; matrixIdx = 1; - } - else { + } else { colIdx--; action = I; matrixIdx = 2; } + if (prevAction == action) { actionCount++; - } - else { + } else { if (prevAction != X) { res.action[n_res] = prevAction; res.count[n_res] = actionCount; @@ -908,18 +849,21 @@ int AffineGapVectorizedWithCigar::computeGlobalScoreBanded( } prevAction = action; } + // We went past either pattern or text. Dump out any action counts that we have been tracking if (prevAction == action) { res.action[n_res] = prevAction; res.count[n_res] = actionCount; n_res++; } + if (rowIdx >= 0) { actionCount = rowIdx + 1; res.action[n_res] = D; res.count[n_res] = actionCount; n_res++; } + if (colIdx >= 0) { actionCount = colIdx + 1; res.action[n_res] = I; @@ -944,11 +888,9 @@ int AffineGapVectorizedWithCigar::computeGlobalScoreBanded( if (res.action[i] == M) { rowIdx += res.count[i]; colIdx += res.count[i]; - } - else if (res.action[i] == D) { + } else if (res.action[i] == D) { rowIdx += res.count[i]; - } - else { + } else { if (i > 0 && rowIdx < textUsed && colIdx < patternLen - 1) { if ((pattern[colIdx + 1] == pattern[colIdx]) && (pattern[colIdx + 1] != text[rowIdx]) && (quality[colIdx] < 65)) { // only insert high quality bases (>= 65) if possible if ((i + 1 <= n_res - 1) && res.action[i + 1] == M && res.count[i - 1] > 1) { @@ -971,11 +913,9 @@ int AffineGapVectorizedWithCigar::computeGlobalScoreBanded( if (res.action[i] == M) { rowIdx += res.count[i]; colIdx += res.count[i]; - } - else if (res.action[i] == D) { + } else if (res.action[i] == D) { rowIdx += res.count[i]; - } - else { + } else { if (i > 0 && rowIdx + 1 < textUsed && colIdx + res.count[i] < patternLen - 1) { if ((pattern[colIdx + res.count[i]] == pattern[colIdx]) && (pattern[colIdx + res.count[i] + 1] != text[rowIdx + 1]) && (quality[colIdx] < 65)) { // only insert high quality bases (>= 65) if possible if ((i + 1 <= n_res - 1) && res.action[i + 1] == M && res.count[i - 1] > 2) { @@ -992,120 +932,149 @@ int AffineGapVectorizedWithCigar::computeGlobalScoreBanded( } } - rowIdx = 0; colIdx = 0; - for (int i = n_res - 1; i >= min_i; --i) { // Reverse local result to obtain CIGAR in correct order + return computeFinalCigarString(text, textLen, pattern, patternLen, n_res, min_i, cigarBufStart, &cigarBuf, cigarBufLen, useM, format, o_cigarBufUsed, o_netDel); + + } // score > scoreInit + else { + // Could not align strings with at most K edits + *(cigarBuf - (cigarBufLen == 0 ? 1 : 0)) = '\0'; // terminate string + return -1; + } +} // AffineGapVectorizedWithCigar::computeGlobalScoreBanded + +int AffineGapVectorizedWithCigar::computeFinalCigarString( + const char* text, + int textLen, + const char* pattern, + int patternLen, + int n_res, + int min_i, + const char* cigarBufStart, + char** cigarBufPtr, + int cigarBufLen, + bool useM, + CigarFormat format, + int* o_cigarBufUsed, + int* o_netDel + ) +{ + int nEdits = 0, rowIdx = 0, colIdx = 0; + + for (int i = n_res - 1; i >= min_i; --i) { // Reverse local result to obtain CIGAR in correct order + if (res.action[i] == D) { + rowIdx += res.count[i]; + *o_netDel += res.count[i]; + nEdits += res.count[i]; + if (!writeCigar(cigarBufPtr, &cigarBufLen, res.count[i], 'D', format)) { + return -2; + } + } else if (res.action[i] == I) { + colIdx += res.count[i]; + nEdits += res.count[i]; + if (!writeCigar(cigarBufPtr, &cigarBufLen, res.count[i], 'I', format)) { + return -2; + } + } else if (res.action[i] == M) { if (useM) { - // Compute edit distance NM:i flag - if (res.action[i] == M) { - int j = 0, countM = 0; - for (int j = 0; j < res.count[i]; ++j) { - if (text[rowIdx + j] != pattern[colIdx + j]) { - nEdits++; - } - } - rowIdx += res.count[i]; - colIdx += res.count[i]; - if (!writeCigar(&cigarBuf, &cigarBufLen, res.count[i], 'M', format)) { - return -2; + for (int j = 0; j < res.count[i]; ++j) { + if (text[rowIdx + j] != pattern[colIdx + j]) { + nEdits++; } } - else { - if (res.action[i] == D) { - rowIdx += res.count[i]; - *o_netDel += res.count[i]; - if (!writeCigar(&cigarBuf, &cigarBufLen, res.count[i], 'D', format)) { - return -2; - } - } - else if (res.action[i] == I) { - colIdx += res.count[i]; - if (!writeCigar(&cigarBuf, &cigarBufLen, res.count[i], 'I', format)) { - return -2; - } - } - nEdits += res.count[i]; + + if (!writeCigar(cigarBufPtr, &cigarBufLen, res.count[i], 'M', format)) { + return -2; } - } - else { - if (res.action[i] == M) { - int j = 0, countM = 0; - for (int j = 0; j < res.count[i]; ++j) { + } else { + // not useM (i.e., = and X CIGAR types) + int currentRunSize = 1; + bool currentRunIsX = text[rowIdx] != pattern[colIdx]; + nEdits = currentRunIsX ? nEdits + 1 : nEdits; + + for (int j = 1; j < res.count[i]; j++) { + if ((text[rowIdx + j] != pattern[colIdx + j]) == currentRunIsX) { + // + // We're continuing the run of matching or not matching. + // + currentRunSize++; if (text[rowIdx + j] != pattern[colIdx + j]) { - // Write the matches '=' - if (countM > 0) { - if (!writeCigar(&cigarBuf, &cigarBufLen, countM, '=', format)) { - return -2; - } - } - // Write the mismatching 'X' - if (!writeCigar(&cigarBuf, &cigarBufLen, 1, 'X', format)) { - return -2; - } - countM = 0; + nEdits++; } - else { - countM++; + } else { + // + // We switched between matching and not matching. Write the CIGAR element + // and start a new run. + // + if (!writeCigar(cigarBufPtr, &cigarBufLen, currentRunSize, currentRunIsX ? 'X' : '=', format)) { + return -2; } - } - rowIdx += res.count[i]; - colIdx += res.count[i]; - } - else { - if (!writeCigar(&cigarBuf, &cigarBufLen, res.count[i], res.action[i], format)) { - return -2; - } - if (res.action[i] == D) { - rowIdx += res.count[i]; - } - else if (res.action[i] == I) { - colIdx += res.count[i]; - } + currentRunSize = 1; + currentRunIsX = !currentRunIsX; + nEdits = currentRunIsX ? nEdits + 1 : nEdits; + } // ending a run + } // for each base in the M section + + // + // Now write out the final chunk. + // + if (!writeCigar(cigarBufPtr, &cigarBufLen, currentRunSize, currentRunIsX ? 'X' : '=', format)) { + return -2; } - } - } + } // not useM - if (format != BAM_CIGAR_OPS) { - *(cigarBuf - (cigarBufLen == 0 ? 1 : 0)) = '\0'; // terminate string - } - if (o_cigarBufUsed != NULL) { - *o_cigarBufUsed = (int)(cigarBuf - cigarBufStart); - } - // nEdits = (nEdits <= w) ? nEdits : -1; // return -1 if we have more edits than threshold w - return nEdits; + rowIdx += res.count[i]; + colIdx += res.count[i]; + } // it was an M - } // score > scoreInit - else { - // Could not align strings with at most K edits - *(cigarBuf - (cigarBufLen == 0 ? 1 : 0)) = '\0'; // terminate string - return -1; + } // for each raw CIGAR op + + if (format != BAM_CIGAR_OPS) { + *(*cigarBufPtr - (cigarBufLen == 0 ? 1 : 0)) = '\0'; // terminate string + } + + if (o_cigarBufUsed != NULL) { + *o_cigarBufUsed = (int)(*cigarBufPtr - cigarBufStart); } + + + return nEdits; } int AffineGapVectorizedWithCigar::computeGlobalScoreNormalized(const char* text, int textLen, - const char* pattern, const char* quality, int patternLen, - int k, - char *cigarBuf, int cigarBufLen, bool useM, - CigarFormat format, int* o_cigarBufUsed, - int* o_addFrontClipping, int *o_netDel, int *o_tailIns) + const char* pattern, + const char* quality, + int patternLen, + int k, + char * cigarBuf, + int cigarBufLen, + bool useM, + CigarFormat format, + int* o_cigarBufUsed, + int* o_addFrontClipping, + int * o_netDel, + int * o_tailIns + ) { if (format != BAM_CIGAR_OPS && format != COMPACT_CIGAR_STRING) { WriteErrorMessage("AffineGapWithCigar::computeGlobalScoreNormalized invalid parameter\n"); soft_exit(1); } + int bamBufLen = (format == BAM_CIGAR_OPS ? 1 : 2) * cigarBufLen; // should be enough char* bamBuf = (char*)alloca(bamBufLen); int bamBufUsed; int score; + if (patternLen >= (3 * (2 * k + 1))) { score = computeGlobalScoreBanded(text, (int)textLen, pattern, quality, (int)patternLen, k, MAX_READ_LENGTH, bamBuf, bamBufLen, - useM, BAM_CIGAR_OPS, &bamBufUsed, o_netDel, o_tailIns); + useM, BAM_CIGAR_OPS, &bamBufUsed, o_netDel, o_tailIns); if (score < 0 || score > k) { score = computeGlobalScore(text, (int)textLen, pattern, quality, (int)patternLen, k, bamBuf, bamBufLen, - useM, BAM_CIGAR_OPS, &bamBufUsed, o_netDel, o_tailIns); + useM, BAM_CIGAR_OPS, &bamBufUsed, o_netDel, o_tailIns); } } else { score = computeGlobalScore(text, (int)textLen, pattern, quality, (int)patternLen, k, bamBuf, bamBufLen, - useM, BAM_CIGAR_OPS, &bamBufUsed, o_netDel, o_tailIns); + useM, BAM_CIGAR_OPS, &bamBufUsed, o_netDel, o_tailIns); } if (score < 0) { @@ -1125,11 +1094,9 @@ int AffineGapVectorizedWithCigar::computeGlobalScoreNormalized(const char* text, if (*o_addFrontClipping != 0) { return 0; // can fail, will be rerun with new clipping } - } - else if (firstCode == 'I') { + } else if (firstCode == 'I') { *o_addFrontClipping = -1 * BAMAlignment::GetCigarOpCount(bamOps[0]); - } - else { + } else { *o_addFrontClipping = 0; } } @@ -1146,8 +1113,7 @@ int AffineGapVectorizedWithCigar::computeGlobalScoreNormalized(const char* text, if (o_cigarBufUsed != NULL) { *o_cigarBufUsed = bamBufUsed; } - } - else { + } else { bool ok = BAMAlignment::decodeCigar(cigarBuf, cigarBufLen, bamOps, bamOpCount); if (!ok) { return -1; @@ -1157,4 +1123,4 @@ int AffineGapVectorizedWithCigar::computeGlobalScoreNormalized(const char* text, } } return score; -} \ No newline at end of file +} diff --git a/SNAPLib/AffineGapVectorized.h b/SNAPLib/AffineGapVectorized.h index 45bd0d13..e6a04096 100644 --- a/SNAPLib/AffineGapVectorized.h +++ b/SNAPLib/AffineGapVectorized.h @@ -1378,10 +1378,12 @@ template class AffineGapVectorized { class AffineGapVectorizedWithCigar { public: - AffineGapVectorizedWithCigar(); // FIXME: Pass scoring parameters + AffineGapVectorizedWithCigar(); AffineGapVectorizedWithCigar(int i_matchReward, int i_subPenalty, int i_gapOpenPenalty, int i_gapExtendPenalty); + void init(int i_matchReward, int i_subPenalty, int i_gapOpenPenalty, int i_gapExtendPenalty); + // Compute the affine gap score between two strings and write the CIGAR string in cigarBuf. // Returns -1 if the edit distance exceeds k or -2 if we run out of space in cigarBuf. int computeGlobalScore(const char* text, int textLen, const char* pattern, const char* quality, int patternLen, int w, @@ -1453,4 +1455,20 @@ class AffineGapVectorizedWithCigar { LocalCigarResult res; + int computeFinalCigarString( + const char* text, + int textLen, + const char* pattern, + int patternLen, + int n_res, + int min_i, + const char *cigarBufStart, + char** cigarBuf, + int cigarBufLen, + bool useM, + CigarFormat format, + int* o_cigarBufUsed, + int* o_netDel + ); + }; diff --git a/SNAPLib/AlignerOptions.cpp b/SNAPLib/AlignerOptions.cpp index 57e1cc44..d021dccd 100644 --- a/SNAPLib/AlignerOptions.cpp +++ b/SNAPLib/AlignerOptions.cpp @@ -231,13 +231,14 @@ AlignerOptions::usage() " are the same as the NM values, except that they contain penalties for soft clipping reads that hang over the end of contigs (but not for\n" " soft clipping that's due to # quality scores or that was present in the input SAM/BAM file and retained due to -pc)\n" " -G- disable affine gap scoring (default: true)\n" - " Scoring parameters (works only when -G- is not used)\n" - " cost for match -gm (default: %u)\n" - " cost for substitution -gs (default: %u)\n" - " cost for opening a gap -go (default: %u)\n" - " cost for extending a gap -ge (default: %u)\n" - " bonus for alignment reaching 5' end of read -g5 (default: %u)\n" - " bonus for alignment reaching 3' end of read -g3 (default: %u)\n" + " Affine gap scoring parameters (works only when -G- is not used):\n" + " -gm cost for match (default: %u)\n" + " -gs cost for substitution (default: %u)\n" + " -go cost for opening a gap (default: %u)\n" + " -ge cost for extending a gap (default: %u)\n" + " -g5 bonus for alignment reaching 5' end of read (default: %u)\n" + " -g3 bonus for alignment reaching 3' end of read (default: %u)\n" + "\n" " -A- Disable ALT awareness. The default is to try to map reads to the primary assembly and only to choose ALT alignments when they're much better,\n" " and to compute MAPQ for non-ALT alignments using only non-ALT hits. This flag disables that behavior and results in ALT-oblivious behavior.\n" " -ea Emit ALT alignments. When the aligner is ALT aware (i.e., -A- isn't specified) if it finds an ALT alignment that would have been\n" diff --git a/SNAPLib/Bam.cpp b/SNAPLib/Bam.cpp index 1a780205..22569884 100644 --- a/SNAPLib/Bam.cpp +++ b/SNAPLib/Bam.cpp @@ -951,7 +951,7 @@ BAMFormat::getWriterSupplier( dataSupplier = DataWriterSupplier::create(options->outputFile.fileName, options->writeBufferSize, options->emitInternalScore, options->internalScoreTag, gzipSupplier); } - return ReadWriterSupplier::create(this, dataSupplier, genome, options->killIfTooSlow, options->emitInternalScore, options->internalScoreTag, options->ignoreAlignmentAdjustmentsForOm); + return ReadWriterSupplier::create(this, dataSupplier, genome, options->killIfTooSlow, options->emitInternalScore, options->internalScoreTag, options->ignoreAlignmentAdjustmentsForOm, options->matchReward, options->subPenalty, options->gapOpenPenalty, options->gapExtendPenalty); } bool diff --git a/SNAPLib/CommandProcessor.cpp b/SNAPLib/CommandProcessor.cpp index e7da52e8..48044e92 100644 --- a/SNAPLib/CommandProcessor.cpp +++ b/SNAPLib/CommandProcessor.cpp @@ -35,7 +35,7 @@ Revision History: #include "Error.h" #include "Compat.h" -const char *SNAP_VERSION = "2.0.0"; +const char *SNAP_VERSION = "2.0.1"; static void usage() { diff --git a/SNAPLib/DataReader.cpp b/SNAPLib/DataReader.cpp index 23aee918..2c6f00cc 100644 --- a/SNAPLib/DataReader.cpp +++ b/SNAPLib/DataReader.cpp @@ -2006,7 +2006,7 @@ DecompressDataReader::decompress( if (status < 0 && status != Z_BUF_ERROR) { WriteErrorMessage("GzipDataReader: inflate failed with %d\n", status); if (Z_DATA_ERROR == status) { - fprintf(stderr, "%d is Z_DATA_ERROR, indicating a corrupt input. zstream->availIn is 0x%x:", status, zstream->avail_in); + fprintf(stderr, "%d is Z_DATA_ERROR, indicating a corrupt input. Are you *sure* this is a gzipped file and it's not just plain text or from some other compressor like bzip? zstream->availIn is 0x%x:", status, zstream->avail_in); for (_int64 offset = 0; offset < oldAvailIn; offset++) { if (offset % 32 == 0) { fprintf(stderr, "\n0x%llx\t", offset); @@ -2019,7 +2019,7 @@ DecompressDataReader::decompress( return false; } if (status < 0 && zstream->avail_out == 0 && zstream->avail_in > 0) { - WriteErrorMessage("insufficient decompression buffer space - increase expansion factor, currently -xf %.1f\n", DataSupplier::ExpansionFactor); + WriteErrorMessage("insufficient decompression buffer space - increase expansion factor, currently -xf %.1f (if you're indexing, please decompress the FASTA/ALT files externally and use the uncompressed version here)\n", DataSupplier::ExpansionFactor); return false; } @@ -2295,7 +2295,7 @@ DecompressDataReader::decompressThreadContinuous( entry->decompressed + reader->overflowBytes, entry->decompressedSize - reader->overflowBytes, &decompressedWritten, first ? StartMultiBlock : ContinueMultiBlock); if (!ok) { - WriteErrorMessage("Failed to decompress BAM file at offset %lld\n", reader->inner->getFileOffset()); + WriteErrorMessage("Failed to decompress gzip/BAM file at offset %lld\n", reader->inner->getFileOffset()); soft_exit(1); } _ASSERT(compressedRead == entry->compressedValid && decompressedWritten <= reader->extraBytes - reader->overflowBytes); diff --git a/SNAPLib/DataReader.h b/SNAPLib/DataReader.h index eb251b1c..9283783f 100644 --- a/SNAPLib/DataReader.h +++ b/SNAPLib/DataReader.h @@ -98,7 +98,7 @@ class DataReader // read bytes from the beginning of the file for the header virtual char* readHeader(_int64* io_headerSize) = 0; - // seek to a particular range in the file + // seek to a particular range in the file. Set amountOfFileToProcess to go to EOF. virtual void reinit(_int64 startingOffset, _int64 amountOfFileToProcess) = 0; // get all remaining data in current batch diff --git a/SNAPLib/FASTA.cpp b/SNAPLib/FASTA.cpp index 8863ec52..5239a11c 100644 --- a/SNAPLib/FASTA.cpp +++ b/SNAPLib/FASTA.cpp @@ -28,6 +28,7 @@ Revision History: #include "Error.h" #include "exit.h" #include "Util.h" +#include "DataReader.h" using namespace std; @@ -220,9 +221,10 @@ ReadFASTAGenome( isValidGenomeCharacter['A'] = isValidGenomeCharacter['T'] = isValidGenomeCharacter['C'] = isValidGenomeCharacter['G'] = isValidGenomeCharacter['N'] = true; isValidGenomeCharacter['a'] = isValidGenomeCharacter['t'] = isValidGenomeCharacter['c'] = isValidGenomeCharacter['g'] = isValidGenomeCharacter['n'] = true; - FILE *fastaFile = fopen(fileName, "r"); - if (fastaFile == NULL) { - WriteErrorMessage("Unable to open FASTA file '%s' (does it exist and do you have permission to open it?)\n",fileName); + DataReader* fastaDataReader = getDefaultOrGzipDataReader(fileName); + + if (fastaDataReader == NULL) { + WriteErrorMessage("Unable to open FASTA file '%s' (does it exist and do you have permission to open it?)\n", fileName); return NULL; } @@ -244,7 +246,7 @@ ReadFASTAGenome( int nextContigNumber = 0; - while (NULL != reallocatingFgets(&lineBuffer, &lineBufferSize, fastaFile)) { + while (NULL != reallocatingFgets(&lineBuffer, &lineBufferSize, fastaDataReader)) { fileSize += strlen(lineBuffer); if (lineBuffer[0] == '>') { nContigs++; @@ -382,7 +384,7 @@ ReadFASTAGenome( genome->sortContigsByName(); genome->setUpContigNumbersByOriginalOrder(); - fclose(fastaFile); + delete fastaDataReader; delete [] paddingBuffer; delete [] lineBuffer; return genome; diff --git a/SNAPLib/FASTQ.cpp b/SNAPLib/FASTQ.cpp index 349a77a4..d0a069a4 100644 --- a/SNAPLib/FASTQ.cpp +++ b/SNAPLib/FASTQ.cpp @@ -391,19 +391,6 @@ PairedInterleavedFASTQReader::getNextReadPair(Read *read0, Read *read1) } bytesConsumed += FASTQReader::getReadFromBuffer(buffer + bytesConsumed, validBytes - bytesConsumed, read1, fileName, data, context); - // - // Validate the Read IDs. - // - if (read0->getIdLength() < 2 || memcmp(read0->getId() + read0->getIdLength() - 2, "/1", 2)) { - WriteErrorMessage("PairedInterleavedFASTQReader: first read of batch doesn't have ID ending with /1: '%.*s'\n", read0->getIdLength(), read0->getId()); - soft_exit(1); - } - - if (read1->getIdLength() < 2 || memcmp(read1->getId() + read1->getIdLength() - 2, "/2", 2)) { - WriteErrorMessage("PairedInterleavedFASTQReader: second read of batch doesn't have ID ending with /2: '%.*s'\n", read1->getIdLength(), read1->getId()); - soft_exit(1); - } - data->advance(bytesConsumed); return true; } @@ -421,10 +408,16 @@ PairedInterleavedFASTQReader::reinit(_int64 startingOffset, _int64 amountOfFileT // If we're not at the start of the file, we might have the tail end of a read that someone // who got the previous range will process; advance past that. This is fairly tricky because // there can be '@' signs in the quality string (and maybe even in read names?). - if (startingOffset != 0) { - if (!FASTQReader::skipPartialRecord(data)) { - return; - } + + if (startingOffset == 0) { + return; + } + + WriteErrorMessage("This should be dead code. If you get here there's a bug in SNAP, please report it.\n"); + soft_exit(1); + + if (!FASTQReader::skipPartialRecord(data)) { + return; } // @@ -642,9 +635,9 @@ PairedInterleavedFASTQReader::createPairedReadSupplierGenerator( { bool isStdin = !strcmp(fileName,"-"); - if (gzip || isStdin) { - //WriteStatusMessage("PairedInterleavedFASTQ using supplier queue\n"); + if (gzip || isStdin || true /* always use queue for PairedInterleavedFASTQ because otherwise we need to know the read ID format to seek into the middle of the file, but that's nonstandard. */) { DataSupplier *dataSupplier; + if (isStdin) { if (gzip) { dataSupplier = DataSupplier::GzipStdio; @@ -652,7 +645,11 @@ PairedInterleavedFASTQReader::createPairedReadSupplierGenerator( dataSupplier = DataSupplier::Stdio; } } else { - dataSupplier = DataSupplier::GzipDefault; + if (gzip) { + dataSupplier = DataSupplier::GzipDefault; + } else { + dataSupplier = DataSupplier::Default; + } } PairedReadReader *reader = PairedInterleavedFASTQReader::create(dataSupplier, fileName, @@ -662,6 +659,7 @@ PairedInterleavedFASTQReader::createPairedReadSupplierGenerator( delete reader; return NULL; } + ReadSupplierQueue *queue = new ReadSupplierQueue(reader); queue->startReaders(); return queue; diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp index 042bd794..a50201c8 100755 --- a/SNAPLib/Genome.cpp +++ b/SNAPLib/Genome.cpp @@ -543,7 +543,7 @@ Genome::getLocationOfContig(const char *contigName, GenomeLocation *location, In *location = contigsByName[mid].beginningLocation; } if (index != NULL) { - *index = InternalContigNumToInt(mid); + *index = InternalContigNumToInt(contigsByName[mid].internalContigNumber); } return true; } else if (c < 0) { diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp index e2a7194c..41b5b5b1 100644 --- a/SNAPLib/GenomeIndex.cpp +++ b/SNAPLib/GenomeIndex.cpp @@ -38,6 +38,7 @@ Revision History: #include "exit.h" #include "Error.h" #include "directions.h" +#include "DataReader.h" using namespace std; @@ -158,6 +159,8 @@ GenomeIndex::runIndexer( unsigned *altLiftoverProjContigOffsets = NULL; char **altLiftoverProjCigar = NULL; + DataSupplier::ExpansionFactor = 50; // This is for the decompression of a gzipped FASTA/ALT file. FASTAs compress really well so we need a lot. + for (int n = 2; n < argc; n++) { if (strcmp(argv[n], "-s") == 0) { if (n + 1 < argc) { @@ -251,7 +254,7 @@ GenomeIndex::runIndexer( n++; } else if (!strcmp(argv[n], "-altContigFile")) { if (n + 1 < argc) { - FILE *inputFile = fopen(argv[n + 1], "r"); + DataReader* inputFile = getDefaultOrGzipDataReader(argv[n + 1]); if (NULL == inputFile) { WriteErrorMessage("Unable to open alt contig list file %s\n", argv[n + 1]); soft_exit(1); @@ -269,7 +272,7 @@ GenomeIndex::runIndexer( addToCountedListOfStrings(contigNameBuffer, &nAltOptIn, &altOptInList); } // while we have an input string. - fclose(inputFile); + delete inputFile; delete[] contigNameBuffer; } else { usage(); @@ -277,7 +280,7 @@ GenomeIndex::runIndexer( n++; } else if (!strcmp(argv[n], "-nonAltContigFile")) { if (n + 1 < argc) { - FILE *inputFile = fopen(argv[n + 1], "r"); + DataReader *inputFile = getDefaultOrGzipDataReader(argv[n + 1]); if (NULL == inputFile) { WriteErrorMessage("Unable to open non-alt contig list file %s\n", argv[n + 1]); soft_exit(1); @@ -295,7 +298,7 @@ GenomeIndex::runIndexer( addToCountedListOfStrings(contigNameBuffer, &nAltOptOut, &altOptOutList); } // while we have an input string. - fclose(inputFile); + delete inputFile; delete[] contigNameBuffer; } else { @@ -304,7 +307,8 @@ GenomeIndex::runIndexer( n++; } else if (!strcmp(argv[n], "-altLiftoverFile")) { if (n + 1 < argc) { - FILE* inputFile = fopen(argv[n + 1], "r"); + DataReader* inputFile = getDefaultOrGzipDataReader(argv[n + 1]); + if (NULL == inputFile) { WriteErrorMessage("Unable to open ALT liftover file %s\n", argv[n + 1]); soft_exit(1); @@ -395,7 +399,7 @@ GenomeIndex::runIndexer( altLiftoverProjCigar[i][projCigarLength] = '\0'; } - fclose(inputFile); + delete inputFile; delete[] altLiftoverBuffer; } else { diff --git a/SNAPLib/LandauVishkin.h b/SNAPLib/LandauVishkin.h index 3c62035f..7bc30c16 100644 --- a/SNAPLib/LandauVishkin.h +++ b/SNAPLib/LandauVishkin.h @@ -5,7 +5,12 @@ #include "exit.h" #include "Genome.h" +#ifdef LONG_READS +const int MAX_K = 1000; +#else // LONG_READS const int MAX_K = 63; +#endif // LONG_READS + const int ScoreAboveLimit = -1; // A score value that means "we didn't compute the score because it was above the score limit." const int TooBigScoreValue = 65536; // This is much bigger than any score we'll ever see diff --git a/SNAPLib/Read.h b/SNAPLib/Read.h index 28bf2bcb..6bc1490a 100644 --- a/SNAPLib/Read.h +++ b/SNAPLib/Read.h @@ -44,7 +44,7 @@ struct PairedAlignmentResult; //#define LONG_READS #ifdef LONG_READS -#define MAX_READ_LENGTH 400000 +#define MAX_READ_LENGTH 20000 #else #define MAX_READ_LENGTH 400 #endif @@ -219,7 +219,7 @@ class ReadWriterSupplier static ReadWriterSupplier* create(const FileFormat* format, DataWriterSupplier* dataSupplier, const Genome* genome, bool killIfTooSlowbool, bool emitInternalScore, char *internalScoreTag, - bool ignoreAlignmentAdjustmentsForOm); + bool ignoreAlignmentAdjustmentsForOm, int matchReward, int subPenalty, int gapOpenPenalty, int gapExtendPenalty); virtual ~ReadWriterSupplier() {} }; diff --git a/SNAPLib/ReadWriter.cpp b/SNAPLib/ReadWriter.cpp index 4a2ddee7..3f880645 100644 --- a/SNAPLib/ReadWriter.cpp +++ b/SNAPLib/ReadWriter.cpp @@ -34,7 +34,7 @@ Module Name: class SimpleReadWriter : public ReadWriter { public: - SimpleReadWriter(const FileFormat* i_format, DataWriter* i_writer, const Genome* i_genome, bool i_killIfTooSlow, bool i_emitInternalScore, char *i_internalScoreTag, bool i_ignoreAlignmentAdjustmentsForOm) + SimpleReadWriter(const FileFormat* i_format, DataWriter* i_writer, const Genome* i_genome, bool i_killIfTooSlow, bool i_emitInternalScore, char *i_internalScoreTag, bool i_ignoreAlignmentAdjustmentsForOm, int i_matchReward, int i_subPenalty, int i_gapOpenPenalty, int i_gapExtendPenalty) : format(i_format), writer(i_writer), genome(i_genome), killIfTooSlow(i_killIfTooSlow), lastTooSlowCheck(0), emitInternalScore(i_emitInternalScore), ignoreAlignmentAdjustmentsForOm(i_ignoreAlignmentAdjustmentsForOm) { if (emitInternalScore) { @@ -46,6 +46,8 @@ class SimpleReadWriter : public ReadWriter } else { internalScoreTag[0] = '\0'; } + + agc.init(i_matchReward, i_subPenalty, i_gapOpenPenalty, i_gapExtendPenalty); } virtual ~SimpleReadWriter() @@ -651,14 +653,18 @@ SimpleReadWriter::close() class SimpleReadWriterSupplier : public ReadWriterSupplier { public: - SimpleReadWriterSupplier(const FileFormat* i_format, DataWriterSupplier* i_dataSupplier, const Genome* i_genome, bool i_killIfTooSlow, bool i_emitInternalScore, char *i_internalScoreTag, bool i_ignoreAlignmentAdjustmentsForOm) + SimpleReadWriterSupplier(const FileFormat* i_format, DataWriterSupplier* i_dataSupplier, const Genome* i_genome, bool i_killIfTooSlow, bool i_emitInternalScore, char *i_internalScoreTag, bool i_ignoreAlignmentAdjustmentsForOm, int i_matchReward, int i_subPenalty, int i_gapOpenPenalty, int i_gapExtendPenalty) : format(i_format), dataSupplier(i_dataSupplier), genome(i_genome), killIfTooSlow(i_killIfTooSlow), emitInternalScore(i_emitInternalScore), - ignoreAlignmentAdjustmentsForOm(i_ignoreAlignmentAdjustmentsForOm) + ignoreAlignmentAdjustmentsForOm(i_ignoreAlignmentAdjustmentsForOm), + matchReward(i_matchReward), + subPenalty(i_subPenalty), + gapOpenPenalty(i_gapOpenPenalty), + gapExtendPenalty(i_gapExtendPenalty) { if (emitInternalScore) { if (strlen(i_internalScoreTag) != 2) { @@ -678,7 +684,7 @@ class SimpleReadWriterSupplier : public ReadWriterSupplier virtual ReadWriter* getWriter() { - return new SimpleReadWriter(format, dataSupplier->getWriter(), genome, killIfTooSlow, emitInternalScore, internalScoreTag, ignoreAlignmentAdjustmentsForOm); + return new SimpleReadWriter(format, dataSupplier->getWriter(), genome, killIfTooSlow, emitInternalScore, internalScoreTag, ignoreAlignmentAdjustmentsForOm, matchReward, subPenalty, gapOpenPenalty, gapExtendPenalty); } virtual void close() @@ -694,6 +700,10 @@ class SimpleReadWriterSupplier : public ReadWriterSupplier bool emitInternalScore; char internalScoreTag[3]; bool ignoreAlignmentAdjustmentsForOm; + int matchReward; + int subPenalty; + int gapOpenPenalty; + int gapExtendPenalty; }; ReadWriterSupplier* @@ -704,8 +714,12 @@ ReadWriterSupplier::create( bool killIfTooSlow, bool emitInternalScore, char *internalScoreTag, - bool ignoreAlignmentAdjustmentsForOm) + bool ignoreAlignmentAdjustmentsForOm, + int matchReward, + int subPenalty, + int gapOpenPenalty, + int gapExtendPenalty) { - return new SimpleReadWriterSupplier(format, dataSupplier, genome, killIfTooSlow, emitInternalScore, internalScoreTag, ignoreAlignmentAdjustmentsForOm); + return new SimpleReadWriterSupplier(format, dataSupplier, genome, killIfTooSlow, emitInternalScore, internalScoreTag, ignoreAlignmentAdjustmentsForOm, matchReward, subPenalty, gapOpenPenalty, gapExtendPenalty); } diff --git a/SNAPLib/SAM.cpp b/SNAPLib/SAM.cpp index 46fa0249..e0a049bc 100644 --- a/SNAPLib/SAM.cpp +++ b/SNAPLib/SAM.cpp @@ -782,7 +782,8 @@ SAMReader::parseLocation( char* field[], size_t fieldLength[], unsigned rfield, - unsigned posfield) + unsigned posfield, + int *o_pos) { unsigned oneBasedOffsetWithinContig = 0; if ('*' != field[rfield][0] && '*' != field[posfield][0]) { @@ -804,12 +805,23 @@ SAMReader::parseLocation( WriteErrorMessage("SAMReader: Unable to parse position when it was expected.\n"); soft_exit(1); } + if (0 == oneBasedOffsetWithinContig) { WriteErrorMessage("SAMReader: Position parsed as 0 when it was expected.\n"); soft_exit(1); } + + if (NULL != o_pos) { + *o_pos = oneBasedOffsetWithinContig; + } + return locationOfContig + oneBasedOffsetWithinContig - 1; // -1 is because our offset is 0 based, while SAM is 1 based. } else { + + if (NULL != o_pos) { + *o_pos = 0; + } + return InvalidGenomeLocation; } } @@ -1008,75 +1020,67 @@ SAMFormat::getSortInfo( if (o_readBytes != NULL) { *o_readBytes = (unsigned) lineLength; } - if (lengths[SAMReader::POS] == 0 || fields[SAMReader::POS][0] == '*') { - if (lengths[SAMReader::PNEXT] == 0 || fields[SAMReader::PNEXT][0] == '*') { - if (o_location != NULL) { - *o_location = UINT32_MAX; - } - if (o_refID != NULL) { - *o_refID = OriginalContigNum(INT32_MAX); // So that it sorts to the end - } + GenomeLocation locationBuffer; + if (o_location == NULL) { + o_location = &locationBuffer; // We need to call parseLocation to figure out pos, so if we're not returning the value just store it on the stack and discard + } - if (o_pos != NULL) { - *o_pos = 0; - } - } else { - const size_t contigNameBufferSize = 512; // We do a static buffer with reallocation so that in the usual case there is no dynamic memory allocation. If you have enormous contig names, you'll just run a little slower - char contigNameBuffer[contigNameBufferSize]; - char *contigName = contigNameBuffer; - GenomeLocation locationOfContig; - size_t neededSize; - InternalContigNum internalContigNum; - if (0 != (neededSize = SAMReader::parseContigName(genome, contigName, contigNameBufferSize, &locationOfContig, &internalContigNum, fields, lengths, SAMReader::RNEXT))) { - // - // Need a bigger buffer. - // - contigName = new char[neededSize]; - if (0 != SAMReader::parseContigName(genome, contigName, neededSize, &locationOfContig, &internalContigNum, fields, lengths, SAMReader::RNEXT)) { - WriteErrorMessage("SAMFormat::getSortInfo: reallocated buffer size is still too small.\n"); // This really shouldn't happen - soft_exit(1); - } - } + // + // Fill these in to indicate whether we're sorting by the aligned location or by next (which we do for unaligned reads so they land by their mate pair). + // + int posField; + int rField; - if (o_location != NULL) { - *o_location = SAMReader::parseLocation(locationOfContig, fields, lengths, SAMReader::RNEXT, SAMReader::PNEXT); - } + if (lengths[SAMReader::RNAME] == 0 || fields[SAMReader::RNAME][0] == '*') { + if (lengths[SAMReader::RNEXT] == 0 || fields[SAMReader::RNEXT][0] == '*' || fields[SAMReader::RNEXT][0] == '=') { // The check for = means it's unaligned because we only get here if RNAME is '*' + if (o_location != NULL) { + *o_location = InvalidGenomeLocation; + } if (o_refID != NULL) { - *o_refID = genome->getContigByInternalNumber(internalContigNum)->originalContigNumber; + *o_refID = OriginalContigNum(INT32_MAX); // So that it sorts to the end } - if (contigName != contigNameBuffer) { - delete[] contigName; + if (o_pos != NULL) { + *o_pos = 0; } - } + + return; + } else { + posField = SAMReader::PNEXT; + rField = SAMReader::RNEXT; + } } else { - const size_t contigNameBufferSize = 512; // We do a static buffer with reallocation so that in the usual case there is no dynamic memory allocation. If you have enormous contig names, you'll just run a little slower - char contigNameBuffer[contigNameBufferSize]; - char *contigName = contigNameBuffer; - size_t neededSize; - GenomeLocation locationOfContig; - InternalContigNum internalContigNum; - if (0 != (neededSize = SAMReader::parseContigName(genome, contigName, contigNameBufferSize, &locationOfContig, &internalContigNum, fields, lengths))) { - contigName = new char[neededSize]; - if (0 != SAMReader::parseContigName(genome, contigName, neededSize, &locationOfContig, &internalContigNum, fields, lengths)) { - WriteErrorMessage("SAMFormat::getSortInfo(2): reallocated buffer size is still too small.\n"); - soft_exit(1); - } + posField = SAMReader::POS; + rField = SAMReader::RNAME; + } + + const size_t contigNameBufferSize = 512; // We do a static buffer with reallocation so that in the usual case there is no dynamic memory allocation. If you have enormous contig names, you'll just run a little slower + char contigNameBuffer[contigNameBufferSize]; + char *contigName = contigNameBuffer; + GenomeLocation locationOfContig; + size_t neededSize; + InternalContigNum internalContigNum; + if (0 != (neededSize = SAMReader::parseContigName(genome, contigName, contigNameBufferSize, &locationOfContig, &internalContigNum, fields, lengths, rField))) { + // + // Need a bigger buffer. + // + contigName = new char[neededSize]; + if (0 != SAMReader::parseContigName(genome, contigName, neededSize, &locationOfContig, &internalContigNum, fields, lengths, rField)) { + WriteErrorMessage("SAMFormat::getSortInfo: reallocated buffer size is still too small.\n"); // This really shouldn't happen + soft_exit(1); } + } - if (o_location != NULL) { - *o_location = SAMReader::parseLocation(locationOfContig, fields, lengths); - } + *o_location = SAMReader::parseLocation(locationOfContig, fields, lengths, rField, posField, o_pos); - if (o_refID != NULL) { - *o_refID = genome->getContigByInternalNumber(internalContigNum)->originalContigNumber; - } + if (o_refID != NULL) { + *o_refID = genome->getContigByInternalNumber(internalContigNum)->originalContigNumber; + } - if (contigName != contigNameBuffer) { - delete[] contigName; - } + if (contigName != contigNameBuffer) { + delete[] contigName; } } @@ -1174,7 +1178,8 @@ SAMFormat::getWriterSupplier( } else { dataSupplier = DataWriterSupplier::create(options->outputFile.fileName, options->writeBufferSize, options->emitInternalScore, options->internalScoreTag); } - return ReadWriterSupplier::create(this, dataSupplier, genome, options->killIfTooSlow, options->emitInternalScore, options->internalScoreTag, options->ignoreAlignmentAdjustmentsForOm); + return ReadWriterSupplier::create(this, dataSupplier, genome, options->killIfTooSlow, options->emitInternalScore, options->internalScoreTag, options->ignoreAlignmentAdjustmentsForOm, + options->matchReward, options->subPenalty, options->gapOpenPenalty, options->gapExtendPenalty); } bool @@ -1627,24 +1632,20 @@ SAMFormat::writePairs( cigar[whichRead] = "*"; editDistance[whichRead] = -1; result->direction[whichRead] = FORWARD; - } - else { + } else { if (addFrontClipping < 0) { // Insertion (soft-clip) cumulativePositiveAddFrontClipping[firstOrSecond] += addFrontClipping; if (result->direction[whichRead] == FORWARD) { reads[whichRead]->setAdditionalFrontClipping(-cumulativePositiveAddFrontClipping[firstOrSecond]); - } - else { + } else { reads[whichRead]->setAdditionalBackClipping(-cumulativePositiveAddFrontClipping[firstOrSecond]); } - } - else { // Deletion + } else { // Deletion locations[whichRead] += addFrontClipping; } } } - } - else { + } else { cigar[whichRead] = computeCigarString(context.genome, lv, cigarBuf[whichRead], cigarBufSize, cigarBufWithClipping[whichRead], cigarBufWithClippingSize, clippedData[whichRead], clippedLength[whichRead], basesClippedBefore[whichRead], extraBasesClippedBefore[whichRead], basesClippedAfter[whichRead], read->getOriginalFrontHardClipping(), read->getOriginalBackHardClipping(), locations[whichRead], result->direction[whichRead], useM, @@ -1664,8 +1665,7 @@ SAMFormat::writePairs( cigar[whichRead] = "*"; editDistance[whichRead] = -1; result->direction[whichRead] = FORWARD; - } - else { + } else { if (addFrontClipping > 0) { cumulativePositiveAddFrontClipping[firstOrSecond] += addFrontClipping; reads[whichRead]->setAdditionalFrontClipping(cumulativePositiveAddFrontClipping[firstOrSecond]); @@ -1789,8 +1789,7 @@ SAMFormat::writePairs( soft_exit(1); } snprintf(libraryString, libraryStringSize, "\tLB:Z:%.*s", (int)libraryLength, library); - } - else { + } else { libraryString[0] = '\0'; } @@ -2395,8 +2394,7 @@ SAMFormat::computeCigar( // mapping, we'll refine it later if needed. // *o_extraBasesClippedAfter = genomeLocation + dataLength - (contig->beginningLocation + contig->length - genome->getChromosomePadding()); - } - else { + } else { *o_extraBasesClippedAfter = 0; } @@ -2527,18 +2525,23 @@ SAMFormat::computeCigarString( char clipAfter[16] = {'\0'}; char hardClipBefore[16] = {'\0'}; char hardClipAfter[16] = {'\0'}; + if (frontHardClipping > 0) { snprintf(hardClipBefore, sizeof(hardClipBefore), "%uH", frontHardClipping); } + if (basesClippedBefore + extraBasesClippedBefore > 0) { snprintf(clipBefore, sizeof(clipBefore), "%lluS", basesClippedBefore + extraBasesClippedBefore); } + if (basesClippedAfter + extraBasesClippedAfter > 0) { snprintf(clipAfter, sizeof(clipAfter), "%lluS", basesClippedAfter + extraBasesClippedAfter); } + if (backHardClipping > 0) { snprintf(hardClipAfter, sizeof(hardClipAfter), "%uH", backHardClipping); } + snprintf(cigarBufWithClipping, cigarBufWithClippingLen, "%s%s%s%s%s", hardClipBefore, clipBefore, cigarBuf, clipAfter, hardClipAfter); validateCigarString(genome, cigarBufWithClipping, cigarBufWithClippingLen, @@ -2595,16 +2598,14 @@ SAMFormat::computeCigarString( if (*o_editDistance == -2) { WriteErrorMessage("WARNING: computeGlobalScore returned -2; cigarBuf may be too small\n"); return "*"; - } - else if (*o_editDistance == -1) { + } else if (*o_editDistance == -1) { static bool warningPrinted = false; if (!warningPrinted) { WriteErrorMessage("WARNING: computeGlobalScore returned -1; this shouldn't happen\n"); warningPrinted = true; } return "*"; - } - else { + } else { // There may be a better way to do this. For now, whenever we see tail insertions, soft-clip them basesClippedAfter += backClippingMissedByLV; dataLength -= backClippingMissedByLV; @@ -2619,18 +2620,23 @@ SAMFormat::computeCigarString( char clipAfter[16] = { '\0' }; char hardClipBefore[16] = { '\0' }; char hardClipAfter[16] = { '\0' }; + if (frontHardClipping > 0) { snprintf(hardClipBefore, sizeof(hardClipBefore), "%uH", frontHardClipping); } + if (basesClippedBefore + extraBasesClippedBefore > 0) { snprintf(clipBefore, sizeof(clipBefore), "%lluS", basesClippedBefore + extraBasesClippedBefore); } + if (basesClippedAfter + extraBasesClippedAfter > 0) { snprintf(clipAfter, sizeof(clipAfter), "%lluS", basesClippedAfter + extraBasesClippedAfter); } + if (backHardClipping > 0) { snprintf(hardClipAfter, sizeof(hardClipAfter), "%uH", backHardClipping); } + snprintf(cigarBufWithClipping, cigarBufWithClippingLen, "%s%s%s%s%s", hardClipBefore, clipBefore, cigarBuf, clipAfter, hardClipAfter); validateCigarString(genome, cigarBufWithClipping, cigarBufWithClippingLen, @@ -2660,6 +2666,7 @@ SAMFormat::getRefSpanFromCigar(const char * cigarBuf, int cigarBufLen, int* refS while ('0' <= *nextChunkOfCigar && '9' >= *nextChunkOfCigar) { nextChunkOfCigar++; } + nextChunkOfCigar++; while ('\0' != *nextChunkOfCigar) { fieldsScanned = sscanf(nextChunkOfCigar, "%d%c", &len, &op); @@ -2682,13 +2689,17 @@ SAMFormat::printRead(const Genome* genome, Read* read, GenomeLocation location) if (read) { const char* read_data = read->getUnclippedData(); const char* readId = read->getId(); + for (unsigned i = 0; i < read->getIdLength(); ++i) { printf("%c", readId[i]); } + printf(","); + for (unsigned i = 0; i < read->getUnclippedLength(); ++i) { printf("%c", read_data[i]); } + printf("\n"); printf("Aligned location %s:%llu\n", genome->getContigAtLocation(location)->name, @@ -2704,15 +2715,13 @@ SAMFormat::validateCigarString( const char *nextChunkOfCigar = cigarBuf; GenomeDistance offsetInData = 0; const char *reference = genome->getSubstring(genomeLocation, dataLength); + if (NULL == reference) { - #ifdef _DEBUG - WriteErrorMessage("validateCigarString: couldn't look up genome data for location %lld\n", genomeLocation.location); - #else - WriteErrorMessage("validateCigarString: couldn't look up genome data for location %lld\n", genomeLocation); - #endif + WriteErrorMessage("validateCigarString: couldn't look up genome data for location %lld\n", GenomeLocationAsInt64(genomeLocation)); printRead(genome, read, genomeLocation); soft_exit(1); } + GenomeDistance offsetInReference = 0; bool sawNonH = false; // This is to make sure that the clipping types (H & S) occur only at the beginning or end of the cigar string. bool sawTailS = false; // Did we see a S @@ -3250,6 +3259,7 @@ SAMFilter::updateSAMLine( char* prev = fields[SAMReader::OPT]; // // We remove either the QS: or LB: fields at the end of the SAM record to make space for the new flag. Otherwise we will run out of space in buffer. + // XXX: This is just awful and wrong. We must fix this. // size_t lengthOPTExcludingLastField = 0; bool seenPG = false; @@ -3260,8 +3270,18 @@ SAMFilter::updateSAMLine( } prev = p; } + + // + // Trim a trailing \t if it's there. + // + if (lengthOPTExcludingLastField > 0 && fields[SAMReader::OPT][lengthOPTExcludingLastField - 1] == '\t') { + lengthOPTExcludingLastField--; + } + lengths[SAMReader::OPT] = lengthOPTExcludingLastField; + + // FIXME: needs to be changed if more mandatory SAM fields are added int charsInString; if (seenPG) { @@ -3278,8 +3298,7 @@ SAMFilter::updateSAMLine( (int)lengths[SAMReader::SEQ], fields[SAMReader::SEQ], (int)lengths[SAMReader::QUAL], fields[SAMReader::QUAL], (int)lengths[SAMReader::OPT], fields[SAMReader::OPT]); - } - else { + } else { charsInString = snprintf(toBuffer + *toUsed, lineLength, "%.*s\t%d\t%.*s\t%.*s\t%.*s\t%.*s\t%.*s\t%.*s\t%.*s\t%.*s\t%.*s\tPG:Z:SNAP\t%.*s\n", (int)lengths[SAMReader::QNAME], fields[SAMReader::QNAME], flag, @@ -3298,7 +3317,7 @@ SAMFilter::updateSAMLine( *toUsed += charsInString; if (charsInString > bytes || charsInString > lineLength) { - WriteErrorMessage("charsInString %d is larger than bytes %lld, linelength %lld\n", charsInString, bytes, lineLength); + WriteErrorMessage("SAMFilter::updateSAMLine: charsInString %d is larger than bytes %lld, linelength %lld\n", charsInString, bytes, lineLength); soft_exit(1); } } @@ -3585,14 +3604,6 @@ class SAMDupMarkFilter : public SAMFilter buffer = NULL; } -#ifdef USE_DEVTEAM_OPTIONS - if (mates.size() > 0) { - WriteErrorMessage("duplicate matching ended with %d unmatched reads:\n", mates.size()); - for (MateMap::iterator i = mates.begin(); i != mates.end(); i = mates.next(i)) { - WriteErrorMessage("%u%s/%u%s\n", GenomeLocationAsInt64(i->key.locations[0]), i->key.isRC[0] ? "rc" : "", GenomeLocationAsInt64(i->key.locations[1]), i->key.isRC[1] ? "rc" : ""); - } - } -#endif run.clear(); runFragment.clear(); } @@ -3642,8 +3653,7 @@ SAMDupMarkFilter::updateBuffer(size_t lastByte) if (!offsets[i].isDuplicate) { memcpy(buffer + bufferUsed, currentBuffer + offsets[i].offset, recordSize); bufferUsed += recordSize; - } - else { + } else { updateSAMLine(buffer, &bufferUsed, currentBuffer + offsets[i].offset, recordSize); } } @@ -3740,8 +3750,7 @@ SAMDupMarkFilter::onNextBatch( runFragment.push_back(entryFragment); } - } - else { + } else { // // Track the read from which we need to start the next run. Next run starts at nextBatchStart // @@ -3778,8 +3787,7 @@ SAMDupMarkFilter::onNextBatch( runFragment.push_back(entryFragment); } - } - else { + } else { // // We are done with the run. Begin marking duplicates // @@ -3815,8 +3823,7 @@ SAMDupMarkFilter::onNextBatch( offsets.clear(); - } - else if (offsets[unfinishedRunStart].offset == 0) { // we did not yet mark duplicates for this run + } else if (offsets[unfinishedRunStart].offset == 0) { // we did not yet mark duplicates for this run // // If we have a different run from what we have seen before, // simply mark all duplicates in the run we currently have @@ -3830,13 +3837,11 @@ SAMDupMarkFilter::onNextBatch( memcpy(currentBuffer + offsets[0].offset, buffer, bufferUsed); offsets.clear(); - } - else { + } else { *needMoreBuffer = true; return 0; } - } - else { + } else { bytesRead = offsets[unfinishedRunStart].offset; updateBuffer(bytesRead); @@ -3851,7 +3856,9 @@ SAMDupMarkFilter::onNextBatch( OffsetInfo oinfo(offsets[i].offset - offsets[unfinishedRunStart].offset, offsets[i].isDuplicate); nextBatchOffsets.push_back(oinfo); } + offsets.clear(); + for (size_t i = 0; i < nextBatchOffsets.size(); i++) { offsets.push_back(nextBatchOffsets[i]); } @@ -3982,8 +3989,7 @@ SAMDupMarkFilter::dupMarkBatch(size_t runStartIndex) { info = &fragments[key]; //fprintf(stderr, "add %u%s/%u%s -> %d\n", key.locations[0], key.isRC[0] ? "rc" : "", key.locations[1], key.isRC[1] ? "rc" : "", mates.size()); info->isMateMapped = (i->flag & SAM_MULTI_SEGMENT) != 0 && (i->flag & SAM_NEXT_UNMAPPED) == 0; - } - else { + } else { info = &f->value; } bool mateMapped = (i->flag & SAM_MULTI_SEGMENT) != 0 && (i->flag & SAM_NEXT_UNMAPPED) == 0; @@ -4000,12 +4006,10 @@ SAMDupMarkFilter::dupMarkBatch(size_t runStartIndex) { info->setBestReadId(i->qName); info->isMateMapped = true; info->setBestTileXY(tile, x, y); - } - else { + } else { info->checkBestRecord(i, totalQuality, tile, x, y, offsets[offsetIndex].isDuplicate); } - } - else { + } else { // // No best read pair found so far. // diff --git a/SNAPLib/SAM.h b/SNAPLib/SAM.h index 3dfe9817..d4543306 100644 --- a/SNAPLib/SAM.h +++ b/SNAPLib/SAM.h @@ -125,7 +125,7 @@ class SAMReader : public ReadReader { size_t contigNameBufferSize, GenomeLocation * o_locationOfContig, InternalContigNum* o_indexOfContig, char* field[], size_t fieldLength[], unsigned rfield = RNAME); // Returns 0 on success, needed contigNameBufferSize otherwise. - static GenomeLocation parseLocation(GenomeLocation locationOfContig, char* field[], size_t fieldLength[], unsigned rfield = RNAME, unsigned posfield = POS); + static GenomeLocation parseLocation(GenomeLocation locationOfContig, char* field[], size_t fieldLength[], unsigned rfield = RNAME, unsigned posfield = POS, int *o_pos = NULL); virtual bool getNextRead(Read *read, AlignmentResult *alignmentResult, GenomeLocation *genomeLocation, Direction *direction, unsigned *mapQ, unsigned *flag, bool ignoreEndOfRange, const char **cigar); diff --git a/SNAPLib/Util.cpp b/SNAPLib/Util.cpp index 9cdb7903..b2b4b164 100644 --- a/SNAPLib/Util.cpp +++ b/SNAPLib/Util.cpp @@ -26,6 +26,7 @@ Revision History: #include "Util.h" #include "Error.h" #include "GenericFile.h" +#include "DataReader.h" _int64 FirstPowerOf2GreaterThanOrEqualTo(_int64 value) @@ -274,3 +275,100 @@ char *reallocatingFgetsGenericFile(char **buffer, int *io_bufferSize, GenericFil GenericFileFGetsObject fgetsObject(file); return genericReallocatingFgets(buffer, io_bufferSize, &fgetsObject); } + +class DataReaderFGetsObject : public FgetsObject +{ +public: + DataReaderFGetsObject(DataReader* _dataReader) : dataReader(_dataReader) {} + + virtual char* fgets(char* s, int size) { + char* buffer = NULL; + _int64 validBytes = 0; + _int64 offsetInBuffer = 0; + int bytesCopied = 0; + int amountAlreadyAdvanced = 0; + + while (bytesCopied < size - 1) { + if (offsetInBuffer >= validBytes) { + dataReader->advance(offsetInBuffer); + + if (!dataReader->getData(&buffer, &validBytes)) { + if (dataReader->isEOF()) { + if (bytesCopied != 0) { + s[bytesCopied] = '\0'; + return s; + } + return NULL; + } + dataReader->nextBatch(); + + if (!dataReader->getData(&buffer, &validBytes)) { + if (dataReader->isEOF()) { + if (bytesCopied != 0) { + s[bytesCopied] = '\0'; + return s; + } + return NULL; + } + WriteErrorMessage("DataReaderFGetsObject: getData failed not at EOF\n"); + soft_exit(1); + } // getData failed (twice) + + offsetInBuffer = 0; + } // getData failed (once) + } // If we're past the end of the buffer + + s[bytesCopied] = buffer[offsetInBuffer]; + offsetInBuffer++; + bytesCopied++; + + if (s[bytesCopied-1] == '\n') { + break; + } + + + } // while we haven't run out of space to copy into (or hit a newline, which exits in the middle of the loop) + + s[bytesCopied] = '\0'; + dataReader->advance(offsetInBuffer); + + return s; + } // fgets + + +private: + DataReader* dataReader; +}; + +char* reallocatingFgets(char** buffer, int* io_bufferSize, DataReader* stream) +{ + DataReaderFGetsObject fgetsObject(stream); + return genericReallocatingFgets(buffer, io_bufferSize, &fgetsObject); +} + +bool isFilenameGzip(const char* filename) +{ + return util::stringEndsWith(filename, ".gz") || util::stringEndsWith(filename, ".gzip"); +} + +DataReader* getDefaultOrGzipDataReader(const char* filename) +{ + DataSupplier* dataSupplier; + size_t filenameLen = strlen(filename); + if (isFilenameGzip(filename)) { + dataSupplier = DataSupplier::GzipDefault; + } else { + dataSupplier = DataSupplier::Default; + } + + DataReader* dataReader = dataSupplier->getDataReader(3, 1024, 10.0, 1024 * 1024); + + if (!dataReader->init(filename)) { + delete dataReader; + return NULL; + } + + dataReader->reinit(0, 0); + + return dataReader; +} // getDefaultOrGzipDataReader \ No newline at end of file diff --git a/SNAPLib/Util.h b/SNAPLib/Util.h index 3c6df11e..1aa3a101 100644 --- a/SNAPLib/Util.h +++ b/SNAPLib/Util.h @@ -6,6 +6,7 @@ #include "Tables.h" #include "exit.h" #include "GenericFile.h" + using std::max; using std::min; @@ -541,7 +542,12 @@ class NWaiter // Version of fgets that dynamically (re-)allocates the buffer to be big enough to fit the whole line // char *reallocatingFgets(char **buffer, int *io_bufferSize, FILE *stream); +class DataReader; // Need this to avoid an include file loop +char* reallocatingFgets(char** buffer, int* io_bufferSize, DataReader *stream); char *reallocatingFgetsGenericFile(char **buffer, int *io_bufferSize, GenericFile *file); #define xassert(cond) {if (!(cond)) {WriteErrorMessage("xassert failed at %s:%d\n", __FILE__, __LINE__); *((volatile int*) NULL) = 42; }} // An assert that gets checked even when not in the debug build. +bool isFilenameGzip(const char* filename); +DataReader* getDefaultOrGzipDataReader(const char* filename); + diff --git a/docs/QuickStart.docx b/docs/QuickStart.docx index f566218d..da31c184 100644 Binary files a/docs/QuickStart.docx and b/docs/QuickStart.docx differ diff --git a/docs/QuickStart.pdf b/docs/QuickStart.pdf index a65fb375..de3873d8 100644 Binary files a/docs/QuickStart.pdf and b/docs/QuickStart.pdf differ diff --git a/docs/SNAP User Manual.docx b/docs/SNAP User Manual.docx index 2ac9f5a9..137f3eda 100644 Binary files a/docs/SNAP User Manual.docx and b/docs/SNAP User Manual.docx differ diff --git a/docs/SNAP User Manual.pdf b/docs/SNAP User Manual.pdf index a4d8e5f5..fbbfe7a9 100644 Binary files a/docs/SNAP User Manual.pdf and b/docs/SNAP User Manual.pdf differ