diff --git a/CHANGES b/CHANGES index b97bbbb3..03f498e3 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,5 @@ +Fixed a bug with genome generation for annotations without splice junctions. +For STARlong, increased compilation-time max read length to 500000 and max number of exons to 1000. Implemented on the fly insertion of the extra sequences into the genome indexes. Implemented --alignEndsType Extend3pOfRead1 option for full extension of the 3' end of read 1. Fixed a bug in the --alignEndsType Extend5pOfRead1 option. diff --git a/bin/Linux_x86_64/STAR b/bin/Linux_x86_64/STAR index 3cd5c558..aace3cd7 100755 Binary files a/bin/Linux_x86_64/STAR and b/bin/Linux_x86_64/STAR differ diff --git a/bin/Linux_x86_64/STARlong b/bin/Linux_x86_64/STARlong index 1ff8324a..0f805101 100755 Binary files a/bin/Linux_x86_64/STARlong and b/bin/Linux_x86_64/STARlong differ diff --git a/bin/Linux_x86_64_static/STAR b/bin/Linux_x86_64_static/STAR index af17be89..00ddf3f6 100755 Binary files a/bin/Linux_x86_64_static/STAR and b/bin/Linux_x86_64_static/STAR differ diff --git a/bin/Linux_x86_64_static/STARlong b/bin/Linux_x86_64_static/STARlong index 5d7a6419..7e0a7e44 100755 Binary files a/bin/Linux_x86_64_static/STARlong and b/bin/Linux_x86_64_static/STARlong differ diff --git a/extras/scripts/sjFromSAMcollapseUandM_inclOverlaps.awk b/extras/scripts/sjFromSAMcollapseUandM_inclOverlaps.awk new file mode 100644 index 00000000..cc731a49 --- /dev/null +++ b/extras/scripts/sjFromSAMcollapseUandM_inclOverlaps.awk @@ -0,0 +1,55 @@ +BEGIN { + OFS="\t"; + mapqU=255; +} +{ +if (substr($1,1,1)!="@") { + + m=and($2,0x80)/0x80+1; + + if ($1!=readNameOld) delete readSJs; + readNameOld=$1; + + n=split($6,L,/[A-Z]/)-1; + split($6,C,/[0-9]*/); + t=1;g=$4; + for (k=1;k<=n;k++) {#scan through CIGAR operations + if (C[k+1]=="S" || C[k+1]=="I") { + t+=L[k]; + } else if (C[k+1]=="D") { + g+=L[k]; + } else if (C[k+1]=="N") { + sj1=$3 "\t" g "\t" g+L[k]-1; + readSJs[sj1]++; + + if (readSJs[sj1]==1) {#only count 
this junction if it has not been counted for the same read + SJ[sj1]=1; + if ($5>=mapqU) { + SJu[sj1]++; + } else { + SJm[sj1]++; + }; + }; + + if ($5>=mapqU) { + SJu1[sj1]++; + } else { + SJm1[sj1]++; + }; + + g+=L[k]; + + } else { # M operation + g+=L[k]; + t+=L[k]; + }; + }; +}; +}; +END { + +for (ii in SJ) { + print ii, SJu[ii]+0, SJm[ii]+0, SJu1[ii]+0, SJm1[ii]+0; +}; + +}; diff --git a/source/IncludeDefine.h b/source/IncludeDefine.h index b99617f1..90b056a6 100644 --- a/source/IncludeDefine.h +++ b/source/IncludeDefine.h @@ -104,7 +104,7 @@ typedef uint8_t uint8; #define MAX_N_MATES 2 #define DEF_readNameLengthMax 50000 #if defined COMPILE_FOR_LONG_READS - #define DEF_readSeqLengthMax 50000 + #define DEF_readSeqLengthMax 500000 #else #define DEF_readSeqLengthMax 500 #endif diff --git a/source/genomeGenerate.cpp b/source/genomeGenerate.cpp index 61732d0f..cb1983a2 100644 --- a/source/genomeGenerate.cpp +++ b/source/genomeGenerate.cpp @@ -405,6 +405,7 @@ void genomeGenerate(Parameters *P) { //write an extra 0 at the end of the array, filling the last bytes that otherwise are not accessible, but will be written to disk //this is - to avoid valgrind complaints. Note that SA2 is allocated with plenty of space to spare. + P->nSAbyte=mainGenome.SA.lengthByte; SA2.writePacked(P->nSA,0); };