diff --git a/src/convertNR.cpp b/src/convertNR.cpp index 394fc0c..48214ba 100644 --- a/src/convertNR.cpp +++ b/src/convertNR.cpp @@ -41,11 +41,12 @@ int main(int argc, char **argv) { bool verbose = false; bool debug = false; bool addAcc = false; + bool all_taxa = false; // --------------------- START ------------------------------------------------------------------ // Read command line params int c; - while ((c = getopt(argc, argv, "ahdvrl:g:t:i:o:")) != -1) { + while ((c = getopt(argc, argv, "ahdvrAl:g:t:i:o:")) != -1) { switch (c) { case 'h': usage(argv[0]); @@ -55,6 +56,8 @@ int main(int argc, char **argv) { verbose = true; break; case 'a': addAcc = true; break; + case 'A': + all_taxa = true; break; case 'l': list_filename = optarg; break; case 't': @@ -84,7 +87,8 @@ int main(int argc, char **argv) { parseNodesDmp(*nodes,nodes_file); nodes_file.close(); - if(list_filename.length()==0) { + if(all_taxa) {std::cerr << "Using all taxa, due to '-A' option." << std::endl;} + else if(list_filename.length()==0) { std::cerr << "No taxa list specified, using Archaea, Bacteria, and Viruses." << std::endl; include_ids.insert((uint64_t)2); include_ids.insert((uint64_t)2157); @@ -162,6 +166,9 @@ int main(int argc, char **argv) { inputfile.open(nr_filename); if(!inputfile.is_open()) { error("Could not open file " + nr_filename); exit(EXIT_FAILURE); } } + else { + nr_filename = "std::cin"; + } if(verbose) std::cerr << "Writing to file " << out_filename << std::endl; std::ofstream out_file; @@ -209,7 +216,7 @@ int main(int argc, char **argv) { if(nodes->count(lca)==0) { std::cerr << "Taxon ID " << lca << " not found in taxonomy!" 
<< std::endl; continue; } uint64_t id = lca; while(nodes->count(id)>0 && id != 1) { - if(include_ids.count(id) > 0) { + if(all_taxa || include_ids.count(id) > 0) { keep = true; break; } @@ -264,6 +271,7 @@ void usage(char *progname) { fprintf(stderr, " -o FILENAME Name of output file.\n"); fprintf(stderr, "Optional arguments:\n"); fprintf(stderr, " -a Prefix taxon ID in database names with the first Accession.Ver\n"); + fprintf(stderr, " -A Use all taxids. This overrides -l and the default (only Archaea, Bacteria, and Viruses)\n"); fprintf(stderr, " -i FILENAME Name of NR file. If this option is not used, then the program will read from STDIN.\n"); fprintf(stderr, " -l FILENAME Name of file containing IDs of taxa that will be extracted from the NR file. The IDs must be contained in nodes.dmp.\n"); exit(EXIT_FAILURE); diff --git a/src/makefile b/src/makefile index fedc140..ef35c6a 100644 --- a/src/makefile +++ b/src/makefile @@ -73,10 +73,13 @@ convertNR: makefile bwt/mkbwt Config.o convertNR.o util.o $(BLASTOBJS) clean: - rm -f -v bwt/mkbwt bwt/mkfmi kaiju kaijux kaijup kaiju2krona mergeOutputs kaijuReport convertNR addTaxonNames ../bin/* + rm -f -v bwt/mkbwt bwt/mkfmi kaiju kaijux kaijup kaiju2krona mergeOutputs kaijuReport convertNR addTaxonNames find . 
-name "*.o" -delete $(MAKE) -C bwt/ clean +uninstall: + rm -fv ../bin/* + static: LDFLAGS = -static static: LDLIBS = $(LD_LIBS_STATIC) static: all diff --git a/util/makeDB.sh b/util/makeDB.sh index c09d820..aa6cb7e 100755 --- a/util/makeDB.sh +++ b/util/makeDB.sh @@ -12,6 +12,7 @@ db_refseq=0 db_progenomes=0 db_nr=0 db_euk=0 +db_nr_all=0 db_mar=0 db_plasmids=0 threadsBWT=5 @@ -37,6 +38,9 @@ echo echo "$s" -n NCBI BLAST non-redundant protein database \"nr\": echo "$tab" only Archaea, bacteria, and viruses echo +echo "$s" -N NCBI BLAST non-redundant protein database \"nr\": +echo "$tab" all taxa +echo echo "$s" -e NCBI BLAST non-redundant protein database \"nr\": echo "$tab" like -n, but additionally including fungi and microbial eukaryotes echo @@ -87,6 +91,9 @@ while :; do -e|--euk) db_euk=1 ;; + -N|--nra) + db_nr_all=1 + ;; -v|--viruses) db_viruses=1 ;; @@ -115,7 +122,7 @@ while :; do shift done -[ $db_plasmids -eq 1 -o $db_viruses -eq 1 -o $db_refseq -eq 1 -o $db_progenomes -eq 1 -o $db_nr -eq 1 -o $db_euk -eq 1 -o $db_mar -eq 1 ] || { echo "Error: Use one of the options -r, -p, -n, -v, -l, -m, or -e"; usage; exit 1; } +[ $db_plasmids -eq 1 -o $db_viruses -eq 1 -o $db_refseq -eq 1 -o $db_progenomes -eq 1 -o $db_nr -eq 1 -o $db_euk -eq 1 -o $db_nr_all -eq 1 -o $db_mar -eq 1 ] || { echo "Error: Use one of the options -r, -p, -n, -N, -v, -l, -m, or -e"; usage; exit 1; } #check if necessary programs are in the PATH command -v awk >/dev/null 2>/dev/null || { echo Error: awk not found; exit 1; } @@ -188,7 +195,7 @@ then fi -if [ $db_nr -eq 1 -o $db_euk -eq 1 ] +if [ $db_nr -eq 1 -o $db_euk -eq 1 -o $db_nr_all -eq 1 ] then if [ $DL -eq 1 ] then @@ -217,6 +224,22 @@ then echo Kaiju only needs the files kaiju_db_nr_euk.fmi, nodes.dmp, and names.dmp. echo The remaining files can be deleted.
echo + elif [ $db_nr_all -eq 1 ] + then + if [ $index_only -eq 0 ] + then + echo Converting NR file to Kaiju database + gunzip -c nr.gz | convertNR -t nodes.dmp -g prot.accession2taxid -a -o kaiju_db_nr_all.faa -A + fi + [ -r kaiju_db_nr_all.faa ] || { echo Missing file kaiju_db_nr_all.faa; exit 1; } + echo Creating BWT from Kaiju database + mkbwt -e $exponentSA_NR -n $threadsBWT -a ACDEFGHIKLMNPQRSTVWY -o kaiju_db_nr_all kaiju_db_nr_all.faa + echo Creating FM-index + mkfmi kaiju_db_nr_all + echo Done! + echo Kaiju only needs the files kaiju_db_nr_all.fmi, nodes.dmp, and names.dmp. + echo The remaining files can be deleted. + echo else if [ $index_only -eq 0 ] then