Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 66 additions & 82 deletions TAXAassign.sh
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# **************************************************************/

HELPDOC=$( cat <<EOF
HELPDOC="
Script to annotate sequences at different taxonomic levels using NCBI's taxonomy

Usage:
Expand All @@ -94,18 +94,19 @@ Options:
-r Number of reference matches (Default: 10)
-m Minimum percentage identity in blastn (Default: 97)
-q Minimum query coverage in blastn (Default: 97)
-a Threshold at different taxonomic levels (Default:"-m,-m,-m,-m,-m,-m" where -m is the minimum percentage identity argument)
-a Threshold at different taxonomic levels (Default: \"-m,-m,-m,-m,-m,-m\" where -m is the minimum percentage identity argument)
The order is as follows: Phylum,Class,Order,Family,Genus,Species
For example, -a "60,70,80,95,95,97"
For example, -a \"60,70,80,95,95,97\"
-t Consensus threshold (Default: 90)

EOF
)
"

set -o errexit

CURRENT_DIR=$(pwd)

# = Parameters to set ============== #
LOGFILE="`pwd`/TAXAassign.log" # Where to save the log
LOGFILE="${CURRENT_DIR}/TAXAassign.log" # Where to save the log
BLASTN_DIR="/home/opt/ncbi-blast-2.2.28+/bin"; # Path where blastn is installed
BLASTDB_DIR="/home/opt/ncbi-blast-2.2.28+/db"; # Path where nt is installed
FASTA_FILE="" # This field should be empty
Expand All @@ -118,8 +119,6 @@ CONSENSUS_THRESHOLD=90
TAXONOMIC_LEVELS_THRESHOLD=""
# =/Parameters to set ============== #

CURRENT_DIR=`pwd`

# = Enable FP support ============== #
# By default, there is limited capability in bash to handle floating point
# operations. In this script bc is used to calculate the floating point operations.
Expand All @@ -133,10 +132,12 @@ function float_eval()
{
local stat=0
local result=0.0
if [[ $# -gt 0 ]]; then
result=$(echo "scale=$float_scale; $*" | bc -q 2>/dev/null)
if [[ $# -gt 0 ]] ; then
result=$(echo "scale=$float_scale; $*" | bc -q 2> /dev/null)
stat=$?
if [[ $stat -eq 0 && -z "$result" ]]; then stat=1; fi
if [[ $stat -eq 0 && -z "$result" ]] ; then
stat=1
fi
fi
echo $result
return $stat
Expand All @@ -148,10 +149,14 @@ function float_eval()
function float_cond()
{
local cond=0
if [[ $# -gt 0 ]]; then
cond=$(echo "$*" | bc -q 2>/dev/null)
if [[ -z "$cond" ]]; then cond=0; fi
if [[ "$cond" != 0 && "$cond" != 1 ]]; then cond=0; fi
if [[ $# -gt 0 ]] ; then
cond=$(echo "$*" | bc -q 2> /dev/null)
if [[ -z "$cond" ]] ; then
cond=0
fi
if [[ "$cond" != 0 && "$cond" != 1 ]] ; then
cond=0
fi
fi
local stat=$((cond == 0))
return $stat
Expand All @@ -166,45 +171,27 @@ function ceil () {
# =/Enable FP support ============== #


# Create directories if they don't exist yet
function create_dirs() {
local dir
for dir in "$@"
do
if [ ! -d "$dir" ]; then
mkdir "$dir"
fi
done
}

# Check if files exist
function check_prog() {
local prog
for prog in "$@"
do
if which $prog >/dev/null; then
TAXAassign_print 'Using ' $prog
else
echo "$prog not in your path" >&2; exit 1;
fi

for prog in "$@" ; do
if which $prog > /dev/null ; then
TAXAassign_print 'Using ' $prog
else
echo "$prog not in your path" >&2
exit 1
fi
done
}

function skip_gen_file(){
if [ -f "$1" ]; then
echo "true"
else
echo "false"
fi
function skip_gen_file() {
# Is it a file?
[[ -f "$1" ]] && echo "true" || echo "false"
}

function skip_gen_dir(){
if [ -d "$1" ]; then
echo "true"
else
echo "false"
fi
function skip_gen_dir() {
# Is it a directory?
[[ -d "$1" ]] && echo "true" || echo "false"
}

function TAXAassign_print() {
Expand All @@ -213,50 +200,50 @@ function TAXAassign_print() {


# Parse options
while getopts ":phc:r:m:f:t:q:a:" opt; do
while getopts ":phc:r:m:f:t:q:a:" opt ; do
case $opt in
p)
PARALLELIZE_FLAG=1
;;
f)
FASTA_FILE=$OPTARG
;;
m)
MINIMUM_PERCENT_IDENT=$OPTARG
;;
m)
MINIMUM_PERCENT_IDENT=$OPTARG
;;
c)
NUMBER_OF_CORES=$OPTARG
;;
r)
NUMBER_OF_REFERENCE_MATCHES=$OPTARG
;;
t)
CONSENSUS_THRESHOLD=$OPTARG
;;
q)
MINIMUM_QUERY_COVERAGE=$OPTARG
;;
a)
TAXONOMIC_LEVELS_THRESHOLD=$OPTARG
;;
t)
CONSENSUS_THRESHOLD=$OPTARG
;;
q)
MINIMUM_QUERY_COVERAGE=$OPTARG
;;
a)
TAXONOMIC_LEVELS_THRESHOLD=$OPTARG
;;
h)
echo "$HELPDOC"
echo "$HELPDOC" 1>&2
exit 0
;;
\?)
echo "$HELPDOC"
echo "Invalid option: -$OPTARG" >&2
echo "$HELPDOC" 1>&2
echo "Invalid option: -$OPTARG" 1>&2
exit 1
;;
esac
done
if [ -z $FASTA_FILE ]
then
echo "$HELPDOC"

if [ -z $FASTA_FILE ] ; then
echo "$HELPDOC" 1>&2
exit 1
fi

if [ -z "$TAXONOMIC_LEVELS_THRESHOLD" ]; then
if [ -z "$TAXONOMIC_LEVELS_THRESHOLD" ] ; then
TAXONOMIC_LEVEL_THRESHOLD="$MINIMUM_PERCENT_IDENT,$MINIMUM_PERCENT_IDENT,$MINIMUM_PERCENT_IDENT,$MINIMUM_PERCENT_IDENT,$MINIMUM_PERCENT_IDENT,$MINIMUM_PERCENT_IDENT"
fi

Expand All @@ -265,23 +252,21 @@ IFS=",";
TLTArray=($TAXONOMIC_LEVELS_THRESHOLD);
IFS=$OIFS;

if [ "${#TLTArray[@]}" != "6" ]; then
echo "$HELPDOC"
if [ "${#TLTArray[@]}" != "6" ] ; then
echo "$HELPDOC" 1>&2
exit 1
fi

for i in "${TLTArray[@]}"
do
:
if ! [[ $i =~ ^-?[0-9]+$ ]]; then
echo "$HELPDOC"
for i in "${TLTArray[@]}" ; do
if ! [[ $i =~ ^-?[0-9]+$ ]] ; then
echo "$HELPDOC" 1>&2
exit 1
fi
done


if ! [[ $MINIMUM_PERCENT_IDENT =~ ^-?[0-9]+$ ]] || ! [[ $NUMBER_OF_CORES =~ ^-?[0-9]+$ ]] || ! [[ $NUMBER_OF_REFERENCE_MATCHES =~ ^-?[0-9]+$ ]] || ! [[ $CONSENSUS_THRESHOLD =~ ^-?[0-9]+$ ]] || ! [[ $MINIMUM_QUERY_COVERAGE =~ ^-?[0-9]+$ ]]; then
echo "$HELPDOC"
echo "$HELPDOC" 1>&2
exit 1
fi

Expand All @@ -293,29 +278,27 @@ TAXAassign_print "TAXAassign v0.4. Copyright (c) 2013 Computational Microbial Ge
check_prog $BLASTN_DIR/blastn
check_prog $TAXAASSIGN_DIR/scripts/blast_concat_taxon.py
check_prog $TAXAASSIGN_DIR/scripts/blast_gen_assignments.pl
if [ $PARALLELIZE_FLAG -eq 1 ]
then
if [ $PARALLELIZE_FLAG -eq 1 ] ; then
check_prog parallel
fi


fileName=`echo "$(basename $FASTA_FILE)" | cut -d'.' -f1`
fileName=$(basename $FASTA_FILE | cut -d'.' -f1)

# Run blastn
TAXAassign_print "Blast against NCBI's nt database with minimum percent ident of $MINIMUM_PERCENT_IDENT%, maximum of $NUMBER_OF_REFERENCE_MATCHES reference sequences, and evalue of 0.0001 in blastn."
# Format for blastn
blastOutFmt="\"6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qcovs staxids\""
blastFileName=$fileName'_B'
if [ "$(skip_gen_file $blastFileName'.out')" == "true" ];then
if [ "$(skip_gen_file $blastFileName'.out')" == "true" ] ; then
TAXAassign_print $blastFileName'.out' already exists. Skipping this step.
elif [ $PARALLELIZE_FLAG -eq 1 ]; then

elif
[ $PARALLELIZE_FLAG -eq 1 ] ; then
# Get the file size in KB
sizeFileBytes=$(du -b ${FASTA_FILE} | sed 's/\([0-9]*\)\(.*\)/\1/')
sizeChunks=$(ceil $(float_eval "$sizeFileBytes / ($NUMBER_OF_CORES * 1024)"))
sizeChunksString="${sizeChunks}k"
startTime=`date +%s`

cat $FASTA_FILE | parallel --block $sizeChunksString --recstart '>' --pipe $BLASTN_DIR/blastn -perc_identity $MINIMUM_PERCENT_IDENT -evalue 0.00001 -dust no -num_threads 1 -outfmt $blastOutFmt -max_target_seqs $NUMBER_OF_REFERENCE_MATCHES -db $BLASTDB_DIR'/nt' -query - > $blastFileName'.out'

TAXAassign_print "blastn using GNU parallel took $(expr `date +%s` - $startTime) seconds for $FASTA_FILE".
Expand All @@ -333,7 +316,7 @@ blastFilteredFileName=$fileName'_BF'
if [ "$(skip_gen_file $blastFilteredFileName'.out')" == "true" ];then
TAXAassign_print $blastFilteredFileName'.out' already exists. Skipping this step.
else
cat $blastFileName'.out' | awk -F"\t" -v pattern=$MINIMUM_QUERY_COVERAGE '$13>pattern{print $0}' > $blastFilteredFileName'.out'
cat $blastFileName'.out' | awk -F"\t" -v pattern=$MINIMUM_QUERY_COVERAGE '$13>pattern{print $0}' > $blastFilteredFileName'.out'
TAXAassign_print $blastFilteredFileName'.out' generated successfully!
fi

Expand Down Expand Up @@ -400,3 +383,4 @@ else
TAXAassign_print "Sequences assigned at species level: $speciesLevelAssignments/$totalReads ($(float_eval "($speciesLevelAssignments / $totalReads) * 100")%)"
fi

exit 0