From c4f818f58c6284ca018450427dfb624c10fe7cba Mon Sep 17 00:00:00 2001 From: Lee Katz Date: Thu, 3 Sep 2015 11:20:39 -0400 Subject: [PATCH 1/6] Added usage statement and wget statement * `dbases.xml` was not found when I was running, it and I realized this is the file on the PubMLST site. * It was not clear until I read the code that all the statements are printed to the screen and so I put in a usage statement. --- sbin/mlst-download_pub_mlst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sbin/mlst-download_pub_mlst b/sbin/mlst-download_pub_mlst index f14b6e96..8fd6d948 100755 --- a/sbin/mlst-download_pub_mlst +++ b/sbin/mlst-download_pub_mlst @@ -1,9 +1,14 @@ #!/bin/bash +echo "Prints the commands to download the MLST databases." +echo "Usage: bash $0 | bash" + set -e OUTDIR=pubmlst +wget --update 'http://pubmlst.org/data/dbases.xml' >&2 + for URL in $(grep '' dbases.xml); do # echo $URL URL=${URL//} From ff354f45059e514d74bfaf69c38fc48de61a3850 Mon Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Thu, 15 Dec 2016 10:04:19 -0500 Subject: [PATCH 2/6] gff functionality --- bin/mlst | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/bin/mlst b/bin/mlst index 2bbf946d..89959cf6 100755 --- a/bin/mlst +++ b/bin/mlst @@ -2,8 +2,10 @@ use strict; use File::Spec; use Data::Dumper; +use List::Util qw(min max); use List::MoreUtils qw(pairwise); use File::Temp qw(tempfile); +use File::Basename qw(basename); use FindBin; use lib "$FindBin::RealBin/../perl5"; use MLST::PubMLST; @@ -23,7 +25,7 @@ my $OUTSEP = "\t"; # Command line options my(@Options, $debug, $quiet, $blastdb, $datadir, $threads, - $list, $longlist, $scheme, $minid, $mincov, $csv, $nopath, $novel); + $list, $longlist, $scheme, $minid, $mincov, $csv, $nopath, $novel, $gff); setOptions(); #.............................................................................. @@ -132,7 +134,8 @@ sub find_mlst { my @hit = qx($cmd); # FIXME: we should /sort/ the hits here in case logic below is dodgy? - my %res; + my %res; # schema => {locus => allele, locus => allele, ...} + my %feat; # schema => {locus => gff tab-delimited line, ... } my $res_count=0; foreach (@hit) { @@ -145,6 +148,9 @@ sub find_mlst { next unless $nident/$hlen >= $mincov/100 ; # need min-cov to reach minid + my $strand=($qstart < $qend)?"+":"-"; + my $gffLine=join("\t",$qid, basename($0)."_v$VERSION", "gene", min($qstart,$qend), max($qstart,$qend), sprintf("%0.0f", $nident/$alen), $strand, "ID=MLST_".$gene); + if ($scheme and $sch ne $scheme) { msg("Skipping $sch.$gene.$num allele as user specified --scheme $scheme"); next; @@ -153,10 +159,12 @@ sub find_mlst { if (exists $res{$sch}{$gene} and $res{$sch}{$gene} !~ m/[~?]/) { msg("WARNING: found addtional exact allele match $sch.$gene-$num"); $res{$sch}{$gene} .= ",$num"; + #$feat{$sch}{$gene}.= $gffLine; # might yield spurious annotations } else { msg("Found exact allele match $sch.$gene-$num"); $res{$sch}{$gene} = "$num"; + $feat{$sch}{$gene}= $gffLine.";Name=".$gene."_".$num; } # force update of hash with our exact match (see below for inexact behavior $novel{$qseq} = "$sch.$gene-$num $fname"; @@ -164,6 +172,7 @@ sub find_mlst { else { # either 100% length (with some SNPs) or partial coverage my $label = ($alen == $hlen) ? "~$num" : "${num}?"; $res{$sch}{$gene} ||= $label; + $feat{$sch}{$gene}||= $gffLine.";Name=".$gene."_".$label; if ($novel and $alen==$hlen) { # only update hash if we haven't seen a good version yet $novel{$qseq} ||= "$sch.$gene$label $fname"; @@ -194,11 +203,32 @@ sub find_mlst { # take the top scorer my @best = @{ $sig[0] }; + makeGff($gff,$feat{$best[0]}) if($gff); + return @best; } #---------------------------------------------------------------------- +sub makeGff{ + my($gff,$feat)=@_; + + # Sort by contig, and start site + my @feat=sort{ + my(@a)=split(/\t/,$a); + my(@b)=split(/\t/,$b); + $a[0] cmp $b[0] or + $a[3] <=> $b[3]; + } values(%$feat); + + open(my $gffFh, ">", $gff) or die "ERROR: could not write to $gff: $!"; + print $gffFh "##gff-version 3\n"; + for my $f(@feat){ + print $gffFh $f."\n"; + } + close $gffFh; +} + sub show_version { my(undef,undef,$exe) = File::Spec->splitpath($0); print "$exe $VERSION\n"; @@ -227,6 +257,7 @@ sub setOptions { {OPT=>"csv!", VAR=>\$csv, DEFAULT=>0, DESC=>"Output CSV instead of TSV"}, {OPT=>"nopath!", VAR=>\$nopath, DEFAULT=>0, DESC=>"Strip filename paths from FILE column"}, {OPT=>"novel=s", VAR=>\$novel, DEFAULT=>'', DESC=>"Save novel alleles to this FASTA file"}, + {OPT=>"gff=s", VAR=>\$gff, DEFAULT=>'', DESC=>"Output a GFF file"}, ); &GetOptions(map {$_->{OPT}, $_->{VAR}} @Options) || usage(); From 12d9e78aba79379b5ad9f5c860e9c1ea08b8fa77 Mon Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Tue, 11 Apr 2017 16:33:52 -0400 Subject: [PATCH 3/6] an all-in-one --- scripts/mlst-make_blast_db.pl | 103 ++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100755 scripts/mlst-make_blast_db.pl diff --git a/scripts/mlst-make_blast_db.pl b/scripts/mlst-make_blast_db.pl new file mode 100755 index 00000000..9391b7f4 --- /dev/null +++ b/scripts/mlst-make_blast_db.pl @@ -0,0 +1,103 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use File::Basename qw/fileparse basename/; +use Getopt::Long qw/GetOptions/; +use Bio::SeqIO; + +my($thisScript, $thisDir, $thisExt)=fileparse($0); + +my $mlstDir="$thisDir/../db/pubmlst"; +my $blastDir="$thisDir/../db/blast"; +my $blastFile="$blastDir/mlst.fa"; + +exit main(); + +sub main{ + my $settings={}; + GetOptions($settings, qw(help filter!)) or die $!; + $$settings{filter}//=1; + + die usage() if($$settings{help}); + + downloadPubMLST(); + combineBlastDb(); + + return 0; +} + +sub downloadPubMLST{ + mkdir $mlstDir; + system("wget --no-clobber -P '$mlstDir' http://pubmlst.org/data/dbases.xml"); + die "ERROR downloading dbases.xml" if $?; + + my $profile="MISSING"; + my $profileDir="MISSING"; + open(my $dbasesFh, "$mlstDir/dbases.xml") or die "ERROR could not read $mlstDir/dbases.xml: $!"; + while(my $url=<$dbasesFh>){ + # Remove the URL tags + $url=~s///; + $url=~s/<\/url>//; + chomp $url; + + # Either get the profiles txt or the + # alleles tfa files. + my $ext=substr($url,-4,4); # last four characters + if($ext=~/\.txt/){ + $profile=basename($url); + print "# $profile \n"; + $profileDir="$mlstDir/$profile"; + mkdir $profileDir; + + system("cd '$profileDir' && wget '$url'"); + die if $?; + } elsif($ext=~/\.tfa/){ + system("cd '$profileDir' && wget '$url'"); + die if $?; + } + } + close $dbasesFh; +} + +sub combineBlastDb{ + unlink($blastFile); + + my $seqout=Bio::SeqIO->new(-file=>">$blastFile"); + for my $schemePath(glob("$mlstDir/*")){ + next if(!-d $schemePath); + my $scheme=basename($schemePath); + print "$thisScript: ADDING: $scheme\n"; + + for my $tfa(glob("$schemePath/*.tfa")){ + my $seqin=Bio::SeqIO->new(-file=>$tfa, -format=>"fasta"); + while(my $seq=$seqin->next_seq){ + # Reformat the allele identifier so that it also has the scheme + my $id=$seq->id; + $id=~s/^/$scheme./; + $seq->id($id); + + # Place all filtering into this if block + if($$settings{filter}){ + # Do not accept ridiculous alleles + next if($seq->seq=~/^(A+|C+|G+|T+)$/); + } + $seqout->write_seq($seq); + } + $seqin->close; + } + } + $seqout->close; + + system("makeblastdb -hash_index -in \"$blastFile\" -dbtype nucl -title \"PubMLST\" -parse_seqids"); + die "ERROR with makeblastdb" if $?; +} + +sub usage{ + local $0=basename $0; + "$0: downloads the latest pubmlst databases. + To run, execute this file with no options. + + --filter Do not filter fake sequences + " +} From 79f155286f63987f87ebd0172919ff7fd50b2fba Mon Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Thu, 13 Apr 2017 10:26:28 -0400 Subject: [PATCH 4/6] removed exit statement --- scripts/mlst-download_pub_mlst | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/mlst-download_pub_mlst b/scripts/mlst-download_pub_mlst index ce8cc687..270402d2 100755 --- a/scripts/mlst-download_pub_mlst +++ b/scripts/mlst-download_pub_mlst @@ -10,7 +10,6 @@ for URL in $(grep '' $OUTDIR/dbases.xml); do # echo $URL URL=${URL//} URL=${URL//<\/url>} -# echo ${URL: -4} if [ ${URL:(-4)} = ".txt" ]; then PROFILE=$(basename $URL .txt) echo "# $PROFILE " From 3f32cc9d0a072f65f21ccacc6be0e34fe0e4326f Mon Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Thu, 13 Apr 2017 10:26:59 -0400 Subject: [PATCH 5/6] refactored so that all the work happens inside of subroutines --- scripts/mlst-make_blast_db.pl | 139 ++++++++++++++++++++++++---------- 1 file changed, 99 insertions(+), 40 deletions(-) diff --git a/scripts/mlst-make_blast_db.pl b/scripts/mlst-make_blast_db.pl index 9391b7f4..ab67d9cb 100755 --- a/scripts/mlst-make_blast_db.pl +++ b/scripts/mlst-make_blast_db.pl @@ -2,38 +2,48 @@ use strict; use warnings; -use File::Basename qw/fileparse basename/; +use File::Basename qw/fileparse basename dirname/; use Getopt::Long qw/GetOptions/; use Bio::SeqIO; +use File::Find qw/find/; +use FindBin qw/$RealBin/; -my($thisScript, $thisDir, $thisExt)=fileparse($0); - -my $mlstDir="$thisDir/../db/pubmlst"; -my $blastDir="$thisDir/../db/blast"; -my $blastFile="$blastDir/mlst.fa"; +local $0=$0; exit main(); sub main{ my $settings={}; - GetOptions($settings, qw(help filter!)) or die $!; + GetOptions($settings, qw(help filter! scheme=s)) or die $!; $$settings{filter}//=1; + $$settings{scheme}//=""; + + # Make our directory structure + my $mlstDir="$RealBin/../db/pubmlst"; + my $blastDir="$RealBin/../db/blast"; + mkdir dirname($blastDir); + mkdir dirname($mlstDir); + mkdir $mlstDir; + mkdir $blastDir; + my $blastFile="$RealBin/mlst.fa"; die usage() if($$settings{help}); - downloadPubMLST(); - combineBlastDb(); + downloadPubMLST($mlstDir,$$settings{scheme}); + combineBlastDb($blastFile,$mlstDir); return 0; } sub downloadPubMLST{ + my($mlstDir,$scheme)=@_; + mkdir $mlstDir; system("wget --no-clobber -P '$mlstDir' http://pubmlst.org/data/dbases.xml"); die "ERROR downloading dbases.xml" if $?; - my $profile="MISSING"; - my $profileDir="MISSING"; + my $profile=""; + my $profileDir=""; open(my $dbasesFh, "$mlstDir/dbases.xml") or die "ERROR could not read $mlstDir/dbases.xml: $!"; while(my $url=<$dbasesFh>){ # Remove the URL tags @@ -41,19 +51,38 @@ sub downloadPubMLST{ $url=~s/<\/url>//; chomp $url; + next if($url=~/^ $profileDir/\n"; mkdir $profileDir; - system("cd '$profileDir' && wget '$url'"); + system("cd '$profileDir' && wget '$url' 2> /dev/null"); die if $?; } elsif($ext=~/\.tfa/){ - system("cd '$profileDir' && wget '$url'"); + # Don't download the tfa files unless the profile + # is defined + if(!$profile){ + print "WARNING: skipping $url\n"; + next; + } + print " $url => $profileDir/\n"; + system("cd '$profileDir' && wget '$url' 2> /dev/null"); die if $?; } } @@ -61,43 +90,73 @@ sub downloadPubMLST{ } sub combineBlastDb{ + my($blastFile,$mlstDir)=@_; + + # Start fresh with a new combined file unlink($blastFile); - my $seqout=Bio::SeqIO->new(-file=>">$blastFile"); - for my $schemePath(glob("$mlstDir/*")){ - next if(!-d $schemePath); - my $scheme=basename($schemePath); - print "$thisScript: ADDING: $scheme\n"; + # Use File::Find::find to locate all schemes and add them. + # To avoid using globals, use the sub{...} syntax. + find({no_chdir=>1, wanted=>sub{ + addScheme($blastFile); + } + }, $mlstDir); - for my $tfa(glob("$schemePath/*.tfa")){ - my $seqin=Bio::SeqIO->new(-file=>$tfa, -format=>"fasta"); - while(my $seq=$seqin->next_seq){ - # Reformat the allele identifier so that it also has the scheme - my $id=$seq->id; - $id=~s/^/$scheme./; - $seq->id($id); - - # Place all filtering into this if block - if($$settings{filter}){ - # Do not accept ridiculous alleles - next if($seq->seq=~/^(A+|C+|G+|T+)$/); - } - $seqout->write_seq($seq); - } - $seqin->close; - } - } - $seqout->close; system("makeblastdb -hash_index -in \"$blastFile\" -dbtype nucl -title \"PubMLST\" -parse_seqids"); die "ERROR with makeblastdb" if $?; } +# This function is called by File::Find::find. +# $File::Find::dir is the current directory name +# $_ is the current filename within that directory +# $File::Find::name is the complete pathname to the file. +sub addScheme{ + my($blastFile)=@_; + + my $schemePath=$File::Find::name; + + # Only accept directory names + return if(!-d $schemePath); + #my $depth = tr|/||; # count slashes to get depth + #next if($depth > 1); # -maxdepth 1 + + # Append + my $seqout=Bio::SeqIO->new(-file=>">>$blastFile"); + + my $scheme=basename($schemePath); + print "$0: ADDING: $schemePath\n"; + + for my $tfa(glob("$schemePath/*.tfa")){ + my $seqin=Bio::SeqIO->new(-file=>$tfa, -format=>"fasta"); + while(my $seq=$seqin->next_seq){ + # Reformat the allele identifier so that it also has the scheme + my $id=$seq->id; + $id=~s/^/$scheme./; + $seq->id($id); + + # Place all filtering into this if block + #if($$settings{filter}){ + # # Do not accept ridiculous alleles + # next if($seq->seq=~/^(A+|C+|G+|T+)$/); + #} + $seqout->write_seq($seq); + } + $seqin->close; + } + + # Flush the output so that makeblastdb is working on + # the total contents. + $seqout->close; +} + sub usage{ local $0=basename $0; "$0: downloads the latest pubmlst databases. To run, execute this file with no options. - --filter Do not filter fake sequences + --scheme '' Which schemes to download? + Default: all databases + For a listing, run `mlst --longlist | cut -f 1` " } From 5d66b6cfee8c2b91d34cb66c08e9b5eb7397e054 Mon Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Thu, 13 Apr 2017 10:30:54 -0400 Subject: [PATCH 6/6] brought back tseeman --- bin/mlst | 51 +-------------------------------------------------- 1 file changed, 1 insertion(+), 50 deletions(-) diff --git a/bin/mlst b/bin/mlst index e786c6ab..88412c73 100755 --- a/bin/mlst +++ b/bin/mlst @@ -2,10 +2,8 @@ use strict; use File::Spec; use Data::Dumper; -use List::Util qw(min max); use List::MoreUtils qw(pairwise); use File::Temp qw(tempfile); -use File::Basename qw(basename); use FindBin; use lib "$FindBin::RealBin/../perl5"; use MLST::PubMLST; @@ -141,8 +139,7 @@ sub find_mlst { my @hit = qx($cmd); # FIXME: we should /sort/ the hits here in case logic below is dodgy? - my %res; # schema => {locus => allele, locus => allele, ...} - my %feat; # schema => {locus => gff tab-delimited line, ... } + my %res; my $res_count=0; foreach (@hit) { @@ -155,9 +152,6 @@ sub find_mlst { next unless $nident/$hlen >= $mincov/100 ; # need min-cov to reach minid - my $strand=($qstart < $qend)?"+":"-"; - my $gffLine=join("\t",$qid, basename($0)."_v$VERSION", "gene", min($qstart,$qend), max($qstart,$qend), sprintf("%0.0f", $nident/$alen), $strand, "ID=MLST_".$gene); - if ($scheme and $sch ne $scheme) { msg("Skipping $sch.$gene.$num allele as user specified --scheme $scheme"); next; @@ -170,12 +164,10 @@ sub find_mlst { if (exists $res{$sch}{$gene} and $res{$sch}{$gene} !~ m/[~?]/) { msg("WARNING: found addtional exact allele match $sch.$gene-$num"); $res{$sch}{$gene} .= ",$num"; - #$feat{$sch}{$gene}.= $gffLine; # might yield spurious annotations } else { msg("Found exact allele match $sch.$gene-$num"); $res{$sch}{$gene} = "$num"; - $feat{$sch}{$gene}= $gffLine.";Name=".$gene."_".$num; } # force update of hash with our exact match (see below for inexact behavior $novel{$qseq} = "$sch.$gene-$num $fname"; @@ -183,7 +175,6 @@ sub find_mlst { else { # either 100% length (with some SNPs) or partial coverage my $label = ($alen == $hlen) ? "~$num" : "${num}?"; $res{$sch}{$gene} ||= $label; - $feat{$sch}{$gene}||= $gffLine.";Name=".$gene."_".$label; if ($novel and $alen==$hlen) { # only update hash if we haven't seen a good version yet $novel{$qseq} ||= "$sch.$gene$label $fname"; @@ -218,32 +209,11 @@ sub find_mlst { # take the top scorer my @best = @{ $sig[0] }; - makeGff($gff,$feat{$best[0]}) if($gff); - return @best; } #---------------------------------------------------------------------- -sub makeGff{ - my($gff,$feat)=@_; - - # Sort by contig, and start site - my @feat=sort{ - my(@a)=split(/\t/,$a); - my(@b)=split(/\t/,$b); - $a[0] cmp $b[0] or - $a[3] <=> $b[3]; - } values(%$feat); - - open(my $gffFh, ">", $gff) or die "ERROR: could not write to $gff: $!"; - print $gffFh "##gff-version 3\n"; - for my $f(@feat){ - print $gffFh $f."\n"; - } - close $gffFh; -} - sub show_version { my(undef,undef,$exe) = File::Spec->splitpath($0); print "$exe $VERSION\n"; @@ -257,24 +227,6 @@ sub setOptions { use Getopt::Long; @Options = ( -<<<<<<< HEAD - {OPT=>"help", VAR=>\&usage, DESC=>"This help"}, - {OPT=>"debug!", VAR=>\$debug, DEFAULT=>0, DESC=>"Verbose debug output to stderr"}, - {OPT=>"version!", VAR=>\&show_version, DESC=>"Print version and exit"}, - {OPT=>"quiet!", VAR=>\$quiet, DEFAULT=>0, DESC=>"Quiet - no stderr output"}, - {OPT=>"blastdb=s", VAR=>\$blastdb, DEFAULT=>"$FindBin::RealBin/../db/blast/mlst.fa", DESC=>"BLAST database"}, - {OPT=>"datadir=s", VAR=>\$datadir, DEFAULT=>"$FindBin::RealBin/../db/pubmlst", DESC=>"PubMLST data"}, - {OPT=>"scheme=s", VAR=>\$scheme, DEFAULT=>'', DESC=>"Don't autodetect, force this scheme on all inputs"}, - {OPT=>"list!", VAR=>\$list, DEFAULT=>0, DESC=>"List available MLST scheme names"}, - {OPT=>"longlist!", VAR=>\$longlist, DEFAULT=>0, DESC=>"List allelles for all MLST schemes"}, - {OPT=>"minid=f", VAR=>\$minid, DEFAULT=>95, DESC=>"DNA %identity of full allelle to consider 'similar' [~]"}, - {OPT=>"mincov=f", VAR=>\$mincov, DEFAULT=>10, DESC=>"DNA %cov to report partial allele at all [?]"}, - {OPT=>"threads=i", VAR=>\$threads, DEFAULT=>1, DESC=>"Number of BLAST threads (suggest GNU Parallel instead)"}, - {OPT=>"csv!", VAR=>\$csv, DEFAULT=>0, DESC=>"Output CSV instead of TSV"}, - {OPT=>"nopath!", VAR=>\$nopath, DEFAULT=>0, DESC=>"Strip filename paths from FILE column"}, - {OPT=>"novel=s", VAR=>\$novel, DEFAULT=>'', DESC=>"Save novel alleles to this FASTA file"}, - {OPT=>"gff=s", VAR=>\$gff, DEFAULT=>'', DESC=>"Output a GFF file"}, -======= {OPT=>"help", VAR=>\&usage, DESC=>"This help"}, {OPT=>"version!", VAR=>\&show_version, DESC=>"Print version and exit"}, {OPT=>"debug!", VAR=>\$debug, DEFAULT=>0, DESC=>"Verbose debug output to stderr"}, @@ -292,7 +244,6 @@ sub setOptions { {OPT=>"csv!", VAR=>\$csv, DEFAULT=>0, DESC=>"Output CSV instead of TSV"}, {OPT=>"nopath!", VAR=>\$nopath, DEFAULT=>0, DESC=>"Strip filename paths from FILE column"}, {OPT=>"novel=s", VAR=>\$novel, DEFAULT=>'', DESC=>"Save novel alleles to this FASTA file"}, ->>>>>>> 39d28d370fbebcd9fe2929a7407f150c1e8ce350 ); &GetOptions(map {$_->{OPT}, $_->{VAR}} @Options) || usage();