Skip to content

Commit a2a2d42

Browse files
author
Dominik R Laetsch
committed
More preparation
1 parent d9bfef5 commit a2a2d42

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+354
-15898
lines changed

.gitignore

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
data/nodesDB.txt
2-
tests/
2+
samtools/
3+
example/
34
*.pyc
45
*.png
5-
test_files/blobDB.table.txt
66
test.*
77
*.gz
88
*.fq

license.txt LICENSE.md

File renamed without changes.

MANIFEST.in

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
include README.md
2+
include requirements.txt

README.md

+32-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,33 @@
1-
# blobtools
2-
Application for the visualisation of (draft) genome assemblies using TAGC (Taxon-annotated Gc-Coverage) plots
1+
blobtools
2+
===============================
33

4-
For the documentation, please refer to https://blobtools.readme.io/
4+
version number: 1.0
5+
author: Dominik R Laetsch
6+
7+
Overview
8+
--------
9+
10+
A modular command-line solution for visualisation, quality control and taxonomic partitioning of genome datasets
11+
12+
Installation / Usage
13+
--------------------
14+
15+
To install use pip:
16+
17+
$ pip install blobtools
18+
19+
20+
Or clone the repo:
21+
22+
$ git clone https://github.com/DRL/blobtools.git
23+
$ python setup.py install
24+
25+
Contributing
26+
------------
27+
28+
TBD
29+
30+
Example
31+
-------
32+
33+
TBD

blobtools

-1
This file was deleted.

blobtools

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/usr/bin/env bash
# BlobTools launcher: resolves the directory this script lives in and runs
# lib/blobtools.py from there, forwarding every command-line argument as-is.
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
# Quoted so that an install path containing whitespace still resolves.
"$DIR/lib/blobtools.py" "$@"

install

+141
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
#!/usr/bin/env bash
# BlobTools installer, part 1: verify the required command-line tools,
# install the Python package and create the ./blobtools launcher.
#
# Globals written: DIR, wget, tar, pip, python (wget and tar are used by
# the download/unpack functions defined further down).
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

echo "[+] Checking dependencies..."
# Set to 1 when a tool this script invokes itself (wget, tar, python) is
# missing, so that all problems are reported before aborting.
missing=0

wget=$(command -v wget)
if [ -x "$wget" ]; then
    echo " [+] [wget] $wget"
else
    echo " [X] [wget] ... please install wget"
    missing=1
fi

tar=$(command -v tar)
if [ -x "$tar" ]; then
    echo " [+] [tar] $tar"
else
    echo " [X] [tar] ... please install tar"
    missing=1
fi

# pip is reported but not treated as fatal: nothing below calls it directly.
pip=$(command -v pip)
if [ -x "$pip" ]; then
    echo " [+] [pip] $pip"
else
    echo " [X] [pip] ... please install pip"
fi

python=$(command -v python)
if [ -x "$python" ]; then
    echo " [+] [python] $python"
else
    echo " [X] [python] ... please install python2.7"
    missing=1
fi

if [ "$missing" -ne 0 ]; then
    echo "[X] - Required dependencies are missing. Please install them and re-run this script."
    exit 1
fi

# Install python dependencies
echo "[+] Installing python dependencies..."
# Run from $DIR so setup.py is found regardless of the caller's working
# directory; the subshell keeps the caller's directory untouched.
( cd "$DIR" && "$python" setup.py install --quiet )

# Create executable
echo -n "[+] Creating BlobTools executable..."
launcher="$DIR/blobtools"
# The launcher text is written literally (single quotes): it must resolve
# its own location at run time, not the installer's.
if printf '%s\n' \
    '#!/usr/bin/env bash' \
    'DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"' \
    '"$DIR/lib/blobtools.py" "$@"' > "$launcher" \
    && chmod +x "$launcher"; then
    echo "done."
else
    echo "FAIL."
    echo "[X] - Could not create $launcher"
    exit 1
fi
36+
37+
# get samtools
38+
#######################################
# Download the samtools-1.5 source tarball into $DIR.
# Globals:
#   DIR  (read) - installation directory; defaults to "." when unset
#   wget (read) - wget executable found by the dependency check; falls back
#                 to "wget" from PATH when unset
# Outputs:
#   Progress and failure messages on stdout.
# Returns:
#   0 on success; terminates the script with status 1 when the download
#   fails, so callers and CI can detect the failure.
#######################################
function download_samtools {
    echo -n "[+] Downloading samtools-1.5..."
    # -N skips the transfer when an up-to-date copy already exists;
    # -P places the tarball where the rest of the installer expects it.
    if "${wget:-wget}" -qN -P "${DIR:-.}" \
        https://github.com/samtools/samtools/releases/download/1.5/samtools-1.5.tar.bz2; then
        echo "done."
    else
        echo "FAIL."
        echo "[X] - Could not download samtools-1.5. Please install samtools-1.5 in $DIR/samtools/ (see http://www.htslib.org/download/) "
        exit 1
    fi
}
49+
50+
#######################################
# Unpack, configure, build and install samtools-1.5 into $DIR/samtools/,
# then remove the tarball and the unpacked source tree.
# Globals:
#   DIR (read) - installation directory holding samtools-1.5.tar.bz2
#   tar (read) - tar executable found by the dependency check; falls back
#                to "tar" from PATH when unset
# Outputs:
#   Progress and failure messages on stdout.
# Returns:
#   0 on success; terminates the script with status 1 as soon as any step
#   (unpack, configure, make, make install) fails.
# Side effects:
#   Leaves the working directory set to $DIR.
#######################################
function install_samtools {
    local samtools_src="$DIR/samtools-1.5/"
    local samtools_dir="$DIR/samtools/"
    local hint="Please install samtools-1.5 in $DIR/samtools/ (see http://www.htslib.org/download/) "

    echo -n "[+] Unpacking samtools-1.5..."
    # Extract next to the tarball, independent of the caller's directory.
    if "${tar:-tar}" xjf "$DIR/samtools-1.5.tar.bz2" -C "$DIR"; then
        echo "done."
    else
        echo "FAIL."
        echo "[X] - Could not unpack samtools-1.5. $hint"
        exit 1
    fi

    mkdir -p "$samtools_dir"
    # Without this check a failed cd would run ./configure and make in
    # whatever directory the installer was started from.
    if ! cd "$samtools_src"; then
        echo "[X] - Could not unpack samtools-1.5. $hint"
        exit 1
    fi

    echo -n "[+] Configuring samtools-1.5..."
    if ./configure --prefix="$samtools_dir" --quiet; then
        echo "done."
    else
        echo "FAIL."
        echo "[X] - Could not configure samtools-1.5. $hint"
        exit 1
    fi

    echo -n "[+] Compiling samtools-1.5..."
    # Both the build and the install step must succeed.
    if make --silent && make install --silent; then
        echo "done."
    else
        echo "FAIL."
        echo "[X] - Could not compile samtools-1.5. $hint"
        exit 1
    fi

    echo "[+] cleaning up..."
    # Leave the source tree before deleting it.
    cd "$DIR" || exit 1
    rm -f -- "$DIR/samtools-1.5.tar.bz2"
    rm -rf -- "${samtools_src:?}"
}
88+
89+
#######################################
# Download the NCBI taxonomy dump (taxdump.tar.gz) into $DIR/data/.
# Globals:
#   DIR  (read) - installation directory
#   wget (read) - wget executable found by the dependency check; falls back
#                 to "wget" from PATH when unset
# Outputs:
#   Progress and failure messages on stdout.
# Returns:
#   0 on success; terminates the script with status 1 when the download
#   fails.
#######################################
function download_taxdump {
    echo -n "[+] Downloading NCBI taxdump from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz ..."
    # -N skips the transfer when an up-to-date local copy exists.
    if "${wget:-wget}" -qN ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz -P "$DIR/data/"; then
        echo "done."
    else
        echo "FAIL."
        echo "[X] - Could not download NCBI taxdump. Please follow installation steps on https://blobtools.readme.io/"
        exit 1
    fi
}
100+
101+
#######################################
# Extract nodes.dmp and names.dmp from $DIR/data/taxdump.tar.gz into
# $DIR/data/.
# Globals:
#   DIR (read) - installation directory
#   tar (read) - tar executable found by the dependency check; falls back
#                to "tar" from PATH when unset
# Outputs:
#   Progress and failure messages on stdout.
# Returns:
#   0 on success; terminates the script with status 1 when extraction fails.
#######################################
function unpack_taxdump {
    echo -n "[+] Unpacking nodes/names.dmp ..."
    # Paths are quoted so an install directory with whitespace works.
    if "${tar:-tar}" zxf "$DIR/data/taxdump.tar.gz" -C "$DIR/data/" nodes.dmp names.dmp; then
        echo "done."
    else
        echo "FAIL."
        echo "[X] - Could not unpack nodes/names.dmp. Please follow installation steps on https://blobtools.readme.io/"
        exit 1
    fi
}
112+
113+
# install samtools
# Gate on the installed binary rather than on the tarball: the tarball is
# deleted after every successful build, so its absence says nothing about
# whether samtools is already in place. A tarball that is already present
# (e.g. fetched by hand) is used as-is instead of being downloaded again.
samtools_tar="$DIR/samtools-1.5.tar.bz2"
if [ ! -x "$DIR/samtools/bin/samtools" ]; then
    if [ ! -f "$samtools_tar" ]; then
        download_samtools
    fi
    install_samtools
fi

# get taxdump
# Download only when the archive is missing, but always unpack when the
# .dmp files are missing, so an interrupted earlier run can be resumed.
taxdump="$DIR/data/taxdump.tar.gz"
if [ ! -f "$DIR/data/nodes.dmp" ] || [ ! -f "$DIR/data/names.dmp" ]; then
    if [ ! -f "$taxdump" ]; then
        download_taxdump
    fi
    unpack_taxdump
fi

# make nodesdb
# Invoke the launcher through $DIR so this works from any working directory.
if "$DIR/blobtools" nodesdb --nodes "$DIR/data/nodes.dmp" --names "$DIR/data/names.dmp"; then
    echo "[+] Removing intermediate files..."
    rm -f -- "$taxdump" "$DIR/data/nodes.dmp" "$DIR/data/names.dmp"
else
    echo "[X] - Could not create nodesdb. Please follow installation steps on https://blobtools.readme.io/"
    # Non-zero status so callers can detect the failed installation.
    exit 1
fi

# Done
echo "[+] BlobTools was installed successfully. Please run ./blobtools"

bloblib/BtCore.py lib/BtCore.py

File renamed without changes.

bloblib/BtIO.py lib/BtIO.py

+7-8
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import os
1313
import zlib
1414
from collections import defaultdict
15+
import blobtools
1516
from os.path import basename, isfile, splitext, join, isdir
1617
import shutil
1718
import lib.BtLog as BtLog
@@ -183,15 +184,13 @@ def checkBam(infile):
183184
print BtLog.status_d['10']
184185
if not isfile(infile):
185186
BtLog.error('0', infile)
186-
if not which('samtools'):
187-
BtLog.error('7')
188187
reads_mapped_re = re.compile(r"(\d+)\s\+\s\d+\smapped")
189188
#reads_secondary_re = re.compile(r"(\d+)\s\+\s\d+\ssecondary")
190189
#reads_supplementary_re = re.compile(r"(\d+)\s\+\s\d+\ssupplementary")
191190
reads_total_re = re.compile(r"(\d+)\s\+\s\d+\sin total")
192191
reads_total, reads_mapped = 0, 0
193192
output = ''
194-
command = "samtools flagstat " + infile
193+
command = blobtools.SAMTOOLS + " flagstat " + infile
195194
for line in runCmd(command=command):
196195
output += line
197196
reads_mapped = int(reads_mapped_re.search(output).group(1))
@@ -313,14 +312,14 @@ def parseBamForFilter(infile, include_unmapped, outfile, include, exclude, gzip,
313312
if not isfile(infile):
314313
BtLog.error('0', infile)
315314
if do_sort:
316-
command = 'samtools sort -@ sort_threads -n -O bam -T temp -o %s.readsorted.bam %s' % (infile, infile)
315+
command = blobtools.SAMTOOLS + ' sort -@ sort_threads -n -O bam -T temp -o %s.readsorted.bam %s' % (infile, infile)
317316
runCmd(command=command, wait=True)
318317
infile = "%s.readsorted.bam" % infile
319318

320319
progress_unit = int(100000)
321320
#if progress_flag:
322321
# reads_total, reads_mapped = checkBam(infile)
323-
command = "samtools view -f 1 -F 256 -F 2048 %s" % infile
322+
command = blobtools.SAMTOOLS + " view -f 1 -F 256 -F 2048 %s" % infile
324323

325324
pair_count_by_type, pair_seqs_by_type, out_fs_by_type = init_read_pairs(outfile, include_unmapped, include, exclude)
326325
if include:
@@ -399,7 +398,7 @@ def parseBam(infile, set_of_blobs, no_base_cov_flag):
399398
read_cov_dict = {blob : 0 for blob in set_of_blobs}
400399
cigar_match_re = re.compile(r"(\d+)M|X|=") # only gets digits before M,X,='s
401400
# execute samtools to get only mapped reads (no optical duplicates, no 2nd-ary alignment)
402-
command = "samtools view -F 1024 -F 4 -F 256 " + infile
401+
command = blobtools.SAMTOOLS + " view -F 1024 -F 4 -F 256 " + infile
403402
seen_reads = 0
404403
#import time
405404
#start = time.time()
@@ -578,8 +577,8 @@ def readTax(infile, set_of_blobs):
578577
except ValueError:
579578
BtLog.error('46', infile, col[2])
580579
if hitDict['name'] not in set_of_blobs:
581-
print BtLog.warn_d['13'] % (hitDict['name'], infile)
582-
#BtLog.error('19', hitDict['name'], infile)
580+
#print BtLog.warn_d['13'] % (hitDict['name'], infile)
581+
BtLog.error('19', hitDict['name'], infile)
583582
yield hitDict
584583
#hitDict = {
585584
# 'name' : match.group(1),

bloblib/BtLog.py lib/BtLog.py

+44-44
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,13 @@ def progress(iteration, steps, max_value, no_limit=False):
2222
elif int(iteration) == max_value:
2323
if no_limit == True:
2424
sys.stdout.write('\r')
25-
print "[PROGRESS]\t: \t%d%%" % (100),
25+
print "[%%] \t%d%%" % (100),
2626
else:
2727
sys.stdout.write('\r')
28-
print "[PROGRESS]\t: \t%d%%" % (100)
28+
print "[%%] \t%d%%" % (100)
2929
elif int(iteration) % steps == 0:
3030
sys.stdout.write('\r')
31-
print "[PROGRESS]\t: \t%d%%" % (float(int(iteration) / int(max_value)) * 100),
31+
print "[%%] \t%d%%" % (float(int(iteration) / int(max_value)) * 100),
3232
sys.stdout.flush()
3333
else:
3434
pass
@@ -84,52 +84,52 @@ def progress(iteration, steps, max_value, no_limit=False):
8484
}
8585

8686
warn_d = {
87-
'0': '[WARN]\t\t: No tax files specified.',
88-
'1': '[WARN]\t\t: %s not in colour file %s ...',
89-
'2': '[WARN]\t\t: %s is not part of the assembly',
90-
'3': '\n[WARN]\t\t: Based on samtools flagstat: expected %s reads, %s reads were parsed',
91-
'4': '[WARN]\t\t: No coverage data found in %s',
92-
'5': '[WARN]\t\t: Hit for sequence %s in tax file %s has multiple taxIds, only first one is used.',
93-
'6': '[WARN]\t\t: Sum of coverage in cov lib %s is 0.0. Please ignore this warning if "--no_base_cov" was specified.',
94-
'7': '[WARN]\t\t: No taxonomy information found.',
95-
'8': '[WARN]\t\t: Duplicated sequences found :\n\t\t\t%s',
96-
'9': '[WARN]\t\t: Taxrule "%s" was not computed for this BlobDb. Available taxrule(s) : %s. Will proceed without taxonomic annotation ...',
97-
'10': '[WARN]\t\t: Line %s: sequence "%s" already has TaxID "%s". Skipped. (use --force to overwrite)',
98-
'11': '\n[WARN]\t\t: The BAM file appears to be truncated.',
99-
'12': '[WARN] : sseqid %s not found in ID-to-taxID mapping file %s.',
100-
'13': '[WARN]\t: Sequence %s in file %s is not part of the assembly.'
87+
'0': '[-] No tax files specified.',
88+
'1': '[-] %s not in colour file %s ...',
89+
'2': '[-] %s is not part of the assembly',
90+
'3': '\n[-] Based on samtools flagstat: expected %s reads, %s reads were parsed',
91+
'4': '[-] No coverage data found in %s',
92+
'5': '[-] Hit for sequence %s in tax file %s has multiple taxIds, only first one is used.',
93+
'6': '[-] Sum of coverage in cov lib %s is 0.0. Please ignore this warning if "--no_base_cov" was specified.',
94+
'7': '[-] No taxonomy information found.',
95+
'8': '[-] Duplicated sequences found :\n\t\t\t%s',
96+
'9': '[-] Taxrule "%s" was not computed for this BlobDb. Available taxrule(s) : %s. Will proceed without taxonomic annotation ...',
97+
'10': '[-] Line %s: sequence "%s" already has TaxID "%s". Skipped. (use --force to overwrite)',
98+
'11': '\n[-] The BAM file appears to be truncated.',
99+
'12': '[-] sseqid %s not found in ID-to-taxID mapping file %s.',
100+
'13': '[-] Sequence %s in file %s is not part of the assembly.'
101101
}
102102
status_d = {
103-
'0': '[STATUS]\t: Nothing to be done. %s',
104-
'1': '[STATUS]\t: Parsing %s - %s',
105-
'2': '... Done',
106-
'3': '[STATUS]\t: Creating nodesDB from %s and %s',
107-
'4': '[STATUS]\t: names.dmp/nodes.dmp not specified. Retrieving nodesDB from %s',
108-
'5': '[STATUS]\t: Store nodesDB in default location %s',
109-
'6': '[STATUS]\t: Computing taxonomy using taxrule(s) %s',
110-
'7': '[STATUS]\t: Generating BlobDB and writing to file %s',
111-
'8': '[STATUS]\t: Plotting %s',
112-
'9': '[STATUS]\t: Reading BlobDB %s',
113-
'10': '[STATUS]\t: \tChecking with \'samtools flagstat\'',
114-
'11': '[STATUS]\t: \tMapping reads = %s, total reads = %s (mapping rate = %s)',
115-
'12': '[STATUS]\t: \tChecking with \'clc_mapping_info\'',
116-
'13': '[STATUS]\t: \tWriting %s',
117-
'14': '[STATUS]\t: Preparing view(s) ...',
118-
'15': '[STATUS]\t: \tLoading BlobDB into memory ...',
119-
'16': '[STATUS]\t: \tDeserialising BlobDB (using \'%s\' module) (this may take a while) ...',
120-
'17': '[STATUS]\t: \tDeserialising BlobDB (using \'%s\' module) (this may take a while, consider installing the \'ujson\' module) ...',
121-
'18': '[STATUS]\t: Extracting data for plots ...',
122-
'19': '[STATUS]\t: Writing output ...',
123-
'20': '[STATUS]\t: \tFinished in %ss',
124-
'22': '[STATUS]\t: Filtering %s ...',
125-
'23': '[STATUS]\t: Filtered %s (pairs=%s) ...',
126-
'24': '[STATUS]\t: Writing %s',
127-
'25': '[STATUS]\t: Gzip\'ing %s',
128-
'26': '[STATUS]\t: Reading %s'
103+
'0': '[+] Nothing to be done. %s',
104+
'1': '[+] Parsing %s - %s',
105+
'2': '[+] Done',
106+
'3': '[+] Creating nodesDB from %s and %s',
107+
'4': '[+] names.dmp/nodes.dmp not specified. Retrieving nodesDB from %s',
108+
'5': '[+] Store nodesDB in default location %s',
109+
'6': '[+] Computing taxonomy using taxrule(s) %s',
110+
'7': '[+] Generating BlobDB and writing to file %s',
111+
'8': '[+] Plotting %s',
112+
'9': '[+] Reading BlobDB %s',
113+
'10': '[+] \tChecking with \'samtools flagstat\'',
114+
'11': '[+] \tMapping reads = %s, total reads = %s (mapping rate = %s)',
115+
'12': '[+] \tChecking with \'clc_mapping_info\'',
116+
'13': '[+] \tWriting %s',
117+
'14': '[+] Preparing view(s) ...',
118+
'15': '[+] \tLoading BlobDB into memory ...',
119+
'16': '[+] \tDeserialising BlobDB (using \'%s\' module) (this may take a while) ...',
120+
'17': '[+] \tDeserialising BlobDB (using \'%s\' module) (this may take a while, consider installing the \'ujson\' module) ...',
121+
'18': '[+] Extracting data for plots ...',
122+
'19': '[+] Writing output ...',
123+
'20': '[+] \tFinished in %ss',
124+
'22': '[+] Filtering %s ...',
125+
'23': '[+] Filtered %s (pairs=%s) ...',
126+
'24': '[+] Writing %s',
127+
'25': '[+] Gzip\'ing %s',
128+
'26': '[+] Reading %s'
129129
}
130130

131131
info_d = {
132-
'0': '\t[INFO]\t: %s : sequences = %s, span = %s MB, N50 = %s nt'
132+
'0': '[I]\t%s : sequences = %s, span = %s MB, N50 = %s nt'
133133
}
134134

135135
if __name__ == "__main__":

bloblib/BtPlot.py lib/BtPlot.py

File renamed without changes.

bloblib/BtTax.py lib/BtTax.py

File renamed without changes.
File renamed without changes.

bloblib/bamfilter.py lib/bamfilter.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
from sys import path
3232
path.append(dirname(dirname(abspath(__file__))))
3333

34-
import blobtools
34+
import lib.blobtools as blobtools
3535
import lib.BtLog as BtLog
3636
import lib.BtIO as BtIO
3737
import lib.BtCore as Bt

bloblib/blobplot.py lib/blobplot.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@
6767
from sys import path
6868
path.append(dirname(dirname(abspath(__file__))))
6969

70-
import blobtools
70+
import lib.blobtools as blobtools
7171
import lib.BtLog as BtLog
7272
import lib.BtIO as BtIO
7373
import lib.BtCore as BtCore

0 commit comments

Comments
 (0)