-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathblast.py
30 lines (23 loc) · 1.34 KB
/
blast.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# Chris Koch, 2020.
import os
from pathlib import Path
# Note that we use a really high E-value here. This is to catch any sequence that is even remotely similar - we will be filtering all sequences with greater than 30% percent identity and that will be our set of negative training examples. This is not supposed to be a typical blast search, it's more used as a way to calculate all of the possible percent identities in the dataset.
def run_blast(path, db, output_dir):
output_dir.mkdir(parents=True, exist_ok=True)
out = output_dir / path.stem
cmd = f"blastp -query {path.absolute()} -out {out} -db {db} -num_threads 16 -outfmt 7 -evalue 1000"
os.system(cmd)
def make_blast_db(path):
cmd = f"makeblastdb -in {path} -dbtype prot -out {path} -title {path.name}"
os.system(cmd)
if __name__ == "__main__":
# Make the database. Sometimes this will throw an error on Windows - Google it and follow steps to fix system settings.
path_db = Path("./data/uniprot_sprot.fasta")
make_blast_db(path_db)
# Run Blast on Victors and Protegen proteins.
db = "./data/uniprot_sprot.fasta"
out = "./data/blast_output/"
paths = ["./data/victors_all.faa", "./data/protegen_bacteria_proteins.faa",
"./data/protegen_virus_proteins.faa"]
for input_path in paths:
run_blast(Path(input_path), db, Path(out))