trinityhall_metagenomics/blaster.py at main · EnrightLab/trinityhall_metagenomics · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import subprocess
import re

def align(first, second):
    retaln = ""
    for a, b in zip(first, second):
        if a == '-' or b == '-':
            retaln += " "
        elif a == b:
            retaln += "|"
        else:
            retaln += " "
    return retaln

blast_cmd_template = (
    "blastn -query XXXX -db YYYY "
    "-outfmt '7 qseqid sseqid pident mismatch gapopen qstart qend "
    "sstart send evalue bitscore length qlen qseq sseq' "
    "-word_size 8 -evalue 1e-5 -num_threads 12"
)

QUERY_PATTERN = re.compile(r"^# Query: (\S+)")
HITS_PATTERN = re.compile(r"^# (\d+) hits found")

with open("targets.txt", "r") as infile:
    for line in infile:
        line = line.strip()
        if not line:
            continue
        species, db = line.split("\t")
        species_file = species.replace(" ", "_") + ".metagenomic.fasta"
        print(f"{species_file} {db}")

        blast_cmd = blast_cmd_template.replace("XXXX", species_file).replace("YYYY", db)
        seen = set()
        total_hits = 0
        total_seqs = 0
        line_count = 0
        current_time = None

        with open(f"{species_file}.blasthits.txt", "w", buffering=1024*1024) as outfile:
            process = subprocess.Popen(blast_cmd, shell=True, stdout=subprocess.PIPE, text=True)
            current_query = None
            for line in process.stdout:
                line = line.strip()
                if not line:
                    continue

                line_count += 1
                if line_count % 100000 == 0:
                    last_time = current_time if current_time else blast_start
                    current_time = time.time()
                    elapsed = current_time - blast_start
                    difference = current_time - last_time
                    print(f"  Processed {line_count} lines in {elapsed:.2f} seconds, last 100k reads took {difference} seconds")
                    print(f"length of seen is {len(seen)}")

                query_match = QUERY_PATTERN.match(line)
                if query_match:
                    current_query = query_match.group(1)
                    continue

                hits_match = HITS_PATTERN.match(line)
                if hits_match:
                    if int(hits_match.group(1)) == 0:
                        outfile.write(f"{current_query}\tNo Hits\tNA\n")
                        total_seqs += 1
                    continue

                if not line.startswith("#") and current_query and current_query not in seen:
                    parts = line.split("\t")
                    qfrac = float(parts[11]) / float(parts[12])
                    qfrac_str = f"{qfrac * 100:.2f}%"
                    outfile.write(f"{current_query}\t{parts[1]}\t{parts[2]}\t{qfrac_str} [{parts[11]} / {parts[12]}]\t{parts[7]}\t{parts[8]}\t{parts[5]}\t{parts[6]}\n")

                    aln_marker = align(parts[13], parts[14])
                    qseq_chunks = [parts[13][i:i+80] for i in range(0, len(parts[13]), 80)]
                    sseq_chunks = [parts[14][i:i+80] for i in range(0, len(parts[14]), 80)]
                    aln_chunks = [aln_marker[i:i+80] for i in range(0, len(aln_marker), 80)]

                    for q, a, s in zip(qseq_chunks, aln_chunks, sseq_chunks):
                        outfile.write(f"#\n#[aln]\t{q}\n#[aln]\t{a}\n#[aln]\t{s}\n")
                    outfile.write("#\n")

                    seen.add(current_query)
                    total_hits += 1
                    total_seqs += 1

        print(f"Total Hits: {total_hits} in {total_seqs} (seqs)")