-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathevaluate_ga_output.py
135 lines (111 loc) · 5.45 KB
/
evaluate_ga_output.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import numpy as np
import pickle
from pathlib import Path
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqIO.FastaIO import SimpleFastaParser
from genetic_algorithm import create_gene_start_end_idxs, GA_utils
import matplotlib.pyplot as plt
from matplotlib import pylab
import os
def filter_stop_codons(gene):
codons = ["".join(gene[i:i+3]) for i in range(0, len(gene), 3)]
temp = [c for c in codons[:-1] if c not in ["TAG", "TAA", "TGA"]]
return "".join(temp + [codons[-1]])
def substitute_stop_codons(gene):
codons = ["".join(gene[i:i+3]) for i in range(0, len(gene), 3)]
temp = [("***" if c in {"TAG", "TAA", "TGA"} else c)
for c in codons[:-1]]
res = "".join(temp + [codons[-1]])
return res
def decode_solution_to_genes(gene_start_end_idxs, solution):
genes = [solution[start:end] for start, end in gene_start_end_idxs]
return [filter_stop_codons(gene) for gene in genes]
def decode_solution_to_unfiltered_genes(gene_start_end_idxs, solution):
genes = [solution[start:end] for start, end in gene_start_end_idxs]
return [substitute_stop_codons(gene) for gene in genes]
def plot_quantitative_virulence(original_fitness_vals, mutated_fitness_vals):
fig, ax = plt.subplots(figsize=(8, 5))
width = 0.35
inds = np.arange(len(original_fitness_vals))
ax.bar(inds, [-x for x, y in original_fitness_vals], width, color="blue")
ax.bar(inds + width,
[-x for x, y in mutated_fitness_vals], width, color="green")
ax.legend(("Original Sequence", "Mutated Sequence"))
fig.suptitle(
"Quantitative Virulence Before and After Attenuation by Genetic Algorithm")
pylab.ylabel("Quantitative Virulence")
plt.xticks(range(len(original_fitness_vals)),
range(1, 1 + len(original_fitness_vals)))
plt.xlabel(
"Coding Sequences in Order of Appearance in Genome")
plt.show()
print("Quantitative virulence values after mutation: ", mutated_fitness_vals)
def plot_intial_quantitative_virulence(original_fitness_vals):
fig, ax = plt.subplots(figsize=(8, 5))
width = 0.35
inds = np.arange(len(original_fitness_vals))
ax.bar(inds, [-x for x, y in original_fitness_vals], width, color="blue")
fig.suptitle(
"Initial Quantitative Virulence of Covid-19 Coding Sequences")
plt.xlabel(
"Coding Sequences in Order of Appearance in Genome")
pylab.ylabel("Quantitative Virulence")
plt.xticks(range(len(original_fitness_vals)),
range(1, 1 + len(original_fitness_vals)))
plt.show()
print("Initial quantitative virulence values: ", original_fitness_vals)
def write_solutions(base, fasta, top_ten_solutions):
seqs = [seq for fid, seq in SimpleFastaParser(open(fasta))]
ids = [fid for fid, seq in SimpleFastaParser(open(fasta))]
gene_start_end_idxs = create_gene_start_end_idxs(seqs)
count = 1
for fitness, solution in top_ten_solutions:
genes = decode_solution_to_genes(gene_start_end_idxs, solution)
records = [SeqRecord(Seq(gene), fid, '', '')
for fid, gene in zip(ids, genes)]
f = round(fitness, 5)
path = base / \
f"vaccine_candidates/solution_rank_{count}_fitness_{f}.fasta"
SeqIO.write(records, open(path, "w"), "fasta")
count += 1
def write_blast_report(base, dir1, f1):
f1 = "./data/covid19_most_virulent_3genes.fna"
f2 = Path(dir1) / os.listdir(dir1)[-1]
cmd = f"blastn -query {f1} -subject {f2} -outfmt 7 -out {base}/blast.txt"
os.system(cmd)
def generate_quantitative_virulence_graph1(fasta):
original_genes = [seq for fid, seq in SimpleFastaParser(open(fasta))]
ga_util_obj = create_GA_util_obj(fasta)
og = [ga_util_obj.get_gene_fitness(gene) for gene in original_genes]
plot_intial_quantitative_virulence(og)
def generate_quantitative_virulence_graph2(solution, fasta):
original_genes = [seq for fid, seq in SimpleFastaParser(open(fasta))]
gene_start_end_idxs = create_gene_start_end_idxs(original_genes)
mutated_genes = decode_solution_to_unfiltered_genes(
gene_start_end_idxs, solution)
ga_util_obj = create_GA_util_obj(fasta)
og = [ga_util_obj.get_gene_fitness(gene) for gene in original_genes]
mut = [ga_util_obj.get_gene_fitness(
gene.replace("*", "")) for gene in mutated_genes]
plot_quantitative_virulence(og, mut)
def create_GA_util_obj(genome_path):
victors_scores = "./saved_models/victors_xgboost_scores.joblib"
victors_model_path = "./saved_models/victors_xgboost_model.joblib"
protegen_scores = "./saved_models/protegen_xgboost_scores.joblib"
protegen_model_path = "./saved_models/protegen_xgboost_model.joblib"
return GA_utils(victors_scores, protegen_scores,
victors_model_path, protegen_model_path, genome_path)
if __name__ == "__main__":
base = Path("./genetic_algorithm_output")
most_virulent_fasta = Path("./data/covid19_most_virulent_3genes.fna")
full_fasta = Path("./data/covid19_coding_sequences.fna")
generate_quantitative_virulence_graph1(full_fasta)
input_path = base / "best_ten_solutions_150gens_10000pop_1000mates.pkl"
output_dir = base / "vaccine_candidates"
top_ten_solutions = pickle.load(open(input_path, "rb"))
best_sol = top_ten_solutions[0][1]
write_solutions(base, most_virulent_fasta, top_ten_solutions)
write_blast_report(base, output_dir, most_virulent_fasta)
generate_quantitative_virulence_graph2(best_sol, most_virulent_fasta)