-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.pyx
More file actions
123 lines (108 loc) · 4.11 KB
/
utils.pyx
File metadata and controls
123 lines (108 loc) · 4.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from collections import defaultdict
import os
# Set the thread-count environment variables of the numerical backends to '1'.
# This is done before the numpy import below, presumably so that the libraries
# numpy loads see these values when they initialise (TODO confirm).
# Each trailing comment gives the equivalent shell command.
os.environ["OMP_NUM_THREADS"] = '1' # export OMP_NUM_THREADS=1
os.environ["OPENBLAS_NUM_THREADS"] = '1' # export OPENBLAS_NUM_THREADS=1
os.environ["MKL_NUM_THREADS"] = '1' # export MKL_NUM_THREADS=1
os.environ["VECLIB_MAXIMUM_THREADS"] = '1' # export VECLIB_MAXIMUM_THREADS=1
os.environ["NUMEXPR_NUM_THREADS"] = '1' # export NUMEXPR_NUM_THREADS=1
import numpy as np
cpdef readBinaryPhy(str fname):
    """Read a ``.phy`` alignment whose rows hold one character per site.

    The first line of the file holds two integers: the number of leaves and
    the number of sites. Every following non-blank line consists of a taxon
    name and a character string separated by whitespace.

    The alphabet starts as ``["0", "1"]``. Any other symbol met in the data,
    apart from the missing-data markers ``?`` and ``-``, is appended in order
    of first appearance.

    Parameters
    ----------
    fname : str
        Path of the file to read.

    Returns
    -------
    tuple
        ``(n_leaves, n_chars, alphabet, site_dict, ll_mats, taxa_list,
        n_sites)``. ``n_leaves`` and ``n_sites`` come from the header,
        ``site_dict`` maps each taxon name to its character string,
        ``taxa_list`` lists taxon names in file order and ``ll_mats`` is
        whatever ``sites2Mat`` returns for this data.

    Raises
    ------
    AssertionError
        If a row's character string is not ``n_sites`` long.
    ValueError
        If the header is not two integers, or a row does not split into
        exactly two whitespace-separated fields.
    """
    cdef list alphabet, taxa_list
    cdef int n_leaves, n_sites, n_chars
    cdef str line, taxa, char_vector, ch, header
    site_dict = {}
    alphabet = ["0", "1"]
    taxa_list = []
    missing = ("?", "-")

    # The context manager closes the file even when a malformed row raises.
    with open(fname) as f:
        header = f.readline().strip()
        # Split on any whitespace so tabs or repeated spaces are accepted.
        n_leaves, n_sites = map(int, header.split())
        for line in f:
            line = line.strip()
            if not line:
                continue
            taxa, char_vector = line.split()
            assert len(char_vector) == n_sites, (
                "taxon %s has %d sites, header declares %d"
                % (taxa, len(char_vector), n_sites)
            )
            for ch in char_vector:
                if ch not in alphabet and ch not in missing:
                    alphabet.append(ch)
            site_dict[taxa] = char_vector
            taxa_list.append(taxa)

    n_chars = len(alphabet)
    ll_mats = sites2Mat(site_dict, n_chars, alphabet, taxa_list)
    return n_leaves, n_chars, alphabet, site_dict, ll_mats, taxa_list, n_sites
def readPhy(fname):
    """Read a tab-separated ``.phy`` alignment with space-separated states.

    The first line of the file holds two integers: the number of leaves and
    the number of sites. Every following non-blank line is
    ``<taxon>\\t<states>``, where ``<states>`` is a list of tokens separated
    by single spaces. A token may combine several states with ``/``
    (e.g. ``a/b``).

    The alphabet is built from the individual states in order of first
    appearance; the missing-data markers ``?`` and ``-`` are never added.

    Parameters
    ----------
    fname : str
        Path of the file to read.

    Returns
    -------
    tuple
        ``(n_leaves, n_chars, alphabet, site_dict, ll_mats, taxa_list,
        n_sites)``. ``n_leaves`` and ``n_sites`` come from the header and are
        not checked against the rows. ``site_dict`` maps each taxon name
        (with spaces removed) to its list of state tokens, ``taxa_list``
        lists taxon names in file order and ``ll_mats`` is whatever
        ``sites2Mat`` returns for this data.

    Raises
    ------
    ValueError
        If the header is not two integers, or a row does not consist of
        exactly two tab-separated fields.
    """
    site_dict = {}
    alphabet, taxa_list = [], []
    missing = ("?", "-")

    # The context manager closes the file even when a malformed row raises.
    with open(fname) as f:
        header = f.readline().strip()
        # Split on any whitespace so tabs or repeated spaces are accepted.
        n_leaves, n_sites = map(int, header.split())
        for line in f:
            line = line.strip()
            if not line:
                continue
            taxa, char_vector = line.split("\t")
            # Taxon names may contain spaces in this format; drop them.
            taxa = taxa.replace(" ", "")
            states = char_vector.split(" ")
            for state in states:
                for tch in state.split("/"):
                    if tch not in alphabet and tch not in missing:
                        alphabet.append(tch)
            site_dict[taxa] = states
            taxa_list.append(taxa)

    n_chars = len(alphabet)
    ll_mats = sites2Mat(site_dict, n_chars, alphabet, taxa_list)
    return n_leaves, n_chars, alphabet, site_dict, ll_mats, taxa_list, n_sites
def readMultiPhy(fname):
    """Read a ``.phy`` alignment of single-character multistate data.

    The first line of the file holds two integers: the number of leaves and
    the number of sites. Every following non-blank line consists of a taxon
    name and a character string separated by whitespace, one character per
    site.

    The alphabet is built from the characters in order of first appearance;
    the missing-data markers ``?`` and ``-`` are never added.

    Parameters
    ----------
    fname : str
        Path of the file to read.

    Returns
    -------
    tuple
        ``(n_leaves, n_chars, alphabet, site_dict, ll_mats, taxa_list,
        n_sites)``. ``n_leaves`` and ``n_sites`` come from the header and are
        not checked against the rows. ``site_dict`` maps each taxon name to
        the list of its characters, ``taxa_list`` lists taxon names in file
        order and ``ll_mats`` is whatever ``sites2Mat`` returns for this
        data.

    Raises
    ------
    ValueError
        If the header is not two integers, or a row does not split into
        exactly two whitespace-separated fields.
    """
    site_dict = {}
    alphabet, taxa_list = [], []
    missing = ("?", "-")

    # The context manager closes the file even when a malformed row raises.
    with open(fname) as f:
        header = f.readline().strip()
        # Split on any whitespace so tabs or repeated spaces are accepted.
        n_leaves, n_sites = map(int, header.split())
        for line in f:
            line = line.strip()
            if not line:
                continue
            taxa, char_vector = line.split()
            for ch in char_vector:
                if ch not in alphabet and ch not in missing:
                    alphabet.append(ch)
            site_dict[taxa] = list(char_vector)
            taxa_list.append(taxa)

    n_chars = len(alphabet)
    ll_mats = sites2Mat(site_dict, n_chars, alphabet, taxa_list)
    return n_leaves, n_chars, alphabet, site_dict, ll_mats, taxa_list, n_sites
cpdef sites2Mat(dict sites, int n_chars, list alphabet, list taxa_list):
    """Turn per-taxon state sequences into 0/1 indicator matrices.

    Each site becomes a vector of length ``n_chars``:

    * ``?`` or ``-`` (missing data) gives all ones;
    * a plain state gives a one at that state's position in ``alphabet``;
    * a ``/``-joined token such as ``a/b`` gives a one for every listed
      state. If any part is itself ``?`` or ``-`` the whole vector is set to
      ones, because the readers accept such tokens without adding the marker
      to the alphabet.

    Parameters
    ----------
    sites : dict
        Maps taxon name to an iterable of state tokens (a string of
        single-character states or a list of tokens).
    n_chars : int
        Length of each indicator vector.
    alphabet : list
        State symbols; a symbol's first position is its vector index.
    taxa_list : list
        Taxon names; a name's first position plus one is its output key.

    Returns
    -------
    dict
        Maps the 1-based position of each taxon in ``taxa_list`` to an array
        of shape ``(n_chars, n_sites)``. Taxa with no sites get no entry.

    Raises
    ------
    ValueError
        If a state is not in ``alphabet`` or a taxon is not in ``taxa_list``.
    """
    cdef int k_idx
    cdef dict LL_MAT = {}
    missing = ("?", "-")

    # Build first-occurrence lookup tables once instead of calling
    # list.index() for every site and every taxon.
    char_index = {}
    for col, symbol in enumerate(alphabet):
        char_index.setdefault(symbol, col)
    taxon_index = {}
    for col, name in enumerate(taxa_list):
        taxon_index.setdefault(name, col + 1)

    for taxon, states in sites.items():
        rows = []
        for state in states:
            if state in missing:
                row = np.ones(n_chars)
            else:
                row = np.zeros(n_chars)
                # A token without "/" splits into itself, so plain and
                # polymorphic states share this loop.
                for part in state.split("/"):
                    if part in missing:
                        row.fill(1.0)
                        continue
                    col = char_index.get(part)
                    if col is None:
                        raise ValueError("%r is not in the alphabet" % (part,))
                    row[col] = 1.0
            rows.append(row)
        if not rows:
            # A taxon without sites produces no matrix.
            continue
        if taxon not in taxon_index:
            raise ValueError("%r is not in taxa_list" % (taxon,))
        k_idx = taxon_index[taxon]
        # rows is (n_sites, n_chars); transpose to (n_chars, n_sites).
        LL_MAT[k_idx] = np.array(rows, order="F").T
    return LL_MAT