-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutil.py
193 lines (156 loc) · 6.18 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
import pandas as pd
import numpy as np
import math
from shutil import copyfile
from scipy.stats import ttest_ind
import chardet
import numpy as np
from sys import exit
from collections import defaultdict
# version 1.0 author: Lijia Che
# initial implementation
# version 1.1 author: Yiqi Jiang
# add warning log, generate group information from metadata, convert mpa to newick
# version 1.2 author: Yiqi Jiang
# add filter_metadata_att
def delete_zero_row(data):
    """Drop taxonomy rows whose abundance is zero in every sample.

    Prints a warning for each discarded taxonomy, then returns the
    remaining rows reindexed in sorted taxonomy order.

    Args:
        data: DataFrame with taxonomy strings as index and samples as columns.

    Returns:
        The filtered, sorted DataFrame.
    """
    # Vectorized row sums instead of apply(np.sum); avoids the private
    # `_stat_axis` attribute, which was removed in modern pandas.
    row_sums = data.sum(axis=1)
    for tax_name in data.index[row_sums == 0]:
        print('Warning: Discard taxonomy {} for no sample have profile in the profiling table.'.format(tax_name))
    data = data[row_sums != 0]
    # Deterministic (sorted) taxonomy order for downstream output.
    return data.reindex(sorted(data.index))
def tax_split(data1):
    """Split a merged mpa-style profiling table into one table per rank.

    Rows are grouped by the number of '|'-separated levels in their
    taxonomy string (1 level = kingdom 'k' ... 8 levels = strain 't').

    Args:
        data1: DataFrame with taxonomy strings as index, samples as columns.

    Returns:
        Dict mapping rank letter -> DataFrame of that rank's rows; the
        't' entry is dropped when no strain-level rows exist.
    """
    # kpcofgst: index 1..8 maps the level count to a rank letter.
    len_relation = [None, 'k', 'p', 'c', 'o', 'f', 'g', 's', 't']
    row_names = {layer: [] for layer in len_relation[1:]}
    for tax in data1.index:
        nlayers = len(tax.split('|'))
        row_names[len_relation[nlayers]].append(tax)
    # One vectorized .loc selection per rank instead of copying the
    # original cell-by-cell (which was O(rows * samples) lookups).
    merged_data = {layer: data1.loc[names].copy()
                   for layer, names in row_names.items()}
    if merged_data['t'].empty:
        del merged_data['t']
    return merged_data
# to check input
def check_valid(gf, mergef):
    """Cross-check group information against the merged profiling table.

    Verifies that the sample sets match in both directions and that no
    sample column sums to zero.  Prints one error line per problem.

    Args:
        gf: group DataFrame indexed by sample name.
        mergef: profiling DataFrame with sample names as columns.

    Returns:
        True when everything is consistent, False otherwise.
    """
    # Sets give O(1) membership tests (the original scanned lists).
    abd_samples = set(mergef.columns)
    group_samples = set(gf.index)
    valid = True
    for sample in gf.index:
        if sample not in abd_samples:
            # Fixed typo: "dose" -> "does".
            print('Error: profile of sample {} in group information does NOT exist.'.format(sample))
            valid = False
    for sample in mergef.columns:
        if sample not in group_samples:
            print('Error: group information of sample {} in profile does NOT exist.'.format(sample))
            valid = False
    # Only look for all-zero samples once the sample sets agree.
    if valid:
        for sample in mergef.columns[(mergef == 0).all(axis=0)]:
            print('Error: the profiling sum of sample {} is 0.'.format(sample))
            valid = False
    return valid
def auto_decode(ifile):
    """Read a tab-separated file into a DataFrame, auto-detecting encoding.

    Uses chardet on the raw bytes to guess the encoding, then parses with
    the first row as header and the first column as index.

    Args:
        ifile: path to the TSV file.

    Returns:
        Parsed DataFrame.
    """
    # Read the bytes inside a context manager so the handle is always
    # closed (the original leaked an open file object).
    with open(ifile, 'rb') as handle:
        raw = handle.read()
    ec_type = chardet.detect(raw)['encoding']
    return pd.read_csv(ifile, header=0, sep='\t', index_col=0, encoding=ec_type)
def metadata2gf(metadata, group_id='phenotype'):
    """Build the group table from a metadata file.

    Keeps only the *group_id* column, which must exist and contain
    exactly two distinct levels; exits with an error otherwise.
    """
    data = auto_decode(metadata)
    # Guard clauses: bail out early on any invalid input.
    if group_id not in data.columns:
        print('Error: the column {} does not exists in the metadata input.'.format(group_id))
        exit(1)
    if len(data[group_id].unique()) != 2:
        print('Error: the column {} does not have exact 2 level.'.format(group_id))
        exit(1)
    return data[[group_id]]
# main merge function
def get_merged(ifile1, ifile2):
    """Load one or two profiling tables and merge them into a single table.

    Either input may be absent ('' / None / 'None'); at least one must be
    given.  Two tables are outer-joined on taxonomy, gaps filled with 0,
    and all-zero rows discarded.
    """
    empty_values = ('', None, 'None')
    missing1 = ifile1 in empty_values
    missing2 = ifile2 in empty_values
    if missing1 and missing2:
        print('Error: please input profile.')
        exit(1)
    if missing1:
        merged_df = auto_decode(ifile2)
    elif missing2:
        merged_df = auto_decode(ifile1)
    else:
        merged_df = pd.merge(auto_decode(ifile1), auto_decode(ifile2),
                             how='outer', left_index=True, right_index=True).fillna(0)
    merged_df = delete_zero_row(merged_df)
    merged_df.index.name = 'Taxonomy'
    return merged_df
# convert mpa to newick tree
def tree():
    """Autovivifying nested dict: missing keys create new tree() nodes."""
    return defaultdict(tree)
def dicts(tree):
    """Recursively convert a nested defaultdict tree into plain dicts."""
    plain = {}
    for key in tree:
        value = tree[key]
        # Anything mapping-like is treated as a subtree and recursed.
        plain[key] = dicts(value) if hasattr(value, 'items') else value
    return plain
def newickify(node_to_children, root_node, dist) -> str:
    """Render a {parent: {child: distance}} mapping as a Newick string.

    Args:
        node_to_children: dict mapping each internal node name to a dict
            of child name -> branch distance.
        root_node: name of the node to start rendering from.
        dist: branch distance attached to the root itself.

    Returns:
        The Newick representation, terminated with ';'.

    Raises:
        AssertionError: if the mapping is circular or contains internal
            nodes never reached from the root.
    """
    visited_nodes = set()

    def newick_render_node(name, distance: float) -> str:
        # Explicit raise instead of `assert`, so the cycle check still
        # runs when Python is started with -O (asserts are stripped);
        # keeps AssertionError so existing callers see the same type.
        if name in visited_nodes:
            raise AssertionError("Error: The tree may not be circular!")
        if name not in node_to_children:
            # Leaf node: just name:distance.
            return F'{name}:{distance}'
        # Internal node: render all children recursively.
        visited_nodes.add(name)
        children = node_to_children[name]
        children_strings = ",".join(
            newick_render_node(child, children[child]) for child in children)
        return F'({children_strings}){name}:{distance}'

    newick_string = newick_render_node(root_node, dist) + ';'
    # Ensure every internal node in the mapping was actually reached.
    if visited_nodes != set(node_to_children.keys()):
        raise AssertionError("Error: some nodes aren't in the tree")
    return newick_string
def map2newick(dat, dist):
    """Convert an mpa-style table index into a Newick tree string.

    Every taxonomy string in dat.index is split on '|'; each consecutive
    pair of levels becomes a parent->child edge with branch length *dist*,
    and top-level taxa hang off a synthetic 'root' node.
    """
    tree_hash = tree()
    for taxonomy in dat.index:
        levels = taxonomy.split('|')
        tree_hash['root'][levels[0]] = dist
        # Pairwise walk down the lineage: parent -> child edges.
        for parent, child in zip(levels, levels[1:]):
            tree_hash[parent][child] = dist
    return newickify(tree_hash, 'root', dist)
def filter_metadata_att(metadata, group_id='phenotype'):
    """Select usable metadata attributes for downstream analysis.

    Validates that *group_id* exists and has exactly two levels, then
    keeps at most 10 attributes: categorical (object dtype) columns with
    2-10 distinct levels, and numeric columns with more than one distinct
    value.

    Args:
        metadata: path to the metadata file (read via auto_decode).
        group_id: name of the required two-level grouping column.

    Returns:
        DataFrame restricted to the selected attribute columns.
    """
    data = auto_decode(metadata)
    if group_id in data.columns:
        if len(data[group_id].unique()) != 2:
            print('Error: the column {} does not have exact 2 level.'.format(group_id))
            exit(1)
    else:
        print('Error: the column {} does not exists in the metadata input.'.format(group_id))
        exit(1)
    outputlist = []
    for att in data.columns:
        n_levels = len(data[att].unique())
        if data[att].dtypes == object:
            if n_levels != 1 and n_levels <= 10:
                outputlist.append(att)
            else:
                print('Warning: Discard metadata attribute {} for the level number of this attribute equal to 1 or greater than 10.'.format(att))
        elif n_levels != 1:
            outputlist.append(att)
        # Enforce the 10-attribute cap after every append.  The original
        # only checked it in the numeric branch (and only with ==), so
        # object-dtype columns could push the output past 10 attributes.
        if len(outputlist) >= 10:
            print('Warning: Too many attributes in metadata, reserved the first 10 attributes in the output.')
            break
    return data[outputlist]