-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathSEQ_to_A3Ms.sh
executable file
·347 lines (314 loc) · 9.82 KB
/
SEQ_to_A3Ms.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
#!/bin/bash
#---------------------------------------------------------#
##### ===== All functions are defined here ====== #########
#---------------------------------------------------------#
# ----- usage ------ #
usage()
{
echo "SEQ_to_A3Ms.sh v1.00 [Dec-30-2019] "
echo " Generate a large set of A3Ms from a given FASTA sequence by certain strategies. "
echo " [note]: the strategy string shall be 'Evalue:Iteration:AddiMeff|...|...', set 'null' to omit. "
echo ""
echo "USAGE: ./SEQ_to_A3Ms.sh <-i input_fasta> [-x UC_strategy] [-y UR_strategy] [-z NR_strategy] "
echo " [-o out_root] [-c CPU_num] [-X UC_database] [-Y UR_database] [-z NR_database] "
echo " [-M min_cut] [-N max_num] [-V addi_eval] [-D addi_db] [-K remove_tmp] [-f force] [-H home] "
echo "Options:"
echo ""
echo "***** required arguments *****"
echo "-i input_fasta : The input sequence in FASTA format. "
echo ""
echo "***** strategy arguments *****"
echo "#--| strategy string"
echo "-x UC_strategy : The strategy string for HHblits against UniClust30. [default = '1e-3:3:6|1:3:6'] "
echo ""
echo "-y UR_strategy : The strategy string for JackHMMER against UniRef90. [default = '1e-3:3:6|1e-5:3:6'] "
echo ""
echo "-z NR_strategy : The strategy string for BuildAli2 against NR90+NR70. [default = '1e-3:5:6|1:5:6'] "
echo ""
echo "#--| relevant database"
echo "-X UC_database : The UniClust30 database for HHblits. [default = uniclust30 ] "
echo ""
echo "-Y UR_database : The UniRef90 database for JackHMMER. [default = uniref90 ] "
echo ""
echo "-Z NR_database : The NR90+NR70 database for BuildAli2. [default = nr_databases ] "
echo ""
echo "***** optional arguments *****"
echo "#--| general options"
echo "-o out_root : Default output would the current directory. [default = './\${input_name}_MultA3Ms'] "
echo ""
echo "-c CPU_num : CPU number. [default = 4] "
echo ""
echo "#--| filter strategy"
echo "-M min_cut : Minimal coverage of sequences in the generated MSA. [default = -1] "
echo " -1 indicates that we DON'T perform any filtering. Please set from 50 to 70. "
echo ""
echo "-N max_num : Maximal number of sequences in the generated MSA. [default = 20000] "
echo " -1 indicates that we DON'T perform any filtering. By default, we set 20000 here. "
echo ""
echo "#--| additional A3M"
echo "-V addi_eval : run additional A3M with a given e-value. [default = 0.001] "
echo ""
echo "-D addi_db : run additional A3M using a given database. [default = metaclust50] "
echo ""
echo "#--| other options"
echo "-K remove_tmp : Remove temporary folder or not. [default = 1 to remove] "
echo ""
echo "-f force : If specificied, then FORCE overwrite existing files. [default = 0 NOT to] "
echo ""
echo "***** home relevant roots ******"
echo "-H home : home directory of SEQ_to_A3Ms.sh "
echo " [default = `dirname $0`]"
echo ""
exit 1
}
#-------------------------------------------------------------#
##### ===== get pwd and check HomeDirectory ====== #########
#-------------------------------------------------------------#
#------ current directory ------#
curdir="$(pwd)"
#-------- check usage -------#
if [ $# -lt 1 ];
then
usage
fi
#---------------------------------------------------------#
##### ===== All arguments are defined here ====== #########
#---------------------------------------------------------#
# ----- get arguments ----- #
#-> required arguments
input_fasta=""
#-> search strategy
#--| strategy string
UC_strategy="1e-3:3:6|1:3:6"
UR_strategy="1e-3:3:6|1e-5:3:6"
NR_strategy="1e-3:5:6|1:5:6"
#--| relevant database
UC_database="uniclust30"
UR_database="uniref90"
NR_database="nr_databases"
#-> optional arguments
#--| general options
out_root="" #-> output root
CPU_num=4 #-> default CPU number is 4
#--| filter strategies
min_cut=-1 #-> default is -1. If set, then run Cov_Filter
max_num=20000 #-> default is 20000. If set, then run Meff_Filter
#--| additional a3m
addi_eval="1e-3" #-> default is 0.001
addi_db="metaclust50" #-> can be ANY sequence database in plain text format
#--| others
kill_tmp=1 #-> default: kill temporary root
force=0 #-> default: NOT force overwrite
#-> home relevant
home=`dirname $0` #-> home directory
#-> parse arguments
while getopts "i:x:y:z:X:Y:Z:o:c:M:N:V:D:K:f:H:" opt;
do
case $opt in
#-> required arguments
i)
input_fasta=$OPTARG
;;
#-> strategy arguments
#--| strategy string
x)
UC_strategy=$OPTARG
;;
y)
UR_strategy=$OPTARG
;;
z)
NR_strategy=$OPTARG
;;
#--| relevant database
X)
UC_database=$OPTARG
;;
Y)
UR_database=$OPTARG
;;
Z)
NR_database=$OPTARG
;;
#-> optional arguments
#--| general options
o)
out_root=$OPTARG
;;
c)
CPU_num=$OPTARG
;;
#--| filter strategies
M)
min_cut=$OPTARG
;;
N)
max_num=$OPTARG
;;
#--| additional a3m
V)
addi_eval=$OPTARG
;;
D)
addi_db=$OPTARG
;;
#--| others options
K)
kill_tmp=$OPTARG
;;
f)
force=$OPTARG
;;
#-> home relevant
H)
home=$OPTARG
;;
#-> help
\?)
echo "Invalid option: -$OPTARG" >&2
exit 1
;;
:)
echo "Option -$OPTARG requires an argument." >&2
exit 1
;;
esac
done
#---------------------------------------------------------#
##### ===== Part 0: initial argument check ====== #########
#---------------------------------------------------------#
# ------ check home directory ---------- #
if [ ! -d "$home" ]
then
echo "home directory $home not exist " >&2
exit 1
fi
home=`readlink -f $home`
#------- check input_fasta -----------#
if [ -z "$input_fasta" ]
then
echo "input input_fasta is null !!" >&2
exit 1
fi
if [ ! -s "$input_fasta" ]
then
echo "input_fasta $input_fasta not found !!" >&2
exit 1
fi
input_fasta=`readlink -f $input_fasta`
#-> get query_name
fulnam=`basename $input_fasta`
relnam=${fulnam%.*}
# ------ check output directory -------- #
if [ "$out_root" == "" ]
then
out_root=$curdir/${relnam}_MultA3Ms
fi
mkdir -p $out_root
out_root=`readlink -f $out_root`
#------------------------------------------------------#
##### ===== Part 1: Generate multi A3Ms ====== #########
#------------------------------------------------------#
#--------------init step: run strategies --------------------#
echo "#================ init step: run strategies to generate A3Ms =====================#"
for str in UC_strategy UR_strategy NR_strategy
do
#----- initialization -----#
st_code=${str:0:2}
if [ "$st_code" == "UC" ]
then
method=hhsuite2
strategies=$UC_strategy
database=$UC_database
echo "#------------------- step 1: run $method on $database with strategy $UC_strategy -------------------- #"
elif [ "$st_code" == "UR" ]
then
method=jackhmm
strategies=$UR_strategy
database=$UR_database
echo "#------------------- step 2: run $method on $database with strategy $UR_strategy -------------------- #"
else
method=buildali2
strategies=$NR_strategy
database=$NR_database
echo "#------------------- step 3: run $method on $database with strategy $NR_strategy -------------------- #"
fi
#------- check null --------#
if [ "$strategies" != "null" ] && [ "$strategies" != "" ]
then
st_num=`echo $strategies | awk -F"|" '{print NF}'`
#-> for each strategy
for ((i=1;i<=$st_num;i++))
do
#--| get strategy block
strategy=`echo $strategies | cut -d '|' -f $i`
#--| get each item
e_value=`echo $strategy | cut -d ':' -f 1`
iteration=`echo $strategy | cut -d ':' -f 2`
addi_a3m=`echo $strategy | cut -d ':' -f 3`
#--| check final output
outn=${relnam}_${st_code}_${e_value}_${iteration}_${addi_a3m}
if [ -s "$out_root/$outn.a3m" ]
then
continue
fi
#--| create output
outr=$out_root/$outn
mkdir -p $outr
#--| run job
if [ ! -s $outr/$relnam.a3m ] || [ $force -eq 1 ]
then
#---- submit jobs in parallel
$home/A3M_TGT_Gen.sh -i $input_fasta -h $method -d $database -c $CPU_num -e $e_value -n $iteration -A $addi_a3m -C -1 \
-o $outr -M $min_cut -N $max_num -V $addi_eval -D $addi_db -K $kill_tmp -f $force &
#---- submit jobs in consideration of their strategy types
#if [ "$st_code" == "NR" ] #-> for NR, as the memory issue, we HAVE TO run jobs sequentially
#then
# $home/A3M_TGT_Gen.sh -i $input_fasta -h $method -d $database -c $CPU_num -e $e_value -n $iteration -A $addi_a3m -C -1 \
# -o $outr -M $min_cut -N $max_num -V $addi_eval -D $addi_db -K $kill_tmp -f $force &
#else #-> for UC and UR, we MAY run jobs in parallel
# $home/A3M_TGT_Gen.sh -i $input_fasta -h $method -d $database -c $CPU_num -e $e_value -n $iteration -A $addi_a3m -C -1 \
# -o $outr -M $min_cut -N $max_num -V $addi_eval -D $addi_db -K $kill_tmp -f $force &
#fi
fi
done
wait
fi
done
#-------------- final collections -----------------#
echo "#============= final step: collect ALL successfully generated A3Ms ============#"
for str in UC_strategy UR_strategy NR_strategy
do
#-> initialization
st_code=${str:0:2}
strategies=`eval echo \\$$str`
#-> check null
if [ "$strategies" != "null" ] && [ "$strategies" != "" ]
then
st_num=`echo $strategies | awk -F"|" '{print NF}'`
#-> for each strategy
for ((i=1;i<=$st_num;i++))
do
#--| get strategy block
strategy=`echo $strategies | cut -d '|' -f $i`
#--| get each item
e_value=`echo $strategy | cut -d ':' -f 1`
iteration=`echo $strategy | cut -d ':' -f 2`
addi_a3m=`echo $strategy | cut -d ':' -f 3`
#--| get output file
outn=${relnam}_${st_code}_${e_value}_${iteration}_${addi_a3m}
outr=$out_root/$outn
if [ -s $outr/$relnam.a3m ]
then
mv $outr/$relnam.a3m $out_root/$outn.a3m
fi
#--| remove temporary folder
if [ $kill_tmp -eq 1 ]
then
rm -rf $outr
fi
done
fi
done
#======================= exit ====================#
exit 0