-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmerge-files-col.sh
executable file
·228 lines (201 loc) · 5.77 KB
/
merge-files-col.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
#!/bin/bash
# Merge single columns from multiple files using standard command line
# utilities and additionally the custom 'transpose' program from the
# micans/reaper respository.
#
# Several checks (e.g. row name identiy) and options are available, see the -h
# output.
# Intermediate output and final output are both gzipped.
#
# This program uses cut(1) to isolate the columns; it writes each sequentially
# using paste(1) as a single line and appends the lines. This leads to a
# result which is the transposed of what is needed; In terms of a standard
# application (input columns are gene counts across multiple conditions) output
# row names now correspond to samples and column names correspond to gene names.
#
# The final result is obtained by using the custom program 'transpose', which
# is a C implementation of the needed transpose operation This combined
# approach is generally a lot faster than using R to load tables and cbind to
# combine the columns into a new table.
#
# TODO: clean-up of extra files; perhaps work in /tmp.
#
set -euo pipefail
# For these options, see the documentation under -h
# for a description of what they do.
basename=out
test_tp=false
header=true
rowdatastart=2
rownames_check=true
help=false
col=2
colname=
colname_expected=
stripright=
stripleft=
output=
skip=0
verbose=false
function clean_up {
thestatus=$?
if $help; then
true
elif [[ $thestatus == 0 ]]; then
>&2 echo "[happy] Script succeeded - gzipped output in $output"
if [[ -e "$tpfile.TP" ]]; then
rm "$tpfile.TP"
>&2 echo "removed file $tpfile.TP after successful test"
fi
else
>&2 echo "[grumpy] Script failed"
fi
}
## register the function clean_up to be run when script exits,
## after all the other code in this script.
## At the moment it does not do much.
##
trap clean_up INT TERM EXIT
while getopts :c:C:E:b:o:s:x:y:VLTHh opt
do
case "$opt" in
b)
basename=$OPTARG
;;
c)
col=$OPTARG
;;
x)
stripleft=$OPTARG
;;
y)
stripright=$OPTARG
;;
C)
colname=$OPTARG
;;
H)
header=false
rowdatastart=1
;;
E)
colname_expected=$OPTARG
;;
o)
output=$OPTARG
;;
T)
test_tp=true
;;
L)
rownames_check=false
;;
V)
verbose=true
;;
s)
skip=$(($OPTARG+1))
;;
h)
cat <<EOU
Usage: merge-col-files.sh [ -c <NUM> ] [ MORE OPTIONS ] FILES
Options:
-b basename basename for output (default $basename)
-c <NUM> column index (default 2)
-C <colname> rename column to <colname> (default taken from column head)
-E <check> require file column name to be <check> (mnemonic: expect)
Caveat: currently this check is only performed for the first file in the list.
-o <name> write output to name (default: <basename>.<colname>.txt.gz)
-x pattern for result column name, strip pattern from the left of file name
-y pattern for result column name, strip pattern from the right of file name
-T test use of the transpose program (re-transpose, compare with original)
-L lax; do not test whether row names (first column) are identical between files
-V verbosity on
-s <num> skip first <num> lines
EOU
help=true
exit
;;
:) >&2 echo "Flag $opt needs argument"
exit 1;;
?) >&2 echo "Flag $OPTARG unknown"
exit 1;;
esac
done
OPTIND=$(($OPTIND-1))
shift $OPTIND
if [[ $# == 0 ]]; then
>&2 echo "Error (no files provided)"
false
fi
firstfile=$1
if $header; then
# get the column labels of interest, e.g. gene counts.
#
read -a colnames <<EOF
$(tail -n +$skip "$firstfile" | head -n 1 | cut -f $col)
EOF
>&2 echo "Found column name ${colnames[0]}"
if [[ ! -z "$colname" ]]; then
>&2 echo "Using supplied column name $colname"
else
colname=${colnames[0]}
fi
if [[ ! -z "$colname_expected" && "$colname_expected" != "$colname" ]]; then
>&2 echo "Expected column name $colname_expected but found $colname"
false
fi
else
colname=col${col}
fi
tpfile=$basename.$colname.TP.gz
dstfile=$basename.$colname.txt.gz
rownamefile=$basename.$colname.rn
idx=1
# Get the row labels of interest, e.g. gene names.
# Save them for comparison with rownames in later files,
# and write them to the destination file (gzipped).
#
(tail -n +$skip "$firstfile" | cut -f 1 | tee "$rownamefile" | paste -s | gzip) > "$tpfile"
#
# Note: below we append to tpfile (after gzipping). This is OK because
# zipfiles can be concatenated.
(
for f in $@; do
level=$f
if [[ ! -z "$stripleft" ]]; then
level=${level#$stripleft}
fi
if [[ ! -z "$stripright" ]]; then
level=${level%$stripright}
fi
if $verbose; then
>&2 printf "%5d $level\n" $idx
fi
idx=$(($idx+1))
if $rownames_check && ! cmp -s "$rownamefile" <(tail -n +$skip $f | cut -f 1) ; then
>&2 echo "rownames check failed when comparing $firstfile and $f"
false
fi
(echo "$level"; cut -f $col $f | tail -n +$skip | tail -n +$rowdatastart) | transpose -i - --nozip -o -
done
) | gzip >> $tpfile
if [[ -z "$output" ]]; then
output=$dstfile
fi
transpose -i $tpfile -o $output
if $test_tp; then
transpose -i "$output" -o "$tpfile.TP"
files="$tpfile and $tpfile.TP"
if ! cmp -s <(zcat "$tpfile") <(zcat "$tpfile.TP"); then
>&2 echo "Files $files are not byte-identical"
false
else
>&2 echo "Test passed - $files are byte-identical"
fi
fi
# Todo:
# - improve tpfile dstfile rownamefile; enable directories and play
# nicely with -o option. e.g. -o output/base
# - -s option exists, perhaps also skip initial regex (e.g. initial lines
# starting with #