-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsplit_corpus.sh
executable file
·53 lines (40 loc) · 960 Bytes
/
split_corpus.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/bin/bash
#
# split_corpus.sh - split a sentence-aligned parallel corpus into a dev set
# and a training set, optionally shuffling the sentence pairs first.
#
# Usage:
#   split_corpus.sh corpus_1 corpus_2 l1 l2 n_sents [-s]
#
# Arguments:
#   corpus_1, corpus_2  line-aligned corpus files (line i of one file
#                       corresponds to line i of the other)
#   l1, l2              suffixes used for the output file names
#   n_sents             number of leading sentence pairs that go to dev
#   -s                  shuffle the sentence pairs before splitting
#   --help              print usage and exit
#
# Outputs (written next to corpus_1):
#   dev.<l1>, dev.<l2>            the first n_sents lines
#   training.<l1>, training.<l2>  all remaining lines
#
# Exit status: 0 on success, 1 on a usage error or any failed step.

set -euo pipefail

# Print the usage text to stdout (callers redirect to stderr on errors).
usage() {
  local prog
  prog=$(basename -- "$0")
  echo "Usage: ${prog} corpus_1 corpus_2 l1 l2 n_sents [-s]"
  echo "Splits and optionally shuffles the given corpora"
  echo "  -s      shuffle the sentence pairs before splitting"
  echo "  --help  show this message"
  echo "example: ${prog} training.es training.en es en 1000"
}

# Print an error message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Remove the private working directory, if one was created.
workdir=
cleanup() {
  if [[ -n "${workdir}" ]]; then
    rm -rf -- "${workdir}"
  fi
}
trap cleanup EXIT

# Separate flags from positional arguments without shifting, so that no
# argument is skipped and "-s" may appear anywhere on the command line.
shuffle=0
positional=()
for arg in "$@"; do
  case "${arg}" in
    --help)
      usage
      exit 0
      ;;
    -s)
      shuffle=1
      ;;
    *)
      positional+=("${arg}")
      ;;
  esac
done

# All five positionals are required; extra ones are ignored as before.
if (( ${#positional[@]} < 5 )); then
  usage >&2
  exit 1
fi

corpus1=${positional[0]}
corpus2=${positional[1]}
l1=${positional[2]}
l2=${positional[3]}
nsents=${positional[4]}

[[ "${nsents}" =~ ^[0-9]+$ ]] \
  || die "n_sents must be a non-negative integer, got '${nsents}'"
# Force base 10 so a value such as "010" is not read as octal below.
nsents=$((10#${nsents}))

for corpus in "${corpus1}" "${corpus2}"; do
  [[ -r "${corpus}" ]] || die "cannot read corpus file: ${corpus}"
done

# Outputs go next to the first corpus; resolve this before corpus1 is
# possibly redirected to a shuffled temporary copy.
destdir=$(dirname -- "${corpus1}")

# Private scratch directory inside destdir: unpredictable name (safe for
# concurrent runs) and on the same filesystem as the final outputs.
workdir=$(mktemp -d "${destdir}/split_corpus.XXXXXX") \
  || die "cannot create a working directory in ${destdir}"

if (( shuffle )); then
  echo "Shuffling"
  # Join the two sides with a tab so each pair is shuffled as one unit.
  # NOTE: this assumes the corpus lines themselves contain no tab characters.
  paste -- "${corpus1}" "${corpus2}" | shuf > "${workdir}/pairs"
  cut -f 1 -- "${workdir}/pairs" > "${workdir}/c1"
  cut -f 2 -- "${workdir}/pairs" > "${workdir}/c2"
  rm -f -- "${workdir}/pairs"
  corpus1=${workdir}/c1
  corpus2=${workdir}/c2
fi

echo "Splitting"
# dev gets lines 1..n_sents, so training must start at n_sents + 1;
# starting at n_sents would put that sentence in both sets.
first_training_line=$((nsents + 1))

# Stage every output in the scratch directory first: an input may carry the
# same name as an output (e.g. training.<l1>), and redirecting straight onto
# it would truncate the file before it has been read.
head -n "${nsents}" -- "${corpus1}" > "${workdir}/dev.1"
head -n "${nsents}" -- "${corpus2}" > "${workdir}/dev.2"
tail -n "+${first_training_line}" -- "${corpus1}" > "${workdir}/training.1"
tail -n "+${first_training_line}" -- "${corpus2}" > "${workdir}/training.2"

mv -f -- "${workdir}/dev.1" "${destdir}/dev.${l1}"
mv -f -- "${workdir}/dev.2" "${destdir}/dev.${l2}"
mv -f -- "${workdir}/training.1" "${destdir}/training.${l1}"
mv -f -- "${workdir}/training.2" "${destdir}/training.${l2}"