-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathalignCorpora.sh
executable file
·38 lines (24 loc) · 1.4 KB
/
alignCorpora.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/bin/bash
#
# Prepares two parallel text files for MGIZA and runs the aligner in both
# directions (first -> second and second -> first).
#
# Usage:
#   alignCorpora.sh <file.en> <file.es> <dest_dir>
#
# Arguments:
#   $1  first text file of the parallel corpus
#   $2  second text file of the parallel corpus
#   $3  destination directory for every generated file (created if missing)
#
# Environment:
#   GIZA  directory holding the plain2snt, snt2cooc and mgiza binaries.
#         Falls back to the historical hard-coded location when unset/empty.
#
# Exit status:
#   0  every step reported success
#   1  an input, a tool or a processing step failed
#   2  wrong number of arguments

GIZA=${GIZA:-/home/lvapeab/smt/software/mgiza/mgizapp/bin}

# Print a message on stderr and abort with a failure status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if [[ $# -ne 3 ]]; then
  # Misuse must be visible to callers: report on stderr, non-zero status.
  printf 'Usage: %s <file.en> <file.es> <dest_dir>\n' "${0##*/}" >&2
  exit 2
fi

text_1=$1
text_2=$2
prefix=$3

# Fail early on unreadable inputs or missing tools instead of half-way through.
for input in "${text_1}" "${text_2}"; do
  [[ -r "${input}" ]] || die "Cannot read input file: ${input}"
done
for tool in plain2snt snt2cooc mgiza; do
  [[ -x "${GIZA}/${tool}" ]] || die "Missing or non-executable tool: ${GIZA}/${tool}"
done

name_1=$(basename -- "${text_1}")
name_2=$(basename -- "${text_2}")

# Output names are derived from the basenames only; identical basenames would
# make both directions write to the very same files.
[[ "${name_1}" != "${name_2}" ]] \
  || die "Input files must have different base names (both are '${name_1}')"

mkdir -p -- "${prefix}" || die "Cannot create destination directory: ${prefix}"

vcb_1=${prefix}/${name_1}.vcb
vcb_2=${prefix}/${name_2}.vcb
pair_12=${prefix}/${name_1}_${name_2}
pair_21=${prefix}/${name_2}_${name_1}

echo "Obtaining target and source vocabulary files (.vcb) and sentence pair file (.snt)."
"${GIZA}/plain2snt" "${text_1}" "${text_2}" \
  -snt1 "${pair_12}.snt" -snt2 "${pair_21}.snt" \
  -vcb1 "${vcb_1}" -vcb2 "${vcb_2}" \
  || die "plain2snt failed"
echo "Done"

echo "Creating coocurrence file"
"${GIZA}/snt2cooc" "${pair_12}.cooc" "${vcb_1}" "${vcb_2}" "${pair_12}.snt" \
  || die "snt2cooc failed for ${name_1} -> ${name_2}"
"${GIZA}/snt2cooc" "${pair_21}.cooc" "${vcb_2}" "${vcb_1}" "${pair_21}.snt" \
  || die "snt2cooc failed for ${name_2} -> ${name_1}"

# The two alignment runs are independent of each other, so a failure in one
# direction is reported but does not prevent the other from being attempted.
status=0

echo "Aligning with GIZA++"
echo "One way (${name_1} -> ${name_2})"
# NB: "-coocurrencefile" is the option's actual spelling; do not "correct" it.
if ! "${GIZA}/mgiza" -s "${vcb_1}" -t "${vcb_2}" -c "${pair_12}.snt" \
    -coocurrencefile "${pair_12}.cooc" -o "${pair_12}" \
    -model1dumpfrequency 1; then
  printf 'mgiza reported a failure for %s -> %s\n' "${name_1}" "${name_2}" >&2
  status=1
fi

echo "And the other (${name_2} -> ${name_1})"
if ! "${GIZA}/mgiza" -s "${vcb_2}" -t "${vcb_1}" -c "${pair_21}.snt" \
    -coocurrencefile "${pair_21}.cooc" -o "${pair_21}" \
    -model1dumpfrequency 1; then
  printf 'mgiza reported a failure for %s -> %s\n' "${name_2}" "${name_1}" >&2
  status=1
fi

echo "Done"
exit "${status}"