-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathugc_norm.sh
executable file
·100 lines (86 loc) · 2.61 KB
/
ugc_norm.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/bin/bash
# Usage: ugc_norm.sh INPUT_DIR OUTPUT_DIR
#
# Exactly two arguments are required: the input directory holding the txt
# files to normalise and the output directory. With any other argument count
# the usage hint is printed on stderr (so it never mixes with captured
# stdout) and the script exits with status 255.
if [[ $# -ne 2 ]]; then
  echo 'Passar o diretorio de entrada com txt para normalizar e o diretorio de saida' >&2
  exit 255
fi
# Configuration
# The tokenizer and the speller are looked up relative to the directory the
# script is launched from ($PWD), not relative to the script file.
TOKENIZER=$PWD/tokenizer/webtok
SPELLER_DIR=$PWD/speller
SPELLER_ARGS=
INPUT_DIR=$1
OUTPUT_DIR=$2
# Let the Perl and Python tools started later import modules from the speller
# directory.
export PERL5LIB=$SPELLER_DIR
export PYTHONPATH=$SPELLER_DIR

# Resolve input and output directories to absolute paths. The arguments are
# quoted because IFS still has its default value here, so a path containing
# spaces would otherwise be split into several words.
if ! INPUT_DIR=$(readlink -f -- "$INPUT_DIR") || [[ ! -d "$INPUT_DIR" ]]; then
  echo "Diretorio de entrada nao encontrado: $1" >&2
  exit 1
fi
# readlink -f prints nothing when the path cannot be resolved; an empty
# OUTPUT_DIR would make the later stages operate on /tok, so stop here.
if ! OUTPUT_DIR=$(readlink -f -- "$OUTPUT_DIR") || [[ -z "$OUTPUT_DIR" ]]; then
  echo "Nao foi possivel resolver o diretorio de saida: $2" >&2
  exit 1
fi
# The output directory is wiped below. Refuse to continue when it is the
# input directory itself or one of its ancestors, which would delete the
# data that is about to be processed.
case "$INPUT_DIR/" in
  "${OUTPUT_DIR%/}"/*)
    echo "O diretorio de saida nao pode ser (ou conter) o diretorio de entrada: $OUTPUT_DIR" >&2
    exit 1
    ;;
esac

# Processing
# Restrict word splitting to newline and backspace for the rest of the script
# so that unquoted path expansions are not split on spaces. The previous
# value is kept in SAVEIFS so it can be restored at the end of the script.
SAVEIFS=$IFS
IFS=$'\n\b'
# Start from an empty output directory.
rm -rf -- "${OUTPUT_DIR:?}"
mkdir -- "$OUTPUT_DIR" || exit 1
# tokenizer
##################################################
# Feeds every *.txt file found under $INPUT_DIR to $TOKENIZER on stdin and
# stores its stdout in $OUTPUT_DIR/tok/ under the same file name.
echo
echo "###"
echo
echo "Aplicando tokenizador em $INPUT_DIR/"
# ${OUTPUT_DIR:?} aborts the script instead of touching /tok when the
# variable is unexpectedly empty.
rm -rf -- "${OUTPUT_DIR:?}/tok"
mkdir -- "$OUTPUT_DIR/tok"
# Gather the file list NUL-delimited so names containing spaces, newlines or
# glob characters reach the loop unchanged, whatever IFS is set to. The list
# is complete before the first file is processed.
tok_inputs=()
while IFS= read -r -d '' f; do
  tok_inputs+=("$f")
done < <(find "$INPUT_DIR/" -name "*.txt" -print0)
for f in "${tok_inputs[@]}"; do
  # ${f##*/} is the file name without its directory part.
  "$TOKENIZER" < "$f" > "$OUTPUT_DIR/tok/${f##*/}"
done
# speller
##################################################
# Runs spell.pl from $SPELLER_DIR over the tokenized files in $OUTPUT_DIR/tok.
# NOTE(review): the next stage reads from tok/checked, so spell.pl presumably
# writes its results there - confirm against spell.pl.
echo
echo "###"
echo
echo "Aplicando speller em $OUTPUT_DIR/tok"
# ${OUTPUT_DIR:?} aborts the script instead of touching /tok/checked when the
# variable is unexpectedly empty.
rm -rf -- "${OUTPUT_DIR:?}/tok/checked"
mkdir -- "$OUTPUT_DIR/tok/checked"
# Paths are quoted so glob characters in them are passed through literally.
# A failing speller is reported on stderr rather than ignored silently; the
# script still continues, as it did before.
perl "$SPELLER_DIR/spell.pl" -stat "$SPELLER_DIR/lexicos/regra+cb_freq.txt" -d "$OUTPUT_DIR/tok" \
  || echo "Aviso: o speller terminou com erro (status $?)" >&2
# acronym normaliser
##################################################
# Runs siglas_map.pl with the acronym lexicon on every regular file under
# $OUTPUT_DIR/tok/checked and stores its stdout in tok/checked/siglas/ under
# the same file name. The Perl script and the lexicon are resolved relative
# to the current working directory.
echo
echo "###"
echo
echo "Normalizando siglas em $OUTPUT_DIR/tok/checked/"
# ${OUTPUT_DIR:?} aborts the script instead of touching /tok/... when the
# variable is unexpectedly empty.
rm -rf -- "${OUTPUT_DIR:?}/tok/checked/siglas"
mkdir -- "$OUTPUT_DIR/tok/checked/siglas"
# The file list is collected completely (NUL-delimited) before the loop
# starts: the results are written into a subdirectory of the directory being
# scanned and must not be picked up as inputs. NUL delimiters keep names with
# spaces, newlines or glob characters intact.
siglas_inputs=()
while IFS= read -r -d '' f; do
  siglas_inputs+=("$f")
done < <(find "$OUTPUT_DIR/tok/checked" -type f -print0)
for f in "${siglas_inputs[@]}"; do
  # ${f##*/} is the file name without its directory part.
  perl ./siglas_map.pl ./resources/lexico_siglas.txt "$f" > "$OUTPUT_DIR/tok/checked/siglas/${f##*/}"
done
# internet-slang normaliser
##################################################
# Runs internetes_map.pl with its two lexicon files on every regular file
# under $OUTPUT_DIR/tok/checked/siglas and stores its stdout in
# tok/checked/siglas/internetes/ under the same file name. The Perl script
# and the lexicons are resolved relative to the current working directory.
echo
echo "###"
echo
echo "Normalizando internetes em $OUTPUT_DIR/tok/checked/siglas"
# ${OUTPUT_DIR:?} aborts the script instead of touching /tok/... when the
# variable is unexpectedly empty.
rm -rf -- "${OUTPUT_DIR:?}/tok/checked/siglas/internetes"
mkdir -- "$OUTPUT_DIR/tok/checked/siglas/internetes"
# The file list is collected completely (NUL-delimited) before the loop
# starts: the results are written into a subdirectory of the directory being
# scanned and must not be picked up as inputs. NUL delimiters keep names with
# spaces, newlines or glob characters intact.
internetes_inputs=()
while IFS= read -r -d '' f; do
  internetes_inputs+=("$f")
done < <(find "$OUTPUT_DIR/tok/checked/siglas" -type f -print0)
for f in "${internetes_inputs[@]}"; do
  # ${f##*/} is the file name without its directory part.
  perl ./internetes_map.pl ./resources/lexico_internetes.txt ./resources/lexico_internetes_sigl_abrv.txt "$f" \
    > "$OUTPUT_DIR/tok/checked/siglas/internetes/${f##*/}"
done
# proper-noun normaliser
##################################################
# Runs np_map.pl with the proper-noun lexicon on every regular file under
# $OUTPUT_DIR/tok/checked/siglas/internetes and stores its stdout in
# tok/checked/siglas/internetes/nomes/ under the same file name. The Perl
# script and the lexicon are resolved relative to the current working
# directory.
echo
echo "###"
echo
echo "Normalizando nomes proprios em $OUTPUT_DIR/tok/checked/siglas/internetes"
# ${OUTPUT_DIR:?} aborts the script instead of touching /tok/... when the
# variable is unexpectedly empty.
rm -rf -- "${OUTPUT_DIR:?}/tok/checked/siglas/internetes/nomes"
mkdir -- "$OUTPUT_DIR/tok/checked/siglas/internetes/nomes"
# The file list is collected completely (NUL-delimited) before the loop
# starts: the results are written into a subdirectory of the directory being
# scanned and must not be picked up as inputs. NUL delimiters keep names with
# spaces, newlines or glob characters intact.
nomes_inputs=()
while IFS= read -r -d '' f; do
  nomes_inputs+=("$f")
done < <(find "$OUTPUT_DIR/tok/checked/siglas/internetes" -type f -print0)
for f in "${nomes_inputs[@]}"; do
  # ${f##*/} is the file name without its directory part.
  perl ./np_map.pl ./resources/lexico_nome_proprio.txt "$f" \
    > "$OUTPUT_DIR/tok/checked/siglas/internetes/nomes/${f##*/}"
done
# upper case for words preceded by a full stop
##################################################
# Calls upper_periods.py once for every regular file under
# $OUTPUT_DIR/tok/checked/siglas/internetes/nomes, passing the file path as
# the only argument.
# NOTE(review): no output is redirected here, so the Python script presumably
# rewrites each file itself - confirm against upper_periods.py.
# The list is gathered NUL-delimited so names with spaces, newlines or glob
# characters are passed on unchanged.
upper_inputs=()
while IFS= read -r -d '' f; do
  upper_inputs+=("$f")
done < <(find "$OUTPUT_DIR/tok/checked/siglas/internetes/nomes" -type f -print0)
for f in "${upper_inputs[@]}"; do
  python ./upper_periods.py "$f"
done
# Put back the field separator that was saved before processing started.
IFS=$SAVEIFS