-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess-corpora.py
45 lines (33 loc) · 1.3 KB
/
preprocess-corpora.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import truecase
import nltk.data
from nltk.tokenize import word_tokenize
import sys
# Raw corpora read by main().
WABLIEFT_PATH = "data/wablieft-sents.txt"
DUTCHCORPUS_PATH = "data/dutchcorpus-sents.txt"
TESTDUTCHCORPUS_PATH = "data/dutchcorpus-sents-test.txt"

# Tokenized output written by main(), one file per corpus above.
WABLIEFT_OUTPUT_PATH = "data/wablieft-cleaned-sents.txt"
DUTCHCORPUS_OUTPUT_PATH = "data/dutchcorpus-cleaned-sents.txt"
TESTDUTCHCORPUS_OUTPUT_PATH = "data/dutchcorpus-test-cleaned.txt"
def tokenize_and_output(ifile, ofile, tokenizer):
    """Write the text of *ifile* to *ofile*, one tokenized sentence per line.

    Args:
        ifile: Iterable of text lines (e.g. an open text file).
        ofile: Writable text stream; receives each sentence as
            space-separated tokens followed by a newline.
        tokenizer: Sentence splitter exposing ``tokenize(text)``, which
            returns the sentences found in one input line.
    """
    for line in ifile:
        for sentence in tokenizer.tokenize(line):
            # ``tokenizer`` has already decided the sentence boundaries.
            # Without ``preserve_line=True`` NLTK would re-split the
            # sentence with its default English Punkt model, which can
            # disagree with the caller's tokenizer (e.g. on abbreviations).
            tokens = word_tokenize(sentence, preserve_line=True)
            ofile.write(" ".join(tokens) + "\n")
def main():
    """Tokenize every raw corpus file into its cleaned counterpart.

    Loads the Punkt sentence tokenizer stored at
    ``tokenizers/punkt/dutch.pickle`` once, then runs
    ``tokenize_and_output`` for each (input path, output path) pair.
    """
    sent_tokenizer = nltk.data.load('tokenizers/punkt/dutch.pickle')
    corpora = (
        (WABLIEFT_PATH, WABLIEFT_OUTPUT_PATH),
        (DUTCHCORPUS_PATH, DUTCHCORPUS_OUTPUT_PATH),
        (TESTDUTCHCORPUS_PATH, TESTDUTCHCORPUS_OUTPUT_PATH),
    )
    for input_path, output_path in corpora:
        # Open the input first so a missing corpus does not truncate an
        # existing output file; ``with`` closes both files even when
        # tokenization raises.
        # NOTE(review): both files still use the platform default encoding,
        # as before -- confirm the corpora's encoding and pin it if known.
        with open(input_path) as ifile, open(output_path, 'w') as ofile:
            tokenize_and_output(ifile, ofile, sent_tokenizer)
# Run the preprocessing only when executed as a script, not on import.
if __name__ == "__main__":
    main()