-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTest2.py
76 lines (63 loc) · 2.82 KB
/
Test2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# This file is part of UDPipe <http://github.com/ufal/udpipe/>.
#
# Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
# Mathematics and Physics, Charles University in Prague, Czech Republic.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import ufal.udpipe
# ufal.udpipe.Model etc. are SWIG-magic and cannot be detected by pylint
# pylint: disable=no-member
class Model:
    """Convenience wrapper around a loaded ``ufal.udpipe.Model``.

    Bundles the usual pipeline steps: turning text into sentences
    (``tokenize`` / ``read``), annotating them (``tag`` / ``parse``) and
    serializing them again (``write``).
    """

    def __init__(self, path):
        """Load the UDPipe model stored in the file ``path``.

        Raises:
            Exception: if ``ufal.udpipe.Model.load`` returns a falsy value.
        """
        self.model = ufal.udpipe.Model.load(path)
        if not self.model:
            raise Exception("Cannot load UDPipe model from file '%s'" % path)

    def tokenize(self, text):
        """Tokenize ``text`` and return a list of ufal.udpipe.Sentence-s.

        Raises:
            Exception: if the model provides no tokenizer, or if reading
                the sentences reports a processing error.
        """
        tokenizer = self.model.newTokenizer(self.model.DEFAULT)
        if not tokenizer:
            raise Exception("The model does not have a tokenizer")
        return self._read(text, tokenizer)

    def read(self, text, in_format):
        """Load ``text`` in the given format and return a list of ufal.udpipe.Sentence-s.

        ``in_format`` is the format name (conllu|horizontal|vertical).

        Raises:
            Exception: if no input format can be created for ``in_format``,
                or if reading the sentences reports a processing error.
        """
        input_format = ufal.udpipe.InputFormat.newInputFormat(in_format)
        if not input_format:
            raise Exception("Cannot create input format '%s'" % in_format)
        return self._read(text, input_format)

    def _read(self, text, input_format):
        """Feed ``text`` to ``input_format`` and collect every sentence it produces.

        ``input_format`` is any object offering ``setText`` and
        ``nextSentence`` (both ``tokenize`` and ``read`` pass one in).

        Raises:
            Exception: carrying ``error.message`` when the processing error
                object reports that an error occurred.
        """
        input_format.setText(text)
        error = ufal.udpipe.ProcessingError()
        sentences = []

        # nextSentence fills the object it is handed and that same object is
        # stored, so a fresh Sentence is allocated for every iteration.
        sentence = ufal.udpipe.Sentence()
        while input_format.nextSentence(sentence, error):
            sentences.append(sentence)
            sentence = ufal.udpipe.Sentence()

        # The loop ends both on exhausted input and on failure; tell them apart.
        if error.occurred():
            raise Exception(error.message)
        return sentences

    def tag(self, sentence):
        """Tag the given ufal.udpipe.Sentence (inplace) with default options."""
        self.model.tag(sentence, self.model.DEFAULT)

    def parse(self, sentence):
        """Parse the given ufal.udpipe.Sentence (inplace) with default options."""
        self.model.parse(sentence, self.model.DEFAULT)

    def write(self, sentences, out_format):
        """Serialize ``sentences`` in the required format and return the text.

        ``out_format`` is the format name (conllu|horizontal|vertical).

        Raises:
            Exception: if no output format can be created for ``out_format``.
        """
        output_format = ufal.udpipe.OutputFormat.newOutputFormat(out_format)
        # Mirror the check done in read(): without it an unknown format name
        # only shows up later as an attribute error on a missing object.
        if not output_format:
            raise Exception("Cannot create output format '%s'" % out_format)

        # Collect the pieces and join once instead of growing a string.
        # finishDocument() must still come after every writeSentence() call.
        parts = [output_format.writeSentence(sentence) for sentence in sentences]
        parts.append(output_format.finishDocument())
        return ''.join(parts)
# Example usage. Guarded so that importing this module does not load a model
# or print anything; the demo only runs when the file is executed directly.
if __name__ == "__main__":
    model = Model('UniversalPetrarch/preprocessing/udpipe-1.2.0/model/english-ud-2.0-170801.udpipe')
    sentences = model.tokenize("Hi there. How are you?")
    for s in sentences:
        print(s.getText())
        # Both calls annotate the sentence object in place.
        model.tag(s)
        model.parse(s)
    # Serialize the annotated sentences and show the result (the original
    # printed an undefined name here, which raised NameError).
    conllu = model.write(sentences, "conllu")
    print(conllu)