diff --git a/master/process_italianprep.py b/master/process_italianprep.py new file mode 100644 index 0000000..1b88bd7 --- /dev/null +++ b/master/process_italianprep.py @@ -0,0 +1,40 @@ +import sys +import pymorphit_cls +import csv +import re + +DEBUG = True + + +if __name__ == '__main__': + filename = 'italian2.prep.tab' + lemmatizer = pymorphit_cls.PyMorphITCLS() + + out = open('italian2.prep.withlemmatized.tab', 'wt') + + with open(filename, 'r') as f: + reader = csv.reader(f, dialect='excel-tab') + + for i, line in enumerate(reader): + italian_line = line[3] + + if i == 0: + outheaderlist = line + outheaderlist.insert(4, 'lemmatized') + outheaderline = '\t'.join(outheaderlist).strip() + '\t' + out.write(outheaderline) + else: + italian_line = re.sub(r'[\W]+', ' ', italian_line) + lemmatized_line = lemmatizer.lemmatize_line(italian_line, mode='Q') + + outlist = line + outlist.insert(4, lemmatized_line) + outline = '\t'.join(outlist).strip() + '\n' + out.write(outline) + if DEBUG: + print(italian_line) + print('') + print(lemmatized_line) + print('---') + out.close() + print('Done') \ No newline at end of file diff --git a/master/pymorphit.py b/master/pymorphit.py index 7e5aa1e..257221b 100644 --- a/master/pymorphit.py +++ b/master/pymorphit.py @@ -214,4 +214,4 @@ def lemmatize(lemmaPrec, lessema, succ): if len(lemmabile)>0: lemmaPrec = lemmabile lemmabile = lemmatize(lemmaPrec, t[1], succ) - widx += 1 \ No newline at end of file +widx += 1 diff --git a/master/pymorphit_cls.py b/master/pymorphit_cls.py new file mode 100644 index 0000000..e5d9fda --- /dev/null +++ b/master/pymorphit_cls.py @@ -0,0 +1,346 @@ +# coding: utf-8 + +# by G. De Gasperis and I. Grappasonno, UnivAQ http://www.univaq.it +# adapted by: H. de Vos: hdvos93[at]gmail.com + + +# Packages for technical support +import re +import codecs +import json +import os + +# Packages for linguistic support +from string import punctuation + + +class PyMorphITCLS(object): + def __init__(self): + self.UNKOWN = u'?' + self.DOUBT = u'!' + self.DRULES_FILE = 'drules.json' + self.DEBUG = False + + self.dMorphit = {} + self.dRules = {} + self.catTree = {} + + print('initialize models...', flush=True) + self.initialize_models() + print('initializing done...', flush=True) + + # Initialisation funcs -------------------------------------- + + def catChoice(self): + ''' + Function for the Guided Mode ('G') To choose a category. + :return: None + ''' + global catS + catL = list(catS) + ci = 1 + for i, c in enumerate(catL): + print(i + 1, c) + + a = int(input('Quale? :')) + if (a > 0) and (a <= len(catL)): + return catL[a] + else: + return 'J:j' + + def addCatTree(self, catK, catV): + """ + Function to add category to the CatTree. + :param catK: + :param catV: + :return: None + """ + + if not catK in self.catTree.keys(): + self.catTree[catK] = set([]) + self.catTree[catK] = self.catTree[catK].union(self.catTree[catK], set(catV)) + + + def initialize_models(self): + """ + Initializes (loads) the models into the appropriate data structures. + :return: None + """ + + if os.path.isfile(self.DRULES_FILE): + self.dRules = json.loads(open(self.DRULES_FILE, 'r').read()) + + fm = codecs.open('morph-it_048_utf8.txt', 'r', encoding='utf-8') + + for line in fm: + lst = re.split(' |\t', line.strip()) + cat = lst[2] + + if cat.find('-') > 0: + catL2 = cat.split('-') + self.addCatTree(catL2[0], catL2[1:]) + elif cat.find(':') > 0: + [catL1l, catL1r] = cat.split(':') + if catL1l.find('-') > 0: + [catL2l, catL2r] = catL1l.split('-') + self.addCatTree(catL2l, catL2r) + else: + self.addCatTree(catL1l, '') + else: + self.addCatTree(cat, '') + try: + if lst[0] in self.dMorphit.keys(): + self.dMorphit[lst[0]] += [(lst[1], lst[2])] + else: + self.dMorphit[lst[0]] = [(lst[1], lst[2])] + except: + pass + + # Tokenization funcs -------------------------------------------------------- + + def makeTupleList(self, lx): + """ + Makes proper list of tuples of a tokenized line. + :param lx: or A tokenized line + :return: A list of tuples of form [..., (tokentype, lexeme), ...]. Token type has values like PUNT (punctiation) or LESSEMA (lexeme) + """ + + out = [] + for e in lx: + if type(e) == tuple: + out.append(e) + if type(e) == list: + out += self.makeTupleList(e) + else: + if self.DEBUG: + print('?????') + return out + + def tokenize(self, line): + scanner = re.Scanner([ + (r"[0-9]+", lambda scanner, token: ("DET-NUM", token)), + (r"[A-Z]*[a-z]+[\w]*", lambda scanner, token: ("LESSEMA", token)), + (r"[\w]+", lambda scanner, token: ("LESSEMA", token)), + (r"[!.?]+", lambda scanner, token: ("PUNT_FIN", token)), + (r"[,;:]+", lambda scanner, token: ("PUNT", token)), + (r"\s+", None), # None == skip token. + ], re.UNICODE) + out0 = scanner.scan(line) + + return self.makeTupleList(out0) + + # Lemmatization funcs -------------------------------------------------------- + + def RomanTranslate(self, s): + ''' + Translates a roman number to an integer in the arabic system. + :param s: Roman Number + :return: Decimal equivalent of the roman number + ''' + string = s.upper() + values = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "M": 1000} + try: + return sum(map(lambda x: values[x], string)) + except: + return '' + + def isNumber(self, token): + """ + Checks whether a certain string is a number. + :param token: + :return: True if token is a number, False if token is not a number + """ + out = False + if len(token) > 0: + try: + f = float(token) + out = True + except ValueError: + if self.isNumber(str(self.RomanTranslate(token))): + out = True + pass + return out + + def learnLemma(self, lemmaPrec, lessema, succ): + """ + Learns a new lemma pattern and writes it to dMorphit. First it tries automatically to find pattern and otherwise it asks the user for input. + :param lemmaPrec: : tuple containing (preceding lemma, POS-tag) + :param lessema: the target lexeme and type of the lexeme. (type , target word ) + :param succ: . Next word. unlemmatized word succeeding the target word. + :return: . The learnt pattern or an empty string ('') + """ + ltupla = str((lemmaPrec[1], lessema, succ[1])) + if ltupla in self.dRules.keys(): + c = self.dRules[ltupla] + print('regola:', ltupla, c) + return c + + else: + print('NUOVA REGOLA!') + print('Contesto: [..', lemmaPrec[0], lessema, succ[0], '..]') + print('\t\t', '<' + lemmaPrec[1] + '>', lessema, '<' + succ[1] + '>') + print(0, 'Save and Exit') + lemmi = self.dMorphit[lessema] + for m in range(1, len(lemmi) + 1): + print(m, lemmi[m - 1]) + pass + print(-1, 'Categoria generica') + a = int(input('Quale? :')) + + if a > 0: + out = (lemmi[a - 1][0], lemmi[a - 1][1]) + self.dRules[ltupla] = out + return out + else: + if a == -1: + return (lessema, self.UNKOWN) + return '' + + def makeLemma(self, lemmaPrec, lessema, succ): + """ + Learns new lemma and writes it to the rules file. + :param lemmaPrec: : tuple containing (preceding lemma, POS-tag) + :param lessema: the target lexeme and type of the lexeme. (type , target word ) + :param succ: . Next word. unlemmatized word succeeding the target word. + :return: the learnt lemma or an empty string if no lemma has been learnt. + """ + out = self.learnLemma(lemmaPrec, lessema, succ) + if out == '': + open(self.DRULES_FILE, 'w').write(json.dumps(self.dRules)) + exit(0) + open(self.DRULES_FILE, 'w').write(json.dumps(self.dRules)) + + return out + + def hasLemma(self, lessema): + """ + Checks if lemma exists in the database. + :param lessema: lexeme + :return: True if lexeme in data base. False if lexeme not in data base. + """ + + return lessema in self.dMorphit.keys() + + def getLemma(self, lemmaPrec, lessema, succ, mode='G'): + """ + Retreives the lemma for a given lexeme. + :param lemmaPrec: : tuple containing (preceding lemma, POS-tag) + :param lessema: the target lexeme and type of the lexeme. (type , target word ) + :param succ: . Next word. unlemmatized word succeeding the target word. + :param mode: Either 'G' (user guided) or 'Q' (Quick). 'G'- mode is the mode as designed by De Gasperis and I. Grappasonno. + The Q mode is a quick mode for if no user guidance is required. In this case, if the system is uncertain it wil return the original lexeme. + The Q method is quicker as no human input is needed but also less accurate. + :return: (found lemma, POS-tag) + """ + + lemmi = self.dMorphit[lessema] + out = lemmi[0] + if len(lemmi) > 1: + + if lemmaPrec != self.UNKOWN and mode == 'G': + succLemma = self.lemmatize(self.UNKOWN, succ, self.UNKOWN) + out = self.makeLemma(lemmaPrec, lessema, succLemma) # usa regole ed euristiche per determinare il lemma + elif lemmaPrec != self.UNKOWN and mode == 'Q': + out = (lemmi[0][0], self.DOUBT) + else: + out = (lemmi[0][0], self.UNKOWN) + + return out + + def lemmatize(self, lemmaPrec, lessema, succ, mode='G'): + ''' + Lemmatize the target word. + :param lemmaPrec: (lemma, POS-tag) lemmatized word preceding to target word + :param lessema: (type ('LESSEMA (lexeme) or PUNT (Punctuation)), target word) + :param succ: <>unlemmatized word succeeding the target word. + :return: (lemma, POS-tag) + ''' + + out = u'' + if type(lessema) == tuple: + lessema = lessema[1] + + if self.DEBUG: + print(lessema, '...', ) + + if len(lessema) > 0: + if self.hasLemma(lessema): + out = self.getLemma(lemmaPrec, lessema, succ, mode) + elif self.hasLemma(lessema.lower()): + out = self.getLemma(lemmaPrec, lessema.lower(), succ, mode) + elif self.hasLemma(lessema.capitalize()): + out = self.getLemma(lemmaPrec, lessema.capitalize(), succ, mode) + elif self.isNumber(lessema): + out = (u'X', u'DET-NUM') + else: + out = (lessema, self.UNKOWN) + + if lemmaPrec != self.UNKOWN and self.DEBUG: + print('\t-->\t', out) + + return out + + + def lemmatize_line(self, line, mode='G'): + """ + Tokenizes and then Lemmatizes a single line of text. + :param line: a line of text. + :param mode: Either 'G' (user guided) or 'Q' (Quick). 'G'- mode is the mode as designed by De Gasperis and I. Grappasonno. + The Q mode is a quick mode for if no user guidance is required. In this case, if the system is uncertain it wil return the original lexeme. + The Q method is quicker as no human input is needed but also less accurate. + :return: the same line of text, but lemmatized. + """ + lemmaPrec = u'[]' + lemmabile = lemmaPrec + + + lemmalist = [] + tl = self.tokenize(line.strip()) + + for widx, t in enumerate(tl): + if t[0] == 'LESSEMA': + succ = '[]' + if widx < len(tl) - 1: + succ = tl[widx + 1] + + if len(lemmabile) > 0: + lemmaPrec = lemmabile + + lemmabile = self.lemmatize(lemmaPrec, t[1], succ, mode) + + if self.DEBUG: + print('lemma: ', lemmabile[0]) + + lemmalist.append(lemmabile[0]) + + linestr = '{}'.format(' '.join(lemmalist)) + + return linestr + + def lemmatize_file(self, filename='collodi_pinocchio_utf8.txt', mode='G'): + """ + Takes a filename and lemmatizes the file. It writes the lemmatized file to a new file named '.lemmatized.txt' + :param filename: name of the input file that needs to be lemmatized. + :param mode: Either 'G' (user guided) or 'Q' (Quick). 'G'- mode is the mode as designed by De Gasperis and I. Grappasonno. + The Q mode is a quick mode for if no user guidance is required. In this case, if the system is uncertain it wil return the original lexeme. + The Q method is quicker as no human input is needed but also less accurate. + :return: None + """ + outfile = ''.join(filename.split('.')[:-1]) + '.lemmatized.txt' + + out = open(outfile, 'wt') + + with codecs.open(filename, 'r', 'utf-8') as fc: + for line in fc: + lemmatized_line = self.lemmatize_line(line, mode) + + out.write(lemmatized_line) + + out.close() + print('Lemmatized file saved as {}'.format(outfile)) + + +if __name__ == '__main__': + lemmatizer = PyMorphITCLS() + lemmatizer.DEBUG = True + lemmatizer.lemmatize_file(mode='Q') diff --git a/master/pymorphit_py3_6.py b/master/pymorphit_py3_6.py new file mode 100644 index 0000000..c887372 --- /dev/null +++ b/master/pymorphit_py3_6.py @@ -0,0 +1,294 @@ +# coding: utf-8 + +# by G. De Gasperis and I. Grappasonno, UnivAQ http://www.univaq.it +# adapted by: H. de Vos + +#package for debug +import time + +#Packages for technical support +import re +import codecs +import json +import os +import sys + +#Packages for linguistic support +from string import punctuation + +# Globals +UNKOWN = u'?' + +# Initialisation funcs -------------------------------------- + +def catChoice(): + global catS + catL = list(catS) + ci = 1 + for i, c in enumerate(catL): + print(i+1, c) + ci += 1 + ci += 1 + a = int(input('Quale? :')) + if (a > 0) and (a <= len(catL)): + return catL[a] + else: + return 'J:j' + + +def addCatTree(catK, catV): + global catTree + if not catK in catTree.keys(): + catTree[catK] = set([]) + catTree[catK] = catTree[catK].union(catTree[catK], set(catV)) + pass + + +def makeTupleList(lx): + out = [] + for e in lx: + if type(e) == type((0, 0)): + out.append(e) + elif type(e) == type([]): + out += makeTupleList(e) + else: + print('?????') + return out + +# Tokenization funcs -------------------------------------------------------- + +def tokenize(line): + scanner = re.Scanner([ + (r"[0-9]+", lambda scanner, token: ("DET-NUM", token)), + (r"[A-Z]*[a-z]+[àèéìòù]*", lambda scanner, token: ("LESSEMA", token)), + (r"[!.?]+", lambda scanner, token: ("PUNT_FIN", token)), + (r"[,;:]+", lambda scanner, token: ("PUNT", token)), + (r"\s+", None), # None == skip token. + ]) + out0 = scanner.scan(line) + return makeTupleList(out0) + + +# Lemmatization funcs -------------------------------------------------------- + +def RomanTranslate(s): + ''' + + :param s: Roman Number + :return: Decimal equivalent of the roman number + ''' + string = s.upper() + values = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "M": 1000} + try: + return sum(map(lambda x: values[x], string)) + except: + return '' + + +def isNumber(token): + out = False + if len(token) > 0: + try: + f = float(token) + out = True + except ValueError: + if isNumber(str(RomanTranslate(token))): + out = True + pass + return out + +''' +I am not sure what this part does. +However, it works perfectly fine without. So I commented it out. + +def log(s): + if type(s) in [type(u''), type('')]: + return s.encode('utf-8') + else: + out = '' + for z in s: + out += '\t' + log(str(z).encode('utf-8')) + return out +''' + +def learnLemma(lemmaPrec, lessema, succ): + global dMorphit, dRules + ltupla = str((lemmaPrec[1], lessema, succ[1])) + if ltupla in dRules.keys(): + c = dRules[ltupla] + print ('regola:', ltupla, c) + return c + + else: + print('NUOVA REGOLA!') + print ('Contesto: [..', lemmaPrec[0], lessema, succ[0],'..]') + print ('\t\t', '<'+lemmaPrec[1]+'>', lessema, '<'+succ[1]+'>') + print(0, 'Save and Exit') + lemmi = dMorphit[lessema] + for m in range(1, len(lemmi) + 1): + print (m, lemmi[m-1]) + pass + print(-1, 'Categoria generica') + a = int(input('Quale? :')) + + if a > 0: + out = (lemmi[a - 1][0], lemmi[a - 1][1]) + dRules[ltupla] = out + return out + else: + if a == -1: + return (lessema, UNKOWN) + return '' + + +def makeLemma(lemmaPrec, lessema, succ): + out = learnLemma(lemmaPrec, lessema, succ) + if out == '': + open(DRULES_FILE, 'w').write(json.dumps(dRules)) + exit(0) + return out + + +def hasLemma(lessema): + + global dMorphit + return lessema in dMorphit.keys() + + +def getLemma(lemmaPrec, lessema, succ): + + out = u'' #<-- redundant. + lemmi = dMorphit[lessema] + out = lemmi[0] + if len(lemmi) > 1: + # ambiguità + if lemmaPrec != UNKOWN: + succLemma = lemmatize(UNKOWN, succ, UNKOWN) + out = makeLemma(lemmaPrec, lessema, succLemma) # usa regole ed euristiche per determinare il lemma + else: + out = (lemmi[0][0], UNKOWN) + return out + + +def lemmatize(lemmaPrec, lessema, succ): + ''' + Lemmatize the target word. + :param lemmaPrec: (lemma, POS-tag) lemmatized word preceding to target word + :param lessema: (type ('LESSEMA (lexeme) or PUNT (Punctuation)), target word) + :param succ: <>unlemmatized word succeeding the target word. + :return: (lemma, POS-tag) + ''' + + out = u'' + if type(lessema) == tuple: + lessema = lessema[1] + + print (lessema, '...',) + if len(lessema) > 0: + if hasLemma(lessema): + out = getLemma(lemmaPrec, lessema, succ) + elif hasLemma(lessema.lower()): + out = getLemma(lemmaPrec, lessema.lower(), succ) + elif hasLemma(lessema.capitalize()): + out = getLemma(lemmaPrec, lessema.capitalize(), succ) + elif isNumber(lessema): + out = (u'X', u'DET-NUM') + else: + out = (lessema, UNKOWN) + if lemmaPrec != UNKOWN: + print ('\t-->\t', out) + return out + + +# ------------ MAIN ----------------- +if __name__ == '__main__': + + #INFILE = sys.argv[1] #<-- uncomment after debugging + INFILE = 'collodi_pinocchio_utf8.txt' #<-- comment after debugging + + tl = tokenize(u'Il turista che andò al mare, e mai più ci Tornerà.') + print(tl) + pass + myPuntuaction = punctuation + myPuntuaction = myPuntuaction.replace('-', '') + r = re.compile(r'[\s{}]+'.format(re.escape(myPuntuaction))) + + dMorphit = {} + dRules = {} + catTree = {} + + DRULES_FILE = 'drules.json' + if os.path.isfile(DRULES_FILE): + dRules = json.loads(open(DRULES_FILE, 'r').read()) + + # DMORPHIT_FILE = 'dmorphit.json' + # if os.path.isfile(DMORPHIT_FILE): + # dMorphit = json.loads(open(DMORPHIT_FILE,'r').read()) + # else: + + fm = codecs.open('morph-it_048_utf8.txt', 'r', encoding='utf-8') + + for line in fm: + lst = re.split(' |\t', line.strip()) + cat = lst[2] + + if cat.find('-') > 0: + catL2 = cat.split('-') + addCatTree(catL2[0], catL2[1:]) + elif cat.find(':') > 0: + [catL1l, catL1r] = cat.split(':') + if catL1l.find('-') > 0: + [catL2l, catL2r] = catL1l.split('-') + addCatTree(catL2l, catL2r) + else: + addCatTree(catL1l, '') + else: + addCatTree(cat, '') + try: + if lst[0] in dMorphit.keys(): + dMorphit[lst[0]] += [(lst[1], lst[2])] + else: + dMorphit[lst[0]] = [(lst[1], lst[2])] + except: + pass + # open(DMORPHIT_FILE,'w').write(json.dumps(dMorphit)) + + print('initiation done...', flush=True) + time.sleep(5) + + with codecs.open(INFILE, 'r', 'utf-8') as fc: + lemmaPrec = u'[]' + lemmabile = lemmaPrec + succ = u'' + + for line in fc: + + tl = tokenize(line.strip()) + + widx = 0 + + for widx2, t in enumerate(tl): + + if t[0] == 'LESSEMA': + succ = '[]' + if widx < len(tl) - 1: + succ = tl[widx2 + 1] + + if len(lemmabile) > 0: + lemmaPrec = lemmabile + + + print('-------------------') + print(tl) + + print('lemma:', lemmabile) + + print('lemmaprec:', lemmaPrec) + print('t1:', t[1]) + print('succ:', succ) + + + lemmabile = lemmatize(lemmaPrec, t[1], succ) + widx += 1 + + print('-------------------')