diff --git a/corpus/udpipemodel/english-ewt-ud-2.5-191206.udpipe b/corpus/udpipemodel/english-ewt-ud-2.5-191206.udpipe
new file mode 100644
index 0000000..7f16e14
Binary files /dev/null and b/corpus/udpipemodel/english-ewt-ud-2.5-191206.udpipe differ
diff --git a/requirements.txt b/requirements.txt
index 523768a..39d6d74 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,11 +1,11 @@
-requests==2.25.1
-nltk==3.5
-numpy~=1.19.2
-beautifulsoup4==4.9.3
-corpy==0.3.0
-Flask==1.1.2
-gensim==3.8.3
-pymysql==1.0.2
-pysolr-3.9.0
-mysql~=5.7.24
+requests==2.25.1
+nltk==3.5
+numpy~=1.19.2
+beautifulsoup4==4.9.3
+corpy==0.3.0
+Flask==1.1.2
+gensim==3.8.3
+pymysql==1.0.2
+pysolr==3.9.0
+mysql~=5.7.24
scikit-learn~=0.24.1
\ No newline at end of file
diff --git a/src/app.py b/src/app.py
index ded21ae..d5f4461 100644
--- a/src/app.py
+++ b/src/app.py
@@ -8,13 +8,11 @@
from src.service import AppService
from flask import Flask, render_template, request, redirect, url_for, flash
-
app = Flask(__name__)
# TODO: re-initialise when a different language is selected
appService = AppService()
-
@app.route('/')
def index():
"""
@@ -23,7 +21,6 @@ def index():
"""
return render_template('index.html')
-
@app.route('/find', methods=['POST'])
def find():
"""
@@ -45,7 +42,6 @@ def find():
"sel_word": sel_word,
"sel_result": appService.sel_result})
-
@app.route('/find2', methods=['POST'])
def find2():
language_name, sel_word = None, None
@@ -59,7 +55,6 @@ def find2():
"sel_word": sel_word,
"sel_result": appService.sel_result})
-
@app.route('/cluster', methods=['POST'])
def cluster():
"""
@@ -76,13 +71,8 @@ def cluster():
if not appService.udt_pre_model:
appService.config_udpipe(language_name)
cluster_model_file = word2vec_language[language_name]
- cluster_result, rec_cluster_result = appService.cluster_sentences(
- language_name, cluster_model_file, cluster_input_sentence, cluster_number)
- return render_template('cluster.html',
- cluster_number=cluster_number,
- cluster_result=cluster_result,
- rec_cluster_result=rec_cluster_result)
-
+    cluster_result, rec_cluster_result = appService.cluster_sentences(language_name, cluster_model_file, cluster_input_sentence, cluster_number)
+    return render_template('cluster.html', cluster_number=cluster_number, cluster_result=cluster_result, rec_cluster_result=rec_cluster_result)
if __name__ == '__main__':
- app.run(port=3000, debug=True)
\ No newline at end of file
+ app.run(port=3000, debug=True)
diff --git a/src/databaseClustering.py b/src/databaseClustering.py
index 34a29ae..57545f4 100644
--- a/src/databaseClustering.py
+++ b/src/databaseClustering.py
@@ -1,4 +1,4 @@
-import mysql.connector
+import mysql.connector  # pymysql has no 'connector' submodule; this module uses mysql.connector
from mysql.connector import errorcode
from datetime import datetime
import pandas as pd
@@ -11,32 +11,44 @@
from util import db_config
-
def train_model(language_name, corpus_path, save_path):
-
- model = gensim.models.Word2Vec(sentences=corpus_path,
- size=150,
- window=8,
- min_count=2,
- workers=2,
- iter=10)
- model.save(save_path + language_name)
- print('Save succeed')
+ model = gensim.models.Word2Vec(sentences=corpus_path,
+ size=150,
+ window=8,
+ min_count=2,
+ workers=2,
+ iter=10)
+ model.save(save_path + language_name)
+    print('Save succeeded')
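+
+# Hypothetical usage sketch: corpus_path must be an iterable of token lists
+# (the gensim 3.x API used here takes size/iter; gensim 4 renamed these to
+# vector_size/epochs). The toy corpus below is repeated so every token
+# survives min_count=2:
+#   toy_corpus = [['the', 'quick', 'fox'], ['the', 'lazy', 'dog']] * 2
+#   train_model('english', toy_corpus, './corpus/word2vecmodel/')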
def load_model(save_path) -> gensim.models.Word2Vec:
- filename = save_path
- model = gensim.models.Word2Vec.load(filename)
- print('Loading succeed')
- for index, word in enumerate(model.wv.index2word):
- if index == 5:
- break
- vec = ",".join(map(lambda i: str(i), model.wv[word]))
- print(f"word #{index}/{len(model.wv.index2word)} is {word}, vec = {vec}")
- return model
-
-def database():
-
+ filename = save_path
+ model = gensim.models.Word2Vec.load(filename)
+    print('Loading succeeded')
+ for index, word in enumerate(model.wv.index2word):
+ if index == 5:
+ break
+ vec = ",".join(map(lambda i: str(i), model.wv[word]))
+ print(f"word #{index}/{len(model.wv.index2word)} is {word}, vec = {vec}")
+ return model
+
+
+# def database():
+# db = mysql.connector.connect(
+# host='localhost',
+# user='root',
+# password='root',
+# database='psd_project'
+# )
+# mycursor = db.cursor()
+# query_info = ("SELECT sentence FROM english_sentences")
+# mycursor.execute(query_info)
+# sentences_df = pd.DataFrame(mycursor.fetchall(), columns=['Sentences'])
+
+# return sentences_df
+
+# Either the commented-out block above (lines 37-48) or the block below (lines 52-63) is needed, not both.
+def database():
db = mysql.connector.connect(
-    host=db_config['host'],
+    host=db_config['db_host'],
user=db_config['user'],
@@ -50,76 +62,80 @@ def database():
return sentences_df
+
def textProcessing(text):
- no_stop =[words for words in text.split() if words.lower() not in string.punctuation]
+ no_stop = [words for words in text.split() if words.lower() not in string.punctuation]
return no_stop
-def cluster_sentences(language_name: str, save_path: str, sentences: List[str], n_clusters: int) :
-
- n_clusters = int(n_clusters)
- print("clusters are ",n_clusters)
- if n_clusters <=0:
- print("Parameter is Invalid")
- return
- if n_clusters > len(sentences):
- # TODO add log
- print('number of cluster bigger than sentences count')
- return
- # first loading model
- word2vec_model = load_model(save_path)
- # second geting vectors for one sentence
- sent_vectors = []
- default_dimn = 100
- # iterator to sentence
- for word1 in sentences:
- print(word1)
- word_vectors = []
- for words in word1:
-
- if words in word2vec_model.wv:
- word_vectors.append(word2vec_model.wv[words])
- else: # not in dict, fill 0
- word_vectors.append([0] * default_dimn)
-
- to_array = np.array(word_vectors)
- sent_vectors.append(to_array.mean(axis=0).tolist())
- kmeans = KMeans(n_clusters=n_clusters,random_state=0).fit(sent_vectors)
- labels = kmeans.labels_
- tmp_labels,examples = [],[]
- for sent,label in zip(sentences,labels):
- if label not in tmp_labels:
- tmp_labels.append(label)
- examples.append(sent)
- if len(examples) == n_clusters:
- break
- # add bottom logic for cluster
- if len(examples) < n_clusters:
- for sent in sentences:
- if sent not in examples:
- examples.append(sent)
- if len(examples) >= n_clusters:
- break
-
- return examples
+
+def cluster_sentences(language_name: str, save_path: str, sentences: List[str], n_clusters: int):
+    n_clusters = int(n_clusters)
+    print("number of clusters:", n_clusters)
+    if n_clusters <= 0:
+        print("invalid parameter: n_clusters must be positive")
+        return
+    if n_clusters > len(sentences):
+        # TODO add log
+        print('number of clusters is bigger than the sentence count')
+        return
+    # first, load the word2vec model
+    word2vec_model = load_model(save_path)
+    # second, build one averaged vector per sentence
+    sent_vectors = []
+    # match the model's dimensionality so OOV zero-vectors average cleanly
+    default_dimn = word2vec_model.wv.vector_size
+    # iterate over the sentences
+    for sentence_words in sentences:
+        print(sentence_words)
+        word_vectors = []
+        for word in sentence_words:
+            if word in word2vec_model.wv:
+                word_vectors.append(word2vec_model.wv[word])
+            else:  # not in the vocabulary, fill with zeros
+                word_vectors.append([0] * default_dimn)
+        to_array = np.array(word_vectors)
+        sent_vectors.append(to_array.mean(axis=0).tolist())
+ kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(sent_vectors)
+ labels = kmeans.labels_
+ tmp_labels, examples = [], []
+ for sent, label in zip(sentences, labels):
+ if label not in tmp_labels:
+ tmp_labels.append(label)
+ examples.append(sent)
+ if len(examples) == n_clusters:
+ break
+    # fallback: pad with not-yet-used sentences if some clusters produced no example
+ if len(examples) < n_clusters:
+ for sent in sentences:
+ if sent not in examples:
+ examples.append(sent)
+ if len(examples) >= n_clusters:
+ break
+
+ return examples
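+
+# Hypothetical call: sentences are token lists (as produced by textProcessing
+# below), and one representative sentence per KMeans cluster is returned:
+#   examples = cluster_sentences('english', './corpus/word2vecmodel/english',
+#                                [['good', 'dog'], ['bad', 'cat'], ['fast', 'car']], 2)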
a = database()
+
+# file_path = r'C:\Users\haris\Desktop\wordFinder\word2vec'
+# file_path = file_path + 'English'
+
file_path = './corpus/word2vecmodel/'
language_name = 'english'
file_path = file_path + language_name
load_model(file_path)
print('All done')
-c=a['Sentences'].apply(textProcessing)
+c = a['Sentences'].apply(textProcessing)
# get word vector for one sentence
sentences = [
- 'Tohru shows great loyalty to whoever he stands by, even back to the time when he was an Enforcer for the Dark Hand.',
- 'The Earth Demon, Dai Gui resembles a large minotaur(with the face of a guardian lion) with great strength.',
- 'Al Mulock was the great-grandson of Sir William Mulock(1843–1944), the former Canadian Postmaster - General.',
- 'Though his surviving images are scarce, his importance to the early history of photography in Asia is great.']
+ 'Tohru shows great loyalty to whoever he stands by, even back to the time when he was an Enforcer for the Dark Hand.',
+ 'The Earth Demon, Dai Gui resembles a large minotaur(with the face of a guardian lion) with great strength.',
+ 'Al Mulock was the great-grandson of Sir William Mulock(1843–1944), the former Canadian Postmaster - General.',
+ 'Though his surviving images are scarce, his importance to the early history of photography in Asia is great.']
cluster_result = cluster_sentences(language_name, file_path, c, 3)
print("example sentences:\n")
print(cluster_result)
-
diff --git a/src/service.py b/src/service.py
index 7b4e6bd..e826570 100644
--- a/src/service.py
+++ b/src/service.py
@@ -80,7 +80,7 @@ def database(self):
-                                    db_config['host'],
-                                    db_config['database'])
+                                    db_config['db_host'],
+                                    db_config['db_name'])
self.cursor = self.store_data.db_connect().cursor()
- query_info = "SELECT sentence FROM english_sentences"
+ query_info = "SELECT sentence FROM English_sentences"
self.cursor.execute(query_info)
sentences_df = pd.DataFrame(self.cursor.fetchall(), columns=['Sentences'])
return sentences_df
@@ -91,7 +91,7 @@ def clusteringData(self):
-                                    db_config['host'],
-                                    db_config['database'])
+                                    db_config['db_host'],
+                                    db_config['db_name'])
self.cursor = self.store_data.db_connect().cursor()
- query_info = "SELECT sentence FROM english_sentences"
+ query_info = "SELECT sentence FROM English_sentences"
self.cursor.execute(query_info)
sentences_dataframe = pd.DataFrame(self.cursor.fetchall(), columns=['Sentences'])
return sentences_dataframe
@@ -130,7 +130,7 @@ def cluster_sentences(self, language_name: str, save_path: str, sentences: List[
words = self.udt_pre_model.word_segmentation(sent)
word_vectors = []
# iterator to word
- window_words = get_keyword_window(self.sel_result[0][0], words, 5)
+ window_words = get_keyword_window(self.sel_result[0][0], words, 10)
for word in window_words:
if word in word2vec_model.wv:
word_vectors.append(word2vec_model.wv[word])
@@ -210,13 +210,18 @@ def _get_examples(self, sentences: List[str], best_labels, n_clusters: int):
]
save_path = './/corpus//english//'
# first loading udpipe to segement word for each sentence
+ # udt_english = UdpipeTrain(language_list[1],
+ # r'C:\Users\haris\Desktop\wordFinder\english-ewt-ud-2.5-191206.udpipe',
+ # r'C:\Users\haris\Desktop\wordFinder\haris.txt')
+
udt_english = UdpipeTrain(language_list[1],
r'.//corpus//udpipemodel//english.udpipe',
r'.//corpus//english//135-0.txt')
- cluster_result = AppService().config_udpipe(language_name).cluster_sentences(language_name, save_path, sentences='3', n_clusters=2)
- '''
+    # cluster_result = AppService().config_udpipe(language_name).cluster_sentences(language_name, save_path, sentences='3', n_clusters=2)
+ # '''
+
cluster_result = AppService().config_udpipe(language_name).cluster_sentences(language_name, sentences, 2)
print("two examples sentences: \n")
print(cluster_result)
diff --git a/src/templates/cluster.html b/src/templates/cluster.html
index e88a65f..d4ec5b9 100644
--- a/src/templates/cluster.html
+++ b/src/templates/cluster.html
@@ -31,6 +31,7 @@
Well done!
After clustering, you get {{cluster_number}} example
{% for cluster_sentence in cluster_result %}
{{cluster_sentence}}
+        {# TODO: add KWIC functionality #}
{% endfor %}
{% endif %}
diff --git a/src/train/KWIC.py b/src/train/KWIC.py
new file mode 100644
index 0000000..bf8abf0
--- /dev/null
+++ b/src/train/KWIC.py
@@ -0,0 +1,244 @@
+def getNGrams(wordlist, n):
+ return [wordlist[i:i + n] for i in range(len(wordlist) - (n - 1))]
+
+# Given a list of n-grams, return a dictionary of KWICs,
+# indexed by keyword.
+
+def nGramsToKWICDict(ngrams):
+ keyindex = len(ngrams[0]) // 2
+
+ kwicdict = {}
+
+ for k in ngrams:
+ if k[keyindex] not in kwicdict:
+ kwicdict[k[keyindex]] = [k]
+ else:
+ kwicdict[k[keyindex]].append(k)
+ return kwicdict
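+
+# Illustrative doctest-style sketch (made-up input, not from the app corpus):
+#   >>> grams = getNGrams("the quick brown fox jumps".split(), 3)
+#   >>> nGramsToKWICDict(grams)["brown"]
+#   [['quick', 'brown', 'fox']]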
+
+
+# Given a KWIC, return a string that is formatted for
+# pretty printing.
+
+def prettyPrintKWIC(kwic):
+ n = len(kwic)
+ keyindex = n // 2
+ width = 10
+
+ outstring = ' '.join(kwic[:keyindex]).rjust(width * keyindex)
+ outstring += str(kwic[keyindex]).center(len(kwic[keyindex]) + 6)
+ outstring += ' '.join(kwic[(keyindex + 1):])
+
+ return outstring
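+
+# For example, prettyPrintKWIC(['quick', 'brown', 'fox']) right-justifies the
+# left context into width * keyindex columns, pads the keyword with three
+# spaces on each side, and appends the right context:
+#   '     quick   brown   fox'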
+
+def cut_to_sentence(text, keyword, keywordindex):
+ """ Cuts the sentence around a keyword out of the text
+ Arguments
+ ----------
+ text : str
+ Text out of which the sentence should be extracted
+ keyword : str
+ Keyword in the sentence of the text
+ keywordindex: int
+ Index of the keyword in the text
+ Returns
+ -------
+    Indices of the sentence in the text and the sentence itself as a string
+ """
+    # Strings after which a period does not end a sentence
+ safe = ["Ms", "Mr", "Fr", "Hr", "Dipl", "B", "M", "Sc", "Dr", "Prof",
+ "Mo", "Mon", "Di", "Tu", "Tue", "Tues", "Mi", "Wed", "Do", "Th",
+ "Thu", "Thur", "Thurs", "Fr", "Fri", "Sa", "Sat", "So", "Sun",
+ "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
+ "str"]
+
+ # Find beginning
+ rfind_results = []
+ end_ = keywordindex
+ # Special Case "."
+ while True:
+ rfind_ = text.rfind(". ", 0, end_)
+ if not rfind_ == -1:
+ no_safe = False
+ for i, s in enumerate(safe):
+ if text[0:rfind_][::-1].find(s[::-1]) == 0:
+ end_ = rfind_ - len(s)
+ break
+ if i == len(safe)-1:
+ no_safe = True
+ if no_safe is True:
+ break
+ else:
+ break
+ rfind_results.append(rfind_)
+
+ rfind_results.append(max([text.rfind(sentence_ending, 0, keywordindex)
+ for sentence_ending in ["! ", "? "]]))
+
+ rfind_result = max(rfind_results)
+ if rfind_result == -1:
+ start = 0
+ else:
+ start = rfind_result + 2
+
+ # Find ending
+ find_results = []
+ start_ = keywordindex+len(keyword)
+ # Special Case "."
+ while True:
+ find_ = text.find(". ", start_)
+ if not find_ == -1:
+ no_safe = False
+ for i, s in enumerate(safe):
+ if text[0:find_][::-1].find(s[::-1]) == 0:
+ start_ = find_ + len(s)
+ break
+ if i == len(safe)-1:
+ no_safe = True
+ if no_safe is True:
+ break
+ else:
+ break
+ find_results.append(find_)
+
+ find_results.extend([text.find(sentence_ending, keywordindex+len(keyword))
+ for sentence_ending in ["! ", "? "]])
+ find_results_bigger_neg_1 = [i for i in find_results if i >= 0]
+ if not find_results_bigger_neg_1:
+ end = len(text)
+ else:
+ end = min(find_results_bigger_neg_1) + 1
+
+ return list(range(start, end)), text[start:end]
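+
+# Illustrative example (hypothetical text): the safe list keeps "Dr." from
+# being treated as a sentence end, so the cut reaches back to the text start:
+#   cut_to_sentence("Dr. Smith saw the great fox. It ran away.", "great", 18)
+#   -> (list(range(0, 28)), 'Dr. Smith saw the great fox.')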
+
+def find_nth_occurrence(text, searchstr, nth=1, startindex=0):
+ """
+    Finds the index of the nth occurrence of searchstr in the text, starting
+    from a given startindex.
+ """
+ start = text.find(searchstr, startindex)
+
+ if start == -1:
+ return len(text)-1
+
+ for i in range(nth-1):
+ find_index = text.find(searchstr, start+len(searchstr))
+ if find_index == -1:
+ return len(text)-1
+ else:
+ start = find_index
+
+ return start
+
+def rfind_nth_occurrence(text, searchstr, nth=1, endindex=None):
+ """
+    Finds the index of the nth occurrence of searchstr in the text, searching
+    backwards from a given endindex.
+ """
+ if endindex is None:
+ endindex = len(text)
+
+ end = text.rfind(searchstr, 0, endindex)
+
+ if end == -1:
+ return 0
+
+ for i in range(nth-1):
+ rfind_index = text.rfind(searchstr, 0, end)
+ if rfind_index == -1:
+ return 0
+ else:
+ end = rfind_index
+
+ return end
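+
+# For example, in "a b c b" (spaces at indices 1, 3 and 5):
+#   find_nth_occurrence("a b c b", " ", nth=2)   -> 3  (2nd space from the left)
+#   rfind_nth_occurrence("a b c b", " ", nth=2)  -> 3  (2nd space from the right)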
+
+def keywords_in_context(text, keywords, max_words=5, sep="...", cut_sentences=True):
+ """ Returns the relevant context around keywords in a larger text.
+ Arguments
+ ----------
+ text : str
+        Text which should be summarized around the keywords.
+    keywords : list of str
+        Keywords whose context we want to extract out of the text.
+    max_words : int
+        Maximum number of words before and after a keyword, if no sentence
+        beginning or ending occurs first and cut_sentences is set.
+    sep : str
+        String which represents skipped portions of the text in the result.
+    cut_sentences : bool
+        Whether the context around a keyword is cut at the beginning or end
+        of a sentence.
+    Returns
+    -------
+    Summarized text containing the keywords in context, as a string.
+ """
+ indices_lst = []
+ for k in keywords:
+ start = text.find(k)
+ while not start == -1:
+ indices_lst.append((k, start))
+ start = text.find(k, start+len(k))
+
+ result_indices = set()
+ for index_tpl in indices_lst:
+ keyword, index = index_tpl
+ start = rfind_nth_occurrence(text, " ", nth=max_words+1, endindex=index)
+ if not start == 0:
+ start += 1 # +1 to Remove the first " "
+ end = find_nth_occurrence(text, " ", nth=max_words+1, startindex=index+len(keyword))
+ if end == len(text)-1:
+ end += 1
+ indices_of_text = set(range(start, end))
+ if cut_sentences:
+ sentence_indices, _ = cut_to_sentence(text, keyword, index)
+ indices_of_text.intersection_update(set(sentence_indices))
+ for i in indices_of_text:
+ result_indices.add(i)
+
+ result_indices = list(result_indices)
+ result_indices.sort()
+
+ result = ""
+ i_before = -1
+ for _i, i in enumerate(result_indices):
+ if not (i-1) == i_before:
+ result += " " + sep + " " + text[i]
+ i_before = i
+ else:
+ result += text[i]
+ i_before = i
+
+    # If the last word is not the end of the text, add the separator.
+ if _i == len(result_indices)-1:
+ if not i == len(text)-1:
+ result += " " + sep
+
+ return result
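+
+# Illustrative call (hypothetical text): with max_words=1 only one word of
+# context survives on each side, and the cut-off tail becomes the separator:
+#   keywords_in_context("The great fox ran away! It was fast.",
+#                       ["great"], max_words=1)
+#   -> 'The great fox ...'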
+
+def find_and_replace(text, find_str, replacement_str):
+ """ Find and replace a find_str with a replacement_str in text. """
+ start = text.find(find_str)
+ offset = 0
+ while start != -1:
+ # update the index compatible to the whole text
+ start = start + offset
+
+ # replace (cut the original word out and insert the replacement)
+ text = text[:start] + replacement_str + text[start+len(find_str):]
+
+ offset = start + len(replacement_str)
+ start = text[offset:].find(find_str)
+
+ return text
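+
+# e.g. find_and_replace("a cat and a cat", "cat", "dog") -> 'a dog and a dog'.
+# Matches inside the replacement itself are skipped, so the ANSI highlighting
+# in __main__ below cannot loop forever on its own output.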
+
+if __name__ == "__main__":
+    # Example values (illustrative only): TEXT is the text to be shrunk,
+    # KEYWORDS are the searched words.
+    TEXT = "The great fox ran away! It was fast. His importance to photography is great."
+    KEYWORDS = ["great"]
+    result_text = keywords_in_context(TEXT, KEYWORDS)
+ # Highlight Keywords
+ for k in KEYWORDS:
+ result_text = find_and_replace(result_text, k, "\x1b[34m"+k+"\x1b[0m")
+
+ print(result_text)
diff --git a/src/train/store.py b/src/train/store.py
index 8cf1dd5..853903c 100644
--- a/src/train/store.py
+++ b/src/train/store.py
@@ -175,10 +175,11 @@ def select_data(self, cursor, word, language):
# put config info of database to db_config variable
store_data = StoreData(db_config['user'],
db_config['password'],
- db_config['host'],
- db_config['database']
+ db_config['db_host'],
+ db_config['db_name']
)
conn = store_data.db_connect()
+
store_data.create_database(conn.cursor())
store_data.create_tables(conn.cursor(), TABLES, TABLES_SENTENCES)
print('TABLES CREATED: SUCCESS')
diff --git a/src/train/train_cluster.py b/src/train/train_cluster.py
index 429bb9b..b9fb294 100644
--- a/src/train/train_cluster.py
+++ b/src/train/train_cluster.py
@@ -128,9 +128,9 @@ def batch():
print('please input word vector filepath')
# first loading udpipe to segement word for each sentence
- udt_english = UdpipeTrain(languange_name, udpipe_pre_model_path, corpus_filepath)
+ udt_english = UdpipeTrain(language_name, udpipe_pre_model_path, corpus_filepath)
# second train to get the word2vec udpipemodel
- train_model(languange_name, corpus_filepath, file_path, udt_english)
+ train_model(language_name, corpus_filepath, file_path, udt_english)
# finally, after train we can load udpipemodel to use directly
load_model(file_path)
print('All done')
@@ -140,11 +140,12 @@ def batch():
# udt_lang = UdpipeTrain(lang, udpipe_pre_model_path, corpus_filepath)
# second train to get the word2vec model
# word2vec_result_file = 'corpus//word2vecmodel//gensim-word2vec-model-'
+ # word2vec_result_file = 'input//word2vecmodel//gensim-word2vec-model-'
# train_model(lang, corpus_filepath, word2vec_result_file, udt_lang)
-if __name__ == "__main__":
- batch()
+# if __name__ == "__main__":
+#     batch()
# languange_name = 'English'
#
# # input example
@@ -172,11 +173,5 @@ def batch():
# file_path = args.wvfp
# else:
# print('please input word vector filepath')
- #
- # # first loading udpipe to segement word for each sentence
- # udt_english = UdpipeTrain(languange_name, udpipe_pre_model_path, corpus_filepath)
- # # second train to get the word2vec model
- # train_model(languange_name, corpus_filepath, file_path, udpipe_pre_model_path)
- # # finally, after train we can load model to use directly
- # # load_model(file_path)
- # print('All done')
diff --git a/src/train/train_model.py b/src/train/train_model.py
index 11b57a6..b5d0494 100644
--- a/src/train/train_model.py
+++ b/src/train/train_model.py
@@ -4,7 +4,6 @@
Remember: the working directory needs to be set to wordfinder!
"""
-
# third-party modules
import string
import re
@@ -33,8 +32,8 @@ def __init__(self, language_name, pre_model_name, our_corpus_name):
try:
self.store_data = StoreData(db_config['user'],
db_config['password'],
- db_config['host'],
- db_config['database'])
+ db_config['db_host'],
+ db_config['db_name'])
self.cursor = self.store_data.db_connect().cursor()
# second loading udpipe pre-train model
self.model = Model(self.pre_model_name)
@@ -60,7 +59,7 @@ def clean_data(self, data: str) -> str:
"""
cleaned_data = re.sub('\w*\d\w*', '', data)
cleaned_data = re.sub('\[.*?\]', '', cleaned_data)
- cleaned_data = re.sub('[‘’“”…]','',cleaned_data)
+ cleaned_data = re.sub('[‘’“”…]', '', cleaned_data)
cleaned_data = re.sub(r'\\t | \\n', '', cleaned_data)
return cleaned_data
@@ -83,7 +82,7 @@ def do_train(self) -> List[TResult]:
for i, one_sentence in enumerate(word_pos):
sentence_text = self.extract_one_sentence(one_sentence)
results = self.extract_one_word(one_sentence, sentence_text)
- self.store_data.insert_data(self.cursor, results, self.language_name)
+ # self.store_data.insert_data(self.cursor, results, self.language_name)
print('line %d, batch %d for %s written successfully' % (line_no, i, self.language_name))
line_no += 1
print('all written successfully for corpus %s' % self.our_corpus_name)
@@ -137,7 +136,7 @@ def extract_one_word(self, sentence, sentence_text: str) -> [TResult]:
for word in sentence.words:
if word.lemma and word.lemma not in string.punctuation:
if word.lemma and word.upostag and sentence_text:
- combined_words .append(TResult(word.lemma, word.upostag, sentence_text))
+ combined_words.append(TResult(word.lemma, word.upostag, sentence_text))
self._word_count += 1
return combined_words
@@ -182,12 +181,14 @@ def batch_train():
udpipe_pre_model_path = udpipe_language[lang]
corpus_filepath = corpus_language[lang]
train_model = UdpipeTrain(lang, udpipe_pre_model_path, corpus_filepath)
- print('begin train %s corpus' % (lang, ))
+ print('begin train %s corpus' % (lang,))
train_model.do_train()
print('done train %s corpus' % (lang,))
if __name__ == '__main__':
batch_train()
parser = argparse.ArgumentParser(description='train corpus to get word, pos, and related sentence')
parser.add_argument('-udfp', help='udpipe pre-model filepath')
@@ -201,9 +202,15 @@ def batch_train():
corpus_filepath = args.cfp
else:
print('please input corpus filepath')
- # Italian
+
+    # English
+ udt_english = UdpipeTrain(language_list[1], udpipe_pre_model_path, corpus_filepath)
+ udt_english.do_train()
+
+ ''' # Italian
udt_chinese = UdpipeTrain(language_list[0], udpipe_pre_model_path, corpus_filepath)
- udt_chinese.do_train()
+ udt_chinese.do_train()
+    '''
'''
# Chinese
udt_chinese = UdpipeTrain(language_list[0], udpipe_pre_model_path, corpus_filepath)
@@ -268,4 +275,4 @@ def batch_train():
# Spanish
udt_spanish = UdpipeTrain(language_list[15], udpipe_pre_model_path, corpus_filepath)
udt_spanish.do_train()
-'''
\ No newline at end of file
+'''
diff --git a/src/util.py b/src/util.py
index 013e58e..55e3af3 100644
--- a/src/util.py
+++ b/src/util.py
@@ -7,15 +7,11 @@
# database config
# cofig for local database
-db_config = {
- 'host': 'psd-wordfinder.mysql.database.azure.com',
- 'database': 'psd_project',
- 'user': 'adminteam@psd-wordfinder',
- 'password': 'jFq&T7bPJXmY',
- #'client_flags': [mysql.connector.ClientFlag.SSL],
- #'ssl_ca': './/src//train//DigiCertGlobalRootG2.crt.pem' #vscode
- 'ssl_ca': 'DigiCertGlobalRootG2.crt.pem' #pycharm
-}
+db_config = {'user': 'root',
+ 'password': 'root',
+ 'db_host': 'localhost',
+ 'db_name': 'psd_project'}
+
language_list = [
@@ -115,7 +111,6 @@
'Spanish': './corpus/word2vecmodel/gensim-word2vec-udpipemodel-Spanish'
}
-
def get_keyword_window(sel_word: str, words_of_sentence: List, length=5) -> List[str]:
"""
find the index of sel_word at sentence, then decide words of @length size