diff --git a/corpus/udpipemodel/english-ewt-ud-2.5-191206.udpipe b/corpus/udpipemodel/english-ewt-ud-2.5-191206.udpipe
new file mode 100644
index 0000000..7f16e14
Binary files /dev/null and b/corpus/udpipemodel/english-ewt-ud-2.5-191206.udpipe differ
diff --git a/requirements.txt b/requirements.txt
index 523768a..39d6d74 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,11 +1,11 @@
-requests==2.25.1
-nltk==3.5
-numpy~=1.19.2
-beautifulsoup4==4.9.3
-corpy==0.3.0
-Flask==1.1.2
-gensim==3.8.3
-pymysql==1.0.2
-pysolr-3.9.0
-mysql~=5.7.24
+requests==2.25.1
+nltk==3.5
+numpy~=1.19.2
+beautifulsoup4==4.9.3
+corpy==0.3.0
+Flask==1.1.2
+gensim==3.8.3
+pymysql==1.0.2
+pysolr==3.9.0
+mysql~=5.7.24
 scikit-learn~=0.24.1
\ No newline at end of file
diff --git a/src/app.py b/src/app.py
index ded21ae..d5f4461 100644
--- a/src/app.py
+++ b/src/app.py
@@ -8,13 +8,11 @@
 from src.service import AppService
 from flask import Flask, render_template, request, redirect, url_for, flash
 
-
 app = Flask(__name__)
 
 # TODO: need to change with the selection different language
 appService = AppService()
 
-
 @app.route('/')
 def index():
     """
@@ -23,7 +21,6 @@ def index():
     """
     return render_template('index.html')
 
-
 @app.route('/find', methods=['POST'])
 def find():
     """
@@ -45,7 +42,6 @@ def find():
         "sel_word": sel_word,
         "sel_result": appService.sel_result})
 
-
 @app.route('/find2', methods=['POST'])
 def find2():
     language_name, sel_word = None, None
@@ -59,7 +55,6 @@ def find2():
         "sel_word": sel_word,
         "sel_result": appService.sel_result})
 
-
 @app.route('/cluster', methods=['POST'])
 def cluster():
     """
@@ -76,13 +71,8 @@ def cluster():
     if not appService.udt_pre_model:
         appService.config_udpipe(language_name)
     cluster_model_file = word2vec_language[language_name]
-    cluster_result, rec_cluster_result = appService.cluster_sentences(
-        language_name, cluster_model_file, cluster_input_sentence, cluster_number)
-    return render_template('cluster.html',
-                           cluster_number=cluster_number,
-                           cluster_result=cluster_result,
-                           rec_cluster_result=rec_cluster_result)
-
+    cluster_result, rec_cluster_result = appService.cluster_sentences(language_name, cluster_model_file, cluster_input_sentence, cluster_number)
+    return render_template('cluster.html', cluster_number=cluster_number, cluster_result=cluster_result, rec_cluster_result=rec_cluster_result)
 
 if __name__ == '__main__':
-    app.run(port=3000, debug=True)
\ No newline at end of file
+    app.run(port=3000, debug=True)
diff --git a/src/databaseClustering.py b/src/databaseClustering.py
index 34a29ae..57545f4 100644
--- a/src/databaseClustering.py
+++ b/src/databaseClustering.py
@@ -1,4 +1,4 @@
-import mysql.connector
+import mysql.connector  # note: pymysql has no "connector" submodule, and this module's queries still use mysql.connector
 from mysql.connector import errorcode
 from datetime import datetime
 import pandas as pd
@@ -11,32 +11,44 @@
 from util import db_config
 
-
 def train_model(language_name, corpus_path, save_path):
-
-    model = gensim.models.Word2Vec(sentences=corpus_path,
-                                   size=150,
-                                   window=8,
-                                   min_count=2,
-                                   workers=2,
-                                   iter=10)
-    model.save(save_path + language_name)
-    print('Save succeed')
+    model = gensim.models.Word2Vec(sentences=corpus_path,
+                                   size=150,
+                                   window=8,
+                                   min_count=2,
+                                   workers=2,
+                                   iter=10)
+    model.save(save_path + language_name)
+    print('Save succeeded')
 
 
 def load_model(save_path) -> gensim.models.Word2Vec:
-    filename = save_path
-    model = gensim.models.Word2Vec.load(filename)
-    print('Loading succeed')
-    for index, word in enumerate(model.wv.index2word):
-        if index == 5:
-            break
-        vec = ",".join(map(lambda i: str(i), model.wv[word]))
-        print(f"word #{index}/{len(model.wv.index2word)} is {word}, vec = {vec}")
-    return model
-
-def database():
-
+    filename = save_path
+    model = gensim.models.Word2Vec.load(filename)
+    print('Loading succeeded')
+    for index, word in enumerate(model.wv.index2word):
+        if index == 5:
+            break
+        vec = ",".join(map(lambda i: str(i), model.wv[word]))
+        print(f"word #{index}/{len(model.wv.index2word)} is {word}, vec = {vec}")
+    return model
+
+
+# def database():
+#     db = mysql.connector.connect(
+#         host='localhost',
+#         user='root',
+#         password='root',
+#         database='psd_project'
+#     )
+#     mycursor = db.cursor()
+#     query_info = ("SELECT sentence FROM english_sentences")
+#     mycursor.execute(query_info)
+#     sentences_df = pd.DataFrame(mycursor.fetchall(), columns=['Sentences'])
+
+#     return sentences_df
+
+# NOTE: keep exactly one database() implementation — either the commented-out
+# hard-coded block above (lines 37-48) or the db_config-based one below (lines 52-63).
+def database():
     db = mysql.connector.connect(
         host=db_config['host'],
         user=db_config['user'],
@@ -50,76 +62,80 @@ def database():
     return sentences_df
 
+
 def textProcessing(text):
-    no_stop =[words for words in text.split() if words.lower() not in string.punctuation]
+    no_stop = [words for words in text.split() if words.lower() not in string.punctuation]
     return no_stop
 
-def cluster_sentences(language_name: str, save_path: str, sentences: List[str], n_clusters: int) :
-
-    n_clusters = int(n_clusters)
-    print("clusters are ",n_clusters)
-    if n_clusters <=0:
-        print("Parameter is Invalid")
-        return
-    if n_clusters > len(sentences):
-        # TODO add log
-        print('number of cluster bigger than sentences count')
-        return
-    # first loading model
-    word2vec_model = load_model(save_path)
-    # second geting vectors for one sentence
-    sent_vectors = []
-    default_dimn = 100
-    # iterator to sentence
-    for word1 in sentences:
-        print(word1)
-        word_vectors = []
-        for words in word1:
-
-            if words in word2vec_model.wv:
-                word_vectors.append(word2vec_model.wv[words])
-            else:  # not in dict, fill 0
-                word_vectors.append([0] * default_dimn)
-
-        to_array = np.array(word_vectors)
-        sent_vectors.append(to_array.mean(axis=0).tolist())
-    kmeans = KMeans(n_clusters=n_clusters,random_state=0).fit(sent_vectors)
-    labels = kmeans.labels_
-    tmp_labels,examples = [],[]
-    for sent,label in zip(sentences,labels):
-        if label not in tmp_labels:
-            tmp_labels.append(label)
-            examples.append(sent)
-        if len(examples) == n_clusters:
-            break
-    # add bottom logic for cluster
-    if len(examples) < n_clusters:
-        for sent in sentences:
-            if sent not in examples:
-                examples.append(sent)
-            if len(examples) >= n_clusters:
-                break
-
-    return examples
+
+def cluster_sentences(language_name: str, save_path: str, sentences: List[str], n_clusters: int):
+    n_clusters = int(n_clusters)
+    print("clusters are ", n_clusters)
+    if n_clusters <= 0:
+        print("Parameter is invalid")
+        return
+    if n_clusters > len(sentences):
+        # TODO add log
+        print('number of clusters is bigger than the sentence count')
+        return
+    # first, load the word2vec model
+    word2vec_model = load_model(save_path)
+    # second, build one vector per sentence
+    sent_vectors = []
+    default_dimn = 100
+    # iterate over sentences
+    for word1 in sentences:
+        print(word1)
+        word_vectors = []
+        for words in word1:
+
+            if words in word2vec_model.wv:
+                word_vectors.append(word2vec_model.wv[words])
+            else:  # not in dict, fill 0
+                word_vectors.append([0] * default_dimn)
+
+        to_array = np.array(word_vectors)
+        sent_vectors.append(to_array.mean(axis=0).tolist())
+    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(sent_vectors)
+    labels = kmeans.labels_
+    tmp_labels, examples = [], []
+    for sent, label in zip(sentences, labels):
+        if label not in tmp_labels:
+            tmp_labels.append(label)
+            examples.append(sent)
+        if len(examples) == n_clusters:
+            break
+    # pad with remaining sentences if some clusters yielded no example
+    if len(examples) < n_clusters:
+        for sent in sentences:
+            if sent not in examples:
+                examples.append(sent)
+            if len(examples) >= n_clusters:
+                break
+
+    return examples
 
 a = database()
+
+# file_path = r'C:\Users\haris\Desktop\wordFinder\word2vec'
+# file_path = file_path + 'English'
+
 file_path = './corpus/word2vecmodel/'
 language_name = 'english'
 file_path = file_path + language_name
 load_model(file_path)
 print('All done')
-c=a['Sentences'].apply(textProcessing)
+c = a['Sentences'].apply(textProcessing)
 
 # get word vector for one sentence
 sentences = [
-    'Tohru shows great loyalty to whoever he stands by, even back to the time when he was an Enforcer for the Dark Hand.',
-    'The Earth Demon, Dai Gui resembles a large minotaur(with the face of a guardian lion) with great strength.',
-    'Al Mulock was the great-grandson of Sir William Mulock(1843–1944), the former Canadian Postmaster - General.',
-    'Though his surviving images are scarce, his importance to the early history of photography in Asia is great.']
+    'Tohru shows great loyalty to whoever he stands by, even back to the time when he was an Enforcer for the Dark Hand.',
+    'The Earth Demon, Dai Gui resembles a large minotaur(with the face of a guardian lion) with great strength.',
+    'Al Mulock was the great-grandson of Sir William Mulock(1843–1944), the former Canadian Postmaster - General.',
+    'Though his surviving images are scarce, his importance to the early history of photography in Asia is great.']
 cluster_result = cluster_sentences(language_name, file_path,c,3)
 print("two examples sentences: \n")
 print(cluster_result)
-
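A note on the approach in `databaseClustering.py` above: `cluster_sentences` embeds each sentence as the unweighted mean of its words' word2vec vectors (zero-filling out-of-vocabulary words), runs KMeans over those sentence vectors, and keeps one representative sentence per cluster label. A minimal self-contained sketch of that idea — the toy embedding dict here stands in for the project's trained gensim model and is purely illustrative:

```python
# Sketch of the mean-word-vector + KMeans scheme used by cluster_sentences.
# The toy "embeddings" dict (word -> vector) replaces the trained word2vec model.
from typing import Dict, List

import numpy as np
from sklearn.cluster import KMeans


def sentence_vector(words: List[str], embeddings: Dict[str, np.ndarray], dim: int = 4) -> np.ndarray:
    # Average the vectors of known words; unknown words contribute zeros,
    # mirroring the "not in dict, fill 0" branch above.
    vecs = [embeddings.get(w, np.zeros(dim)) for w in words]
    return np.mean(vecs, axis=0)


rng = np.random.default_rng(0)
vocab = ["cat", "dog", "car", "bus", "great"]
embeddings = {w: rng.normal(size=4) for w in vocab}

sentences = [["cat", "dog"], ["dog", "great"], ["car", "bus"], ["bus", "car", "great"]]
X = np.stack([sentence_vector(s, embeddings) for s in sentences])

labels = KMeans(n_clusters=2, random_state=0).fit(X).labels_

# Keep one representative sentence per cluster label, as cluster_sentences does.
seen, examples = set(), []
for sent, label in zip(sentences, labels):
    if label not in seen:
        seen.add(label)
        examples.append(sent)
print(examples)
```

Averaging is order-insensitive, so two sentences with the same bag of words get identical vectors; that is the usual trade-off of this design.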
diff --git a/src/service.py b/src/service.py
index 7b4e6bd..e826570 100644
--- a/src/service.py
+++ b/src/service.py
@@ -80,7 +80,7 @@ def database(self):
                                     db_config['host'],
                                     db_config['database'])
         self.cursor = self.store_data.db_connect().cursor()
-        query_info = "SELECT sentence FROM english_sentences"
+        query_info = "SELECT sentence FROM English_sentences"
         self.cursor.execute(query_info)
         sentences_df = pd.DataFrame(self.cursor.fetchall(), columns=['Sentences'])
         return sentences_df
@@ -91,7 +91,7 @@ def clusteringData(self):
                                     db_config['host'],
                                     db_config['database'])
         self.cursor = self.store_data.db_connect().cursor()
-        query_info = "SELECT sentence FROM english_sentences"
+        query_info = "SELECT sentence FROM English_sentences"
         self.cursor.execute(query_info)
         sentences_dataframe = pd.DataFrame(self.cursor.fetchall(), columns=['Sentences'])
         return sentences_dataframe
@@ -130,7 +130,7 @@ def cluster_sentences(self, language_name: str, save_path: str, sentences: List[
             words = self.udt_pre_model.word_segmentation(sent)
             word_vectors = []
             # iterator to word
-            window_words = get_keyword_window(self.sel_result[0][0], words, 5)
+            window_words = get_keyword_window(self.sel_result[0][0], words, 10)
             for word in window_words:
                 if word in word2vec_model.wv:
                     word_vectors.append(word2vec_model.wv[word])
@@ -210,13 +210,18 @@ def _get_examples(self, sentences: List[str], best_labels, n_clusters: int):
     ]
     save_path = './/corpus//english//'
     # first loading udpipe to segement word for each sentence
+    # udt_english = UdpipeTrain(language_list[1],
+    #                           r'C:\Users\haris\Desktop\wordFinder\english-ewt-ud-2.5-191206.udpipe',
+    #                           r'C:\Users\haris\Desktop\wordFinder\haris.txt')
+
     udt_english = UdpipeTrain(language_list[1],
                               r'.//corpus//udpipemodel//english.udpipe',
                               r'.//corpus//english//135-0.txt')
-    cluster_result = AppService().config_udpipe(language_name).cluster_sentences(language_name, save_path, sentences='3', n_clusters=2)
-    '''
+    # cluster_result = AppService().config_udpipe(language_name).cluster_sentences(language_name, save_path, sentences='3', n_clusters=2)
+    # '''
+    cluster_result = AppService().config_udpipe(language_name).cluster_sentences(language_name, save_path, sentences, 2)
     print("two examples sentences: \n")
     print(cluster_result)
diff --git a/src/templates/cluster.html b/src/templates/cluster.html
index e88a65f..d4ec5b9 100644
--- a/src/templates/cluster.html
+++ b/src/templates/cluster.html
@@ -31,6 +31,7 @@
 
 Well done!
 
 After clustering, you get {{cluster_number}} example {% for cluster_sentence in cluster_result %}
   • {{cluster_sentence}}
+  <!-- TODO: add KWIC functionality -->
   • {% endfor %} {% endif %}
diff --git a/src/train/KWIC.py b/src/train/KWIC.py
new file mode 100644
index 0000000..bf8abf0
--- /dev/null
+++ b/src/train/KWIC.py
@@ -0,0 +1,244 @@
+def getNGrams(wordlist, n):
+    return [wordlist[i:i + n] for i in range(len(wordlist) - (n - 1))]
+
+
+# Given a list of n-grams, return a dictionary of KWICs,
+# indexed by keyword.
+def nGramsToKWICDict(ngrams):
+    keyindex = len(ngrams[0]) // 2
+
+    kwicdict = {}
+
+    for k in ngrams:
+        if k[keyindex] not in kwicdict:
+            kwicdict[k[keyindex]] = [k]
+        else:
+            kwicdict[k[keyindex]].append(k)
+    return kwicdict
+
+
+# Given a KWIC, return a string that is formatted for
+# pretty printing.
+def prettyPrintKWIC(kwic):
+    n = len(kwic)
+    keyindex = n // 2
+    width = 10
+
+    outstring = ' '.join(kwic[:keyindex]).rjust(width * keyindex)
+    outstring += str(kwic[keyindex]).center(len(kwic[keyindex]) + 6)
+    outstring += ' '.join(kwic[(keyindex + 1):])
+
+    return outstring
+
+
+def cut_to_sentence(text, keyword, keywordindex):
+    """ Cuts the sentence around a keyword out of the text
+    Arguments
+    ----------
+    text : str
+        Text out of which the sentence should be extracted
+    keyword : str
+        Keyword in the sentence of the text
+    keywordindex: int
+        Index of the keyword in the text
+    Returns
+    -------
+    Indices of the sentence in the text and a string of the sentence
+    """
+    # Strings after which a period does not end a sentence
+    safe = ["Ms", "Mr", "Fr", "Hr", "Dipl", "B", "M", "Sc", "Dr", "Prof",
+            "Mo", "Mon", "Di", "Tu", "Tue", "Tues", "Mi", "Wed", "Do", "Th",
+            "Thu", "Thur", "Thurs", "Fr", "Fri", "Sa", "Sat", "So", "Sun",
+            "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
+            "str"]
+
+    # Find beginning
+    rfind_results = []
+    end_ = keywordindex
+    # Special case "."
+    while True:
+        rfind_ = text.rfind(". ", 0, end_)
+        if not rfind_ == -1:
+            no_safe = False
+            for i, s in enumerate(safe):
+                if text[0:rfind_][::-1].find(s[::-1]) == 0:
+                    end_ = rfind_ - len(s)
+                    break
+                if i == len(safe)-1:
+                    no_safe = True
+            if no_safe is True:
+                break
+        else:
+            break
+    rfind_results.append(rfind_)
+
+    rfind_results.append(max([text.rfind(sentence_ending, 0, keywordindex)
+                              for sentence_ending in ["! ", "? "]]))
+
+    rfind_result = max(rfind_results)
+    if rfind_result == -1:
+        start = 0
+    else:
+        start = rfind_result + 2
+
+    # Find ending
+    find_results = []
+    start_ = keywordindex+len(keyword)
+    # Special case "."
+    while True:
+        find_ = text.find(". ", start_)
+        if not find_ == -1:
+            no_safe = False
+            for i, s in enumerate(safe):
+                if text[0:find_][::-1].find(s[::-1]) == 0:
+                    start_ = find_ + len(s)
+                    break
+                if i == len(safe)-1:
+                    no_safe = True
+            if no_safe is True:
+                break
+        else:
+            break
+    find_results.append(find_)
+
+    find_results.extend([text.find(sentence_ending, keywordindex+len(keyword))
+                         for sentence_ending in ["! ", "? "]])
+    find_results_bigger_neg_1 = [i for i in find_results if i >= 0]
+    if not find_results_bigger_neg_1:
+        end = len(text)
+    else:
+        end = min(find_results_bigger_neg_1) + 1
+
+    return list(range(start, end)), text[start:end]
+
+
+def find_nth_occurrence(text, searchstr, nth=1, startindex=0):
+    """
+    Finds the index of the nth occurrence of a searchstr in the text starting
+    from a given startindex.
+    """
+    start = text.find(searchstr, startindex)
+
+    if start == -1:
+        return len(text)-1
+
+    for i in range(nth-1):
+        find_index = text.find(searchstr, start+len(searchstr))
+        if find_index == -1:
+            return len(text)-1
+        else:
+            start = find_index
+
+    return start
+
+
+def rfind_nth_occurrence(text, searchstr, nth=1, endindex=None):
+    """
+    Finds the index of the nth occurrence of a searchstr in the text going
+    backwards from a given endindex.
+    """
+    if endindex is None:
+        endindex = len(text)
+
+    end = text.rfind(searchstr, 0, endindex)
+
+    if end == -1:
+        return 0
+
+    for i in range(nth-1):
+        rfind_index = text.rfind(searchstr, 0, end)
+        if rfind_index == -1:
+            return 0
+        else:
+            end = rfind_index
+
+    return end
+
+
+def keywords_in_context(text, keywords, max_words=5, sep="...", cut_sentences=True):
+    """ Returns the relevant context around keywords in a larger text.
+    Arguments
+    ----------
+    text : str
+        Text which should be summarized around keywords.
+    keywords : list of str
+        Keywords whose context we want to extract out of the text.
+    max_words : int
+        Maximum number of words before and after a keyword if no sentence
+        beginning or ending occurs and cut_sentences is set.
+    sep : str
+        String which represents skipped portions of the text in the result.
+    cut_sentences : bool
+        Set if the context around a keyword is cut at the beginning or end of
+        a sentence
+    Returns
+    -------
+    Summarized text containing the keywords in context as string.
+    """
+    indices_lst = []
+    for k in keywords:
+        start = text.find(k)
+        while not start == -1:
+            indices_lst.append((k, start))
+            start = text.find(k, start+len(k))
+
+    result_indices = set()
+    for index_tpl in indices_lst:
+        keyword, index = index_tpl
+        start = rfind_nth_occurrence(text, " ", nth=max_words+1, endindex=index)
+        if not start == 0:
+            start += 1  # +1 to remove the first " "
+        end = find_nth_occurrence(text, " ", nth=max_words+1, startindex=index+len(keyword))
+        if end == len(text)-1:
+            end += 1
+        indices_of_text = set(range(start, end))
+        if cut_sentences:
+            sentence_indices, _ = cut_to_sentence(text, keyword, index)
+            indices_of_text.intersection_update(set(sentence_indices))
+        for i in indices_of_text:
+            result_indices.add(i)
+
+    result_indices = list(result_indices)
+    result_indices.sort()
+
+    result = ""
+    i_before = -1
+    for _i, i in enumerate(result_indices):
+        if not (i-1) == i_before:
+            result += " " + sep + " " + text[i]
+            i_before = i
+        else:
+            result += text[i]
+            i_before = i
+
+        # If the last word is not the end of the text, add the separator.
+        if _i == len(result_indices)-1:
+            if not i == len(text)-1:
+                result += " " + sep
+
+    return result
+
+
+def find_and_replace(text, find_str, replacement_str):
+    """ Find and replace a find_str with a replacement_str in text. """
+    start = text.find(find_str)
+    offset = 0
+    while start != -1:
+        # update the index relative to the whole text
+        start = start + offset
+
+        # replace (cut the original word out and insert the replacement)
+        text = text[:start] + replacement_str + text[start+len(find_str):]
+
+        offset = start + len(replacement_str)
+        start = text[offset:].find(find_str)
+
+    return text
+
+
+if __name__ == "__main__":
+    """
+    TEXT = sentence which needs to be shrunk
+    KEYWORDS = searched words
+    """
+    # example inputs for the demo below
+    TEXT = "KWIC stands for keyword in context. A KWIC index shows each keyword with its context."
+    KEYWORDS = ["keyword"]
+
+    result_text = keywords_in_context(TEXT, KEYWORDS)
+    # Highlight keywords
+    for k in KEYWORDS:
+        result_text = find_and_replace(result_text, k, "\x1b[34m"+k+"\x1b[0m")
+
+    print(result_text)
""" - # third-party modules import string import re @@ -33,8 +32,8 @@ def __init__(self, language_name, pre_model_name, our_corpus_name): try: self.store_data = StoreData(db_config['user'], db_config['password'], - db_config['host'], - db_config['database']) + db_config['db_host'], + db_config['db_name']) self.cursor = self.store_data.db_connect().cursor() # second loading udpipe pre-train model self.model = Model(self.pre_model_name) @@ -60,7 +59,7 @@ def clean_data(self, data: str) -> str: """ cleaned_data = re.sub('\w*\d\w*', '', data) cleaned_data = re.sub('\[.*?\]', '', cleaned_data) - cleaned_data = re.sub('[‘’“”…]','',cleaned_data) + cleaned_data = re.sub('[‘’“”…]', '', cleaned_data) cleaned_data = re.sub(r'\\t | \\n', '', cleaned_data) return cleaned_data @@ -83,7 +82,7 @@ def do_train(self) -> List[TResult]: for i, one_sentence in enumerate(word_pos): sentence_text = self.extract_one_sentence(one_sentence) results = self.extract_one_word(one_sentence, sentence_text) - self.store_data.insert_data(self.cursor, results, self.language_name) + # self.store_data.insert_data(self.cursor, results, self.language_name) print('line %d, batch %d for %s written succeed' % (line_no, i, self.language_name)) line_no += 1 print(' all written succeed for corpus of %s' % self.our_corpus_name) @@ -137,7 +136,7 @@ def extract_one_word(self, sentence, sentence_text: str) -> [TResult]: for word in sentence.words: if word.lemma and word.lemma not in string.punctuation: if word.lemma and word.upostag and sentence_text: - combined_words .append(TResult(word.lemma, word.upostag, sentence_text)) + combined_words.append(TResult(word.lemma, word.upostag, sentence_text)) self._word_count += 1 return combined_words @@ -182,12 +181,14 @@ def batch_train(): udpipe_pre_model_path = udpipe_language[lang] corpus_filepath = corpus_language[lang] train_model = UdpipeTrain(lang, udpipe_pre_model_path, corpus_filepath) - print('begin train %s corpus' % (lang, )) + print('begin train %s corpus' % (lang,)) train_model.do_train() print('done train %s corpus' % (lang,)) if __name__ == '__main__': + + # batch_train() batch_train() parser = argparse.ArgumentParser(description='train corpus to get word, pos, and related sentence') parser.add_argument('-udfp', help='udpipe pre-model filepath') @@ -201,9 +202,15 @@ def batch_train(): corpus_filepath = args.cfp else: print('please input corpus filepath') - # Italian + +# English + udt_english = UdpipeTrain(language_list[1], udpipe_pre_model_path, corpus_filepath) + udt_english.do_train() + + ''' # Italian udt_chinese = UdpipeTrain(language_list[0], udpipe_pre_model_path, corpus_filepath) - udt_chinese.do_train() + udt_chinese.do_train() + '''' ''' # Chinese udt_chinese = UdpipeTrain(language_list[0], udpipe_pre_model_path, corpus_filepath) @@ -268,4 +275,4 @@ def batch_train(): # Spanish udt_spanish = UdpipeTrain(language_list[15], udpipe_pre_model_path, corpus_filepath) udt_spanish.do_train() -''' \ No newline at end of file +''' diff --git a/src/util.py b/src/util.py index 013e58e..55e3af3 100644 --- a/src/util.py +++ b/src/util.py @@ -7,15 +7,11 @@ # database config # cofig for local database -db_config = { - 'host': 'psd-wordfinder.mysql.database.azure.com', - 'database': 'psd_project', - 'user': 'adminteam@psd-wordfinder', - 'password': 'jFq&T7bPJXmY', - #'client_flags': [mysql.connector.ClientFlag.SSL], - #'ssl_ca': './/src//train//DigiCertGlobalRootG2.crt.pem' #vscode - 'ssl_ca': 'DigiCertGlobalRootG2.crt.pem' #pycharm -} +db_config = {'user': 'root', + 'password': 'root', 
+ 'db_host': 'localhost', + 'db_name': 'psd_project'} + language_list = [ @@ -115,7 +111,6 @@ 'Spanish': './corpus/word2vecmodel/gensim-word2vec-udpipemodel-Spanish' } - def get_keyword_window(sel_word: str, words_of_sentence: List, length=5) -> List[str]: """ find the index of sel_word at sentence, then decide words of @length size
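For reference, `get_keyword_window` — whose docstring the diff cuts off here, and which service.py now calls with a window of 10 instead of 5 — narrows a segmented sentence to the tokens around the selected keyword before word vectors are averaged. A sketch of the described behaviour, assuming the window extends `length` tokens to each side of the first occurrence of `sel_word` and falls back to the whole sentence when the keyword is absent; the repo's actual implementation may differ:

```python
from typing import List


def keyword_window_sketch(sel_word: str, words_of_sentence: List[str], length: int = 5) -> List[str]:
    """Return up to `length` words on each side of sel_word (sketch of get_keyword_window)."""
    if sel_word not in words_of_sentence:
        return words_of_sentence  # assumption: fall back to the whole sentence
    idx = words_of_sentence.index(sel_word)
    start = max(0, idx - length)
    return words_of_sentence[start:idx + length + 1]


print(keyword_window_sketch("great", "Tohru shows great loyalty to whoever he stands by".split(), 2))
# -> ['Tohru', 'shows', 'great', 'loyalty', 'to']
```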