From 2acac880fc7de656bcfdd51c876b44c7bde70771 Mon Sep 17 00:00:00 2001 From: Zhen Guo Date: Fri, 23 Apr 2021 08:11:39 +0800 Subject: [PATCH 01/23] KWIC2 --- src/service.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/service.py b/src/service.py index 2a46af7..5ece0b0 100644 --- a/src/service.py +++ b/src/service.py @@ -168,6 +168,15 @@ def cluster_sentences(self, language_name: str, save_path: str, sentences: List[ return examples, recommend_sentences + def kwic(self, selword: str, sentenceWithPOS: list): + # This is similar to sentenceWithPOS but processed after KWIC + sentenceWithPOS2 = [] + for sentTuple in sentenceWithPOS: + sents = sentTuple[2] + for sent in sents: + words = sent.split(" ") + self._get_keyword_window(selword, words) + def _get_keyword_window(self, sel_word: str, words_of_sentence: List, length=5) -> List[str]: """ find the index of sel_word at sentence, then decide words of @length size From 418b3fbacf5355462b3adfbe14ea5937ee3112ad Mon Sep 17 00:00:00 2001 From: jackzhenguo Date: Fri, 23 Apr 2021 20:19:40 +0800 Subject: [PATCH 02/23] KWIC2 --- src/app.py | 3 ++- src/service.py | 29 ++++++++++++++++++++++------- src/static/js/main.js | 14 +++++--------- src/templates/result.html | 3 ++- src/train/cluster.py | 1 - 5 files changed, 31 insertions(+), 19 deletions(-) diff --git a/src/app.py b/src/app.py index daf3d71..bbad92b 100644 --- a/src/app.py +++ b/src/app.py @@ -43,9 +43,10 @@ def find(): if not appService.udt_pre_model: appService.config_udpipe(language_name) appService.find_service(language_name, sel_word) + sel_result_kwic = appService.kwic(sel_word, appService.sel_result) return render_template('result.html', input_data={"language_name": language_name, "sel_word": sel_word, - "sel_result": appService.sel_result}) + "sel_result": sel_result_kwic}) @app.route('/find2', methods=['POST']) diff --git a/src/service.py b/src/service.py index 5ece0b0..65ed72a 100644 --- a/src/service.py +++ b/src/service.py @@ -168,14 +168,28 @@ def cluster_sentences(self, language_name: str, save_path: str, sentences: List[ return examples, recommend_sentences - def kwic(self, selword: str, sentenceWithPOS: list): + def kwic(self, selword: str, sentence_with_pos: list): + """ + :param: selword + :param: sentenceWithPOS + + sentence_with_pos examples: + [("NOUN", "bank", ["I go to the bank", "The house lies the right of the river bank"]), + ("VERB", "bank", ["I banked in a slot"]) + """ # This is similar to sentenceWithPOS but processed after KWIC - sentenceWithPOS2 = [] - for sentTuple in sentenceWithPOS: - sents = sentTuple[2] - for sent in sents: + result = [] + for sentTuple in sentence_with_pos: + sents_kwic = [] + result.append((sentTuple[0], sentTuple[1], sentTuple[2], sents_kwic)) + + sents_origin = sentTuple[2] + for sent in sents_origin: words = sent.split(" ") - self._get_keyword_window(selword, words) + words2 = self._get_keyword_window(selword, words) + sents_kwic.append(" ".join(words2)) + + return result def _get_keyword_window(self, sel_word: str, words_of_sentence: List, length=5) -> List[str]: """ @@ -188,10 +202,11 @@ def _get_keyword_window(self, sel_word: str, words_of_sentence: List, length=5) remember: sel_word is lemmatized """ - if length <= 0: + if length <= 0 or len(words_of_sentence) <= length: return words_of_sentence index = words_of_sentence.index(sel_word) if index == -1: + print("warning: cannot find %s in sentence: %s".format(sel_word, words_of_sentence)) return words_of_sentence # backward is not enough if index < length // 2: diff --git a/src/static/js/main.js b/src/static/js/main.js index 41ee639..a7819ac 100644 --- a/src/static/js/main.js +++ b/src/static/js/main.js @@ -6,27 +6,23 @@ function init(){ } -function findByTag(selWord, tag, rowResult){ +function findByTag(selWord, tag, rowResult, wordResultKWIC){ /* selWord: selected word rowResult: sentences tag: POS + */ $("#tagInput1").attr("value",tag); var ulControl = $('#sentencesGroup'); ulControl.find("li").remove(); - var rowResult1 = rowResult; - if(rowResult1.length > 0){ + if(wordResultKWIC.length > 0){ $('#labelId1').show(); $('#clusterDiv1').show(); } - for(i=1; i" - + "

" + part1 + "" + part2 + "" + part3 + "

" + + + "

" + "..." + wordResultKWIC[i-1] + "..." +"

" + ""+i+""+ ""; ulControl.append(ulcontent); diff --git a/src/templates/result.html b/src/templates/result.html index 41421eb..060104a 100644 --- a/src/templates/result.html +++ b/src/templates/result.html @@ -23,7 +23,8 @@

Well done!

You successfully find {{row_result.1}} + + {% endfor %} {% endif %} diff --git a/src/train/cluster.py b/src/train/cluster.py index aeca523..d37ad37 100644 --- a/src/train/cluster.py +++ b/src/train/cluster.py @@ -5,7 +5,6 @@ date:4.2.2021 """ from sklearn import metrics -from sklearn.metrics import pairwise_distances from sklearn.cluster import KMeans from sklearn.cluster import AgglomerativeClustering from sklearn.cluster import DBSCAN From ee40b0cc7ad13bd5147ab05a8b91e0103ba57ffb Mon Sep 17 00:00:00 2001 From: jackzhenguo Date: Fri, 23 Apr 2021 20:42:55 +0800 Subject: [PATCH 03/23] show all sentences labled after clustering --- src/app.py | 5 +++-- src/service.py | 2 +- src/templates/cluster.html | 22 ++++++++++++++++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/src/app.py b/src/app.py index bbad92b..a489029 100644 --- a/src/app.py +++ b/src/app.py @@ -80,12 +80,13 @@ def cluster(): if not appService.udt_pre_model: appService.config_udpipe(language_name) cluster_model_file = word2vec_language[language_name] - cluster_result, rec_cluster_result = appService.cluster_sentences( + cluster_result, rec_cluster_result, sentences, best_labels = appService.cluster_sentences( language_name, cluster_model_file, cluster_input_sentence, cluster_number) return render_template('cluster.html', cluster_number=cluster_number, cluster_result=cluster_result, - rec_cluster_result=rec_cluster_result) + rec_cluster_result=rec_cluster_result, + sentences_with_labels=zip(sentences, best_labels)) if __name__ == '__main__': diff --git a/src/service.py b/src/service.py index 65ed72a..be48dda 100644 --- a/src/service.py +++ b/src/service.py @@ -166,7 +166,7 @@ def cluster_sentences(self, language_name: str, save_path: str, sentences: List[ if no_n_input: examples = recommend_sentences - return examples, recommend_sentences + return examples, recommend_sentences, sentences, best_labels def kwic(self, selword: str, sentence_with_pos: list): """ diff --git a/src/templates/cluster.html b/src/templates/cluster.html index 7649fa2..2748b40 100644 --- a/src/templates/cluster.html +++ b/src/templates/cluster.html @@ -60,6 +60,28 @@

Well done!

After clustering, you get {{cluster_number}} example +
+ +
    + {% if cluster_result %} + {% for cluster_sentence, label in sentences_with_labels %} +
  • + {{cluster_sentence}} + {{label}} +
  • + {% endfor %} + {% endif %} +
+

+ +
+

From 16d235538c7e4d496713c1b0de5c11e0b75e692b Mon Sep 17 00:00:00 2001 From: Zhen Guo Date: Fri, 23 Apr 2021 22:38:06 +0800 Subject: [PATCH 04/23] KWIC-finish --- src/service.py | 9 +++++++-- src/templates/result.html | 1 - 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/service.py b/src/service.py index 65ed72a..640530b 100644 --- a/src/service.py +++ b/src/service.py @@ -17,6 +17,7 @@ from src.train.train_cluster import load_model from src.train.train_model import UdpipeTrain from src.train.cluster import Evaluator +import re try: store_data = StoreData(db_config['user'], @@ -186,7 +187,7 @@ def kwic(self, selword: str, sentence_with_pos: list): sents_origin = sentTuple[2] for sent in sents_origin: words = sent.split(" ") - words2 = self._get_keyword_window(selword, words) + words2 = self._get_keyword_window(selword, words, 9) sents_kwic.append(" ".join(words2)) return result @@ -204,7 +205,11 @@ def _get_keyword_window(self, sel_word: str, words_of_sentence: List, length=5) """ if length <= 0 or len(words_of_sentence) <= length: return words_of_sentence - index = words_of_sentence.index(sel_word) + index = -1 + for iw, word in enumerate(words_of_sentence): + if len(re.findall(sel_word, word)) > 0: + index = iw + if index == -1: print("warning: cannot find %s in sentence: %s".format(sel_word, words_of_sentence)) return words_of_sentence diff --git a/src/templates/result.html b/src/templates/result.html index 060104a..29a7c71 100644 --- a/src/templates/result.html +++ b/src/templates/result.html @@ -23,7 +23,6 @@

Well done!

You successfully find {{ item.caption }}--> {% endfor %} {% endif %} From 0d0b122c70bf4668531292dfc8cf77098cba9610 Mon Sep 17 00:00:00 2001 From: Zhen Guo Date: Fri, 23 Apr 2021 23:03:51 +0800 Subject: [PATCH 05/23] app.py --- src/service.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/service.py b/src/service.py index 02b37bb..bb5816e 100644 --- a/src/service.py +++ b/src/service.py @@ -207,11 +207,12 @@ def _get_keyword_window(self, sel_word: str, words_of_sentence: List, length=5) return words_of_sentence index = -1 for iw, word in enumerate(words_of_sentence): + word = word.lower() if len(re.findall(sel_word, word)) > 0: index = iw if index == -1: - print("warning: cannot find %s in sentence: %s".format(sel_word, words_of_sentence)) + print("warning: cannot find %s in sentence: %s" % (sel_word, words_of_sentence)) return words_of_sentence # backward is not enough if index < length // 2: From 88a2f6cd161b9de70ecde654c2ebb8491659a8db Mon Sep 17 00:00:00 2001 From: Zhen Guo Date: Fri, 23 Apr 2021 23:04:27 +0800 Subject: [PATCH 06/23] cluster show --- src/app.py | 4 +++- src/templates/cluster.html | 4 +--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/app.py b/src/app.py index a489029..bda2893 100644 --- a/src/app.py +++ b/src/app.py @@ -58,9 +58,10 @@ def find2(): if not appService.udt_pre_model: appService.config_udpipe(language_name) appService.find_service(language_name, sel_word) + sel_result_kwic = appService.kwic(sel_word, appService.sel_result) return render_template('result.html', input_data={"language_name": language_name, "sel_word": sel_word, - "sel_result": appService.sel_result}) + "sel_result": sel_result_kwic}) @app.route('/cluster', methods=['POST']) @@ -76,6 +77,7 @@ def cluster(): language_name = request.form['languageName'] cluster_number = request.form['clusterNumber'] sel_tag = request.form['tagInput1'] + # TODO: clicking the button of return previous page then clicking cluster button causes a bug cluster_input_sentence = appService.pos_dict[sel_tag] if not appService.udt_pre_model: appService.config_udpipe(language_name) diff --git a/src/templates/cluster.html b/src/templates/cluster.html index 2748b40..fc7b304 100644 --- a/src/templates/cluster.html +++ b/src/templates/cluster.html @@ -35,8 +35,6 @@

Well done!

After clustering, you get {{cluster_number}} example {% endfor %} {% endif %} - -
@@ -73,7 +71,7 @@

Well done!

After clustering, you get {{cluster_number}} example {% for cluster_sentence, label in sentences_with_labels %}
  • {{cluster_sentence}} - {{label}} + {{label}}
  • {% endfor %} {% endif %} From 694faecf2f0efbd4cd381078ae4177312483e9d1 Mon Sep 17 00:00:00 2001 From: Zhen Guo Date: Sat, 24 Apr 2021 08:30:21 +0800 Subject: [PATCH 07/23] sprint of new features --- README.md | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2b647c1..0f04373 100644 --- a/README.md +++ b/README.md @@ -222,8 +222,7 @@ Right now I found our repository has a problem considerable us to pay enough att such as file path of train corpus, the file path of cluster model, the file path of database config. These file paths cannot be pushed to our base repository! -We should think of a nice way to solve this issue. And I have an idea. We should maintain a common file relative path and all data files and config data should be put inside it. Also, there's another important thing to remember: don't -push these corpus and pre-train models to our base repository. We should maintain a common remote disk to store and then open and share a link to provide everyone in our group to use. +We should think of a nice way to solve this issue. And I have an idea. We should maintain a common file relative path and all data files and config data should be put inside it. Also, there's another important thing to remember: don't push these corpus and pre-train models to our base repository. We should maintain a common remote disk to store and then open and share a link to provide everyone in our group to use. I have created a file named input, there are three files inside it: corpus, udpipemodel, and word2vecmodel. All files in them are hosted at @@ -231,3 +230,39 @@ download: https://pan.baidu.com/s/14RzwuGjTZwsUhiyVSe-Pgg password: td3e downloading them and put them on root directory of wordfiner folder + + + +### Features + +Beta version supports features: + +1. Support query in 10 + languages +2. Support to select a certain language, input corresponding words, and display multiple parts of speech of words +3. Click a part of speech of the word to be looked up to show all the corresponding examples +4. Use KWIC to show examples +5. Support to input different number of clusters +6. Click cluster sentences to get examples containing words +7. Examples showing all words are supported + + + +Update features: + +1. KWIC, in the middle of the line + +2. now only show part sentence, it's better to show the whole sentence when click. + + a point on the bank hidden by brush where + +3. in cluster web interface, we should group the sentences as cluster labels, sorting. + +4. .gitignore files + +5. French clustering 3: + + ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive) + + Chinese + +6. there are bugs of cluster function \ No newline at end of file From 0dab771aad323559cfeb5c6cce805044231e7ce5 Mon Sep 17 00:00:00 2001 From: Zhen Guo Date: Sat, 24 Apr 2021 12:34:59 +0800 Subject: [PATCH 08/23] update gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 5e8ba84..f071fdd 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ .idea input __pycache__ +psd_project.sql +log/ From 91891fc1ab64ed4db5408cb70504e199733ef66e Mon Sep 17 00:00:00 2001 From: Zhen Guo Date: Sat, 24 Apr 2021 12:36:01 +0800 Subject: [PATCH 09/23] fix a score bug --- src/train/cluster.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/train/cluster.py b/src/train/cluster.py index d37ad37..dd60bcc 100644 --- a/src/train/cluster.py +++ b/src/train/cluster.py @@ -55,6 +55,8 @@ def higher_better_score(self, labels): """ higher value means better cluster result """ + if labels.min() == labels.max(): + return 1.0 return metrics.silhouette_score(self.X, labels, metric='euclidean') def nearer_zero_better_score(self, labels): From cce031dc0de147c1876c1be67edab44e1503c147 Mon Sep 17 00:00:00 2001 From: Zhen Guo Date: Sat, 24 Apr 2021 17:03:36 +0800 Subject: [PATCH 10/23] merge bert cluster model and corresponding update --- src/service.py | 55 ++++++--------------------------------- src/train/bert_cluster.py | 41 +++++++++++++++++++++++++++++ src/util.py | 49 ++++++++++++++++++++++++++++++++++ 3 files changed, 98 insertions(+), 47 deletions(-) create mode 100644 src/train/bert_cluster.py diff --git a/src/service.py b/src/service.py index bb5816e..c7bc701 100644 --- a/src/service.py +++ b/src/service.py @@ -13,7 +13,12 @@ from src.train.result_model import TResult from src.train.store import StoreData -from src.util import (language_dict, language_list, db_config, corpus_language, udpipe_language) +from src.util import (language_dict, + language_list, + db_config, + corpus_language, + udpipe_language, + get_keyword_window) from src.train.train_cluster import load_model from src.train.train_model import UdpipeTrain from src.train.cluster import Evaluator @@ -129,7 +134,7 @@ def cluster_sentences(self, language_name: str, save_path: str, sentences: List[ words = self.udt_pre_model.word_segmentation(sent) word_vectors = [] # iterator to word - window_words = self._get_keyword_window(self.sel_result[0][0], words, 5) + window_words = get_keyword_window(self.sel_result[0][0], words, 5) for word in window_words: if word in word2vec_model.wv: word_vectors.append(word2vec_model.wv[word]) @@ -187,55 +192,11 @@ def kwic(self, selword: str, sentence_with_pos: list): sents_origin = sentTuple[2] for sent in sents_origin: words = sent.split(" ") - words2 = self._get_keyword_window(selword, words, 9) + words2 = get_keyword_window(selword, words, 9) sents_kwic.append(" ".join(words2)) return result - def _get_keyword_window(self, sel_word: str, words_of_sentence: List, length=5) -> List[str]: - """ - find the index of sel_word at sentence, then decide words of @length size - by backward and forward of it. - For example: I am very happy to this course of psd if sel_word is happy, then - returning: [am, very, happy, to, this] - - if length is even, then returning [very, happy, to, this] - - remember: sel_word is lemmatized - """ - if length <= 0 or len(words_of_sentence) <= length: - return words_of_sentence - index = -1 - for iw, word in enumerate(words_of_sentence): - word = word.lower() - if len(re.findall(sel_word, word)) > 0: - index = iw - - if index == -1: - print("warning: cannot find %s in sentence: %s" % (sel_word, words_of_sentence)) - return words_of_sentence - # backward is not enough - if index < length // 2: - back_slice = words_of_sentence[:index] - # forward is also not enough, - # showing the sentence is too short compared to length parameter - if (length - index) >= len(words_of_sentence): - return words_of_sentence - else: - return back_slice + words_of_sentence[index: index + length - len(back_slice)] - # forward is not enough - if (index + length // 2) >= len(words_of_sentence): - forward_slice = words_of_sentence[index:len(words_of_sentence)] - # backward is also not enough, - # showing the sentence is too short compared to length parameter - if index - length <= 0: - return words_of_sentence - else: - return words_of_sentence[index - (length - len(forward_slice)):index] + forward_slice - - return words_of_sentence[index - length // 2: index + length // 2 + 1] if length % 2 \ - else words_of_sentence[index - length // 2 + 1: index + length // 2 + 1] - def _get_examples(self, sentences: List[str], best_labels, n_clusters: int): tmp_labels, examples = [], [] for sent, label in zip(sentences, best_labels): diff --git a/src/train/bert_cluster.py b/src/train/bert_cluster.py new file mode 100644 index 0000000..3ac1b4f --- /dev/null +++ b/src/train/bert_cluster.py @@ -0,0 +1,41 @@ +import en_trf_bertbaseuncased_lg +import nltk +from nltk.cluster import KMeansClusterer +import pandas as pd +from typing import List +from src.util import get_keyword_window + + +def bert_en(select_word, sentences: List[str]): + nlp = en_trf_bertbaseuncased_lg.load() + sents_vectors = [] + for sent in sentences: + words = sent.split(' ') + sent2 = get_keyword_window(select_word, words, length=10) + sent2 = ' '.join(sent2) + sent_vect = nlp(sent2).vector + sents_vectors.append(sent_vect) + print(sents_vectors) + return sents_vectors + + +def clustering_question(sents, sents_word2vec, NUM_CLUSTERS=15): + kclusterer = KMeansClusterer( + NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, + repeats=25, avoid_empty_clusters=True) + + assigned_clusters = kclusterer.cluster(sents_word2vec, assign_clusters=True) + data = pd.DataFrame([], columns=['text', 'cluster', 'centroid']) + data.loc[:, 'text'] = sents + data.loc[:, 'cluster'] = pd.Series(assigned_clusters, index=data.index) + data.loc[:, 'centroid'] = data['cluster'].apply(lambda x: kclusterer.means()[x]) + + return data, assigned_clusters + + +if __name__ == "__main__": + sents = ['In 222 BC, the Romans besieged Acerrae, an Insubre fortification on the right bank of the River Adda between Cremona and Laus Pompeia (Lodi Vecchio).', + 'A spokesman for the bank said "We will be compensating customers who did not receive full services from Affinion, and providing our apology."', + 'One of the first fully functional direct banks in the United States was the Security First Network Bank (SFNB), which was launched in October 1995, and was the first direct bank to be insured by the Federal Deposit Insurance Corporation.'] + sents_vectors = bert_en('bank', sents) + clustering_question(sents, sents_vectors, 2) diff --git a/src/util.py b/src/util.py index e3cf167..a803f3b 100644 --- a/src/util.py +++ b/src/util.py @@ -2,6 +2,10 @@ # user: zhenguo # date: 2020.2.28 +from typing import List +import re + + # TODO: keeping update language_list = ['Chinese', 'English', 'French', 'Italian', 'Spanish', 'Korean', 'Russian', 'Portuguese'] language_dict = {'1': 'Chinese', '2': 'English', '3': 'French', '4': 'Italian', @@ -42,3 +46,48 @@ 'Russian': 'input//word2vecmodel//gensim-word2vec-model-Russian', 'Portuguese': 'input//word2vecmodel//gensim-word2vec-model-Portuguese'} + +def get_keyword_window(sel_word: str, words_of_sentence: List, length=5) -> List[str]: + """ + find the index of sel_word at sentence, then decide words of @length size + by backward and forward of it. + For example: I am very happy to this course of psd if sel_word is happy, then + returning: [am, very, happy, to, this] + + if length is even, then returning [very, happy, to, this] + + remember: sel_word is lemmatized + """ + if length <= 0 or len(words_of_sentence) <= length: + return words_of_sentence + index = -1 + for iw, word in enumerate(words_of_sentence): + word = word.lower() + if len(re.findall(sel_word, word)) > 0: + index = iw + + if index == -1: + print("warning: cannot find %s in sentence: %s" % (sel_word, words_of_sentence)) + return words_of_sentence + # backward is not enough + if index < length // 2: + back_slice = words_of_sentence[:index] + # forward is also not enough, + # showing the sentence is too short compared to length parameter + if (length - index) >= len(words_of_sentence): + return words_of_sentence + else: + return back_slice + words_of_sentence[index: index + length - len(back_slice)] + # forward is not enough + if (index + length // 2) >= len(words_of_sentence): + forward_slice = words_of_sentence[index:len(words_of_sentence)] + # backward is also not enough, + # showing the sentence is too short compared to length parameter + if index - length <= 0: + return words_of_sentence + else: + return words_of_sentence[index - (length - len(forward_slice)):index] + forward_slice + + return words_of_sentence[index - length // 2: index + length // 2 + 1] if length % 2 \ + else words_of_sentence[index - length // 2 + 1: index + length // 2 + 1] + From e346d0296cf687ea79306c999d80f0a96eec2e98 Mon Sep 17 00:00:00 2001 From: Zhen Guo Date: Sat, 24 Apr 2021 17:38:04 +0800 Subject: [PATCH 11/23] cluster count and sentence count --- src/service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/service.py b/src/service.py index c7bc701..cd3c83a 100644 --- a/src/service.py +++ b/src/service.py @@ -117,7 +117,7 @@ def cluster_sentences(self, language_name: str, save_path: str, sentences: List[ if n_clusters <= 0: print("Parameter is Invalid") return - if n_clusters > len(sentences): + if n_clusters >= len(sentences): # TODO add log print('number of cluster bigger than sentences count') return From bf54b9aea467335054683a9fee2e07ace2fc0b07 Mon Sep 17 00:00:00 2001 From: Zhen Guo Date: Sat, 24 Apr 2021 18:34:31 +0800 Subject: [PATCH 12/23] cluster condition --- src/app.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/app.py b/src/app.py index bda2893..97e5a11 100644 --- a/src/app.py +++ b/src/app.py @@ -84,6 +84,8 @@ def cluster(): cluster_model_file = word2vec_language[language_name] cluster_result, rec_cluster_result, sentences, best_labels = appService.cluster_sentences( language_name, cluster_model_file, cluster_input_sentence, cluster_number) + if not cluster_result: + print("invalid input to cluster number") return render_template('cluster.html', cluster_number=cluster_number, cluster_result=cluster_result, From 42a64408d5a5dd79d95fbd42b2d788a79d7bd01f Mon Sep 17 00:00:00 2001 From: Zhen Guo Date: Sat, 24 Apr 2021 21:08:53 +0800 Subject: [PATCH 13/23] new feature:flash message prompt --- src/app.py | 42 ++++++++++++++++++++++++--------------- src/service.py | 14 +++++++++---- src/templates/layout.html | 20 ++++++++++++++++++- src/train/cluster.py | 5 +++++ 4 files changed, 60 insertions(+), 21 deletions(-) diff --git a/src/app.py b/src/app.py index 97e5a11..f4e8434 100644 --- a/src/app.py +++ b/src/app.py @@ -5,14 +5,15 @@ from src.train.result_model import TResult from src.train.store import StoreData from src.util import language_dict, language_list, db_config, word2vec_language -from src.service import AppService +from src.service import AppService, AppContext from flask import Flask, render_template, request, redirect, url_for, flash - app = Flask(__name__) +app.secret_key = b'_5#y2L"F4Q8z\n\xec]/' # TODO: need to change with the selection different language -appService = AppService() +app_service = AppService() +app_context = AppContext @app.route('/') @@ -40,10 +41,13 @@ def find(): language_id = request.form['sellanguage'] sel_word = request.form['selword'] language_name = language_dict[language_id] - if not appService.udt_pre_model: - appService.config_udpipe(language_name) - appService.find_service(language_name, sel_word) - sel_result_kwic = appService.kwic(sel_word, appService.sel_result) + app_context.sel_word = sel_word + app_context.sel_language = language_name + if not app_service.udt_pre_model: + app_service.config_udpipe(language_name) + app_service.find_service(language_name, sel_word) + sel_result_kwic = app_service.kwic(sel_word, app_service.sel_result) + app_context.sel_result_kwic = sel_result_kwic return render_template('result.html', input_data={"language_name": language_name, "sel_word": sel_word, "sel_result": sel_result_kwic}) @@ -55,10 +59,13 @@ def find2(): if request.method == 'POST': language_name = request.form['sellanguage'] sel_word = request.form['selword'] - if not appService.udt_pre_model: - appService.config_udpipe(language_name) - appService.find_service(language_name, sel_word) - sel_result_kwic = appService.kwic(sel_word, appService.sel_result) + app_context.sel_word = sel_word + app_context.sel_language = language_name + if not app_service.udt_pre_model: + app_service.config_udpipe(language_name) + app_service.find_service(language_name, sel_word) + sel_result_kwic = app_service.kwic(sel_word, app_service.sel_result) + app_context.sel_result_kwic = sel_result_kwic return render_template('result.html', input_data={"language_name": language_name, "sel_word": sel_word, "sel_result": sel_result_kwic}) @@ -78,14 +85,17 @@ def cluster(): cluster_number = request.form['clusterNumber'] sel_tag = request.form['tagInput1'] # TODO: clicking the button of return previous page then clicking cluster button causes a bug - cluster_input_sentence = appService.pos_dict[sel_tag] - if not appService.udt_pre_model: - appService.config_udpipe(language_name) + cluster_input_sentence = app_service.pos_dict[sel_tag] + if not app_service.udt_pre_model: + app_service.config_udpipe(language_name) cluster_model_file = word2vec_language[language_name] - cluster_result, rec_cluster_result, sentences, best_labels = appService.cluster_sentences( + cluster_result, rec_cluster_result, sentences, best_labels = app_service.cluster_sentences( language_name, cluster_model_file, cluster_input_sentence, cluster_number) if not cluster_result: - print("invalid input to cluster number") + flash("invalid input to cluster number") + return render_template('result.html', input_data={"language_name": language_name, + "sel_word": app_context.sel_word, + "sel_result": app_context.sel_result_kwic}) return render_template('cluster.html', cluster_number=cluster_number, cluster_result=cluster_result, diff --git a/src/service.py b/src/service.py index 793de12..4bd73e9 100644 --- a/src/service.py +++ b/src/service.py @@ -116,14 +116,14 @@ def cluster_sentences(self, language_name: str, save_path: str, sentences: List[ n_clusters = int(n_clusters) if n_clusters <= 0: print("Parameter is Invalid") - return - if n_clusters >= len(sentences): + return [None]*4 + if n_clusters > len(sentences): # TODO add log print('number of cluster bigger than sentences count') - return + return [None]*4 if len(self.sel_result) <= 0: print('no sentence') - return + return [None]*4 # first loading model word2vec_model = load_model(save_path) # second geting vectors for one sentence @@ -225,6 +225,12 @@ def _get_examples(self, sentences: List[str], best_labels, n_clusters: int): return examples +class AppContext(object): + sel_language = None + sel_word = None + sel_result_kwic = None + + if __name__ == "__main__": # get word vector for one sentence language_name = 'English' diff --git a/src/templates/layout.html b/src/templates/layout.html index 3a2c7b5..3692ac3 100644 --- a/src/templates/layout.html +++ b/src/templates/layout.html @@ -20,6 +20,18 @@
    + {% block content %} +
    + {% for message in get_flashed_messages() %} +
    + + {{ message }} +
    + {% endfor %} + + {% block page_content %}{% endblock %} +
    + {% endblock %}
    {% block body %} @@ -28,6 +40,12 @@ - s + + \ No newline at end of file diff --git a/src/train/cluster.py b/src/train/cluster.py index dd60bcc..b5f90ca 100644 --- a/src/train/cluster.py +++ b/src/train/cluster.py @@ -8,6 +8,7 @@ from sklearn.cluster import KMeans from sklearn.cluster import AgglomerativeClustering from sklearn.cluster import DBSCAN +import numpy as np class Evaluator(object): @@ -55,8 +56,12 @@ def higher_better_score(self, labels): """ higher value means better cluster result """ + # only one cluster if labels.min() == labels.max(): return 1.0 + # cluster count equals to len of X + if len(np.unique(labels)) == len(self.X): + return 1.0 return metrics.silhouette_score(self.X, labels, metric='euclidean') def nearer_zero_better_score(self, labels): From feb2e6c6e4a67dc951ba8bf5eb36b72269552d72 Mon Sep 17 00:00:00 2001 From: Zhen Guo Date: Tue, 27 Apr 2021 00:58:52 +0800 Subject: [PATCH 14/23] KIWC integration to js and html --- src/service.py | 8 +- src/static/js/main.js | 7 +- src/train/KWIC.py | 258 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 269 insertions(+), 4 deletions(-) create mode 100644 src/train/KWIC.py diff --git a/src/service.py b/src/service.py index 4bd73e9..9fd1a50 100644 --- a/src/service.py +++ b/src/service.py @@ -23,6 +23,7 @@ from src.train.train_model import UdpipeTrain from src.train.cluster import Evaluator import re +from src.train.KWIC import keywords_in_context, find_and_replace try: store_data = StoreData(db_config['user'], @@ -201,9 +202,10 @@ def kwic(self, selword: str, sentence_with_pos: list): sents_origin = sentTuple[2] for sent in sents_origin: - words = sent.split(" ") - words2 = get_keyword_window(selword, words, 9) - sents_kwic.append(" ".join(words2)) + result_text = keywords_in_context(sent, [selword]) + # Highlight Keywords + # result_text = find_and_replace(result_text, selword, "\x1b[34m" + selword + "\x1b[0m") + sents_kwic.append(result_text) return result diff --git a/src/static/js/main.js b/src/static/js/main.js index a7819ac..aefcc54 100644 --- a/src/static/js/main.js +++ b/src/static/js/main.js @@ -21,11 +21,16 @@ function findByTag(selWord, tag, rowResult, wordResultKWIC){ $('#clusterDiv1').show(); } for(i=1; i" - + "

    " + "..." + wordResultKWIC[i-1] + "..." +"

    " + + + "

    " + part1 + "" + part2 + "" + part3 + "

    " + ""+i+""+ ""; ulControl.append(ulcontent); + } } diff --git a/src/train/KWIC.py b/src/train/KWIC.py new file mode 100644 index 0000000..0e94c67 --- /dev/null +++ b/src/train/KWIC.py @@ -0,0 +1,258 @@ +def getNGrams(wordlist, n): + return [wordlist[i:i + n] for i in range(len(wordlist) - (n - 1))] + +# Given a list of n-grams, return a dictionary of KWICs, +# indexed by keyword. + + +def nGramsToKWICDict(ngrams): + keyindex = len(ngrams[0]) // 2 + + kwicdict = {} + + for k in ngrams: + if k[keyindex] not in kwicdict: + kwicdict[k[keyindex]] = [k] + else: + kwicdict[k[keyindex]].append(k) + return kwicdict + + +# Given a KWIC, return a string that is formatted for +# pretty printing. + +def prettyPrintKWIC(kwic): + n = len(kwic) + keyindex = n // 2 + width = 10 + + outstring = ' '.join(kwic[:keyindex]).rjust(width * keyindex) + outstring += str(kwic[keyindex]).center(len(kwic[keyindex]) + 6) + outstring += ' '.join(kwic[(keyindex + 1):]) + + return outstring + + +def cut_to_sentence(text, keyword, keywordindex): + """ Cuts the sentence around a keyword out of the text + Arguments + ---------- + text : str + Text out of which the sentence should be extracted + keyword : str + Keyword in the sentence of the text + keywordindex: int + Index of the keyword in the text + Returns + ------- + Indices of of the sentence in the text and a string of the sentence + """ + # Strings after wich a point does not end a sentence + safe = ["Ms", "Mr", "Fr", "Hr", "Dipl", "B", "M", "Sc", "Dr", "Prof", + "Mo", "Mon", "Di", "Tu", "Tue", "Tues", "Mi", "Wed", "Do", "Th", + "Thu", "Thur", "Thurs", "Fr", "Fri", "Sa", "Sat", "So", "Sun", + "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", + "str"] + + # Find beginning + rfind_results = [] + end_ = keywordindex + # Special Case "." + while True: + rfind_ = text.rfind(". ", 0, end_) + if not rfind_ == -1: + no_safe = False + for i, s in enumerate(safe): + if text[0:rfind_][::-1].find(s[::-1]) == 0: + end_ = rfind_ - len(s) + break + if i == len(safe)-1: + no_safe = True + if no_safe is True: + break + else: + break + rfind_results.append(rfind_) + + rfind_results.append(max([text.rfind(sentence_ending, 0, keywordindex) + for sentence_ending in ["! ", "? "]])) + + rfind_result = max(rfind_results) + if rfind_result == -1: + start = 0 + else: + start = rfind_result + 2 + + # Find ending + find_results = [] + start_ = keywordindex+len(keyword) + # Special Case "." + while True: + find_ = text.find(". ", start_) + if not find_ == -1: + no_safe = False + for i, s in enumerate(safe): + if text[0:find_][::-1].find(s[::-1]) == 0: + start_ = find_ + len(s) + break + if i == len(safe)-1: + no_safe = True + if no_safe is True: + break + else: + break + find_results.append(find_) + + find_results.extend([text.find(sentence_ending, keywordindex+len(keyword)) + for sentence_ending in ["! ", "? "]]) + find_results_bigger_neg_1 = [i for i in find_results if i >= 0] + if not find_results_bigger_neg_1: + end = len(text) + else: + end = min(find_results_bigger_neg_1) + 1 + + return list(range(start, end)), text[start:end] + + +def find_nth_occurrence(text, searchstr, nth=1, startindex=0): + """ + Finds the index of the nth occurence of a searchstr in the text starting + from the a given startindex. + """ + start = text.find(searchstr, startindex) + + if start == -1: + return len(text)-1 + + for i in range(nth-1): + find_index = text.find(searchstr, start+len(searchstr)) + if find_index == -1: + return len(text)-1 + else: + start = find_index + + return start + + +def rfind_nth_occurrence(text, searchstr, nth=1, endindex=None): + """ + Finds the index of the nth occurence of a searchstr in the text going + backwards from a given endindex. + """ + if endindex is None: + endindex = len(text) + + end = text.rfind(searchstr, 0, endindex) + + if end == -1: + return 0 + + for i in range(nth-1): + rfind_index = text.rfind(searchstr, 0, end) + if rfind_index == -1: + return 0 + else: + end = rfind_index + + return end + + +def keywords_in_context(text, keywords, max_words=5, sep="...", cut_sentences=True): + """ Returns the relevant context around keywords in a larger text. + Arguments + ---------- + text : str + Text which should be summerized around keywords. + keywords : list of str + Keywords whose context we want to extract out of the text. + max_words : int + Maximum number of words before und after a keyword if no sentence + beginning or ending occurs and cut_sentences is set. + sep : str + String wich represents skipped portions of the text in the result. + cut_sentences : bool + Set if the context around a keyword is cut at the beginning or end of + a sentence + Returns + ------- + Summarised text containing the keywords in context as string. + """ + indices_lst = [] + for k in keywords: + start = text.find(k) + while not start == -1: + indices_lst.append((k, start)) + start = text.find(k, start+len(k)) + + result_indices = set() + for index_tpl in indices_lst: + keyword, index = index_tpl + start = rfind_nth_occurrence(text, " ", nth=max_words+1, endindex=index) + if not start == 0: + start += 1 # +1 to Remove the first " " + end = find_nth_occurrence(text, " ", nth=max_words+1, startindex=index+len(keyword)) + if end == len(text)-1: + end += 1 + indices_of_text = set(range(start, end)) + if cut_sentences: + sentence_indices, _ = cut_to_sentence(text, keyword, index) + indices_of_text.intersection_update(set(sentence_indices)) + for i in indices_of_text: + result_indices.add(i) + + result_indices = list(result_indices) + result_indices.sort() + + result = "" + i_before = -1 + for _i, i in enumerate(result_indices): + if not (i-1) == i_before: + result += " " + sep + " " + text[i] + i_before = i + else: + result += text[i] + i_before = i + + # If the last word is not the end of the text add the sperator. + if _i == len(result_indices)-1: + if not i == len(text)-1: + result += " " + sep + + return result + + +def find_and_replace(text, find_str, replacement_str): + """ Find and replace a find_str with a replacement_str in text. """ + start = text.find(find_str) + offset = 0 + while start != -1: + # update the index compatible to the whole text + start = start + offset + + # replace (cut the original word out and insert the replacement) + text = text[:start] + replacement_str + text[start+len(find_str):] + + offset = start + len(replacement_str) + start = text[offset:].find(find_str) + + return text + + +if __name__ == "__main__": + """ + Text = Sentence which needs to be shrinked + Keyword = Searched word + """ + TEXTs = [ + 'In 222 BC, the Romans besieged Acerrae, an Insubre fortification on the right bank of the River Adda between Cremona and Laus Pompeia (Lodi Vecchio).', + 'A spokesman for the bank said "We will be compensating customers who did not receive full services from Affinion, and providing our apology."', + 'One of the first fully functional direct banks in the United States was the Security First Network Bank (SFNB), which was launched in October 1995', + 'At the same time, internet-only banks or "virtual banks" appeared.', + 'Arriving at the Douro, Wellesley was unable to cross the river because Soult\'s army had either destroyed or moved all the boats to the northern bank.'] + KEYWORDS = ['bank'] + for TEXT in TEXTs: + result_text = keywords_in_context(TEXT, KEYWORDS) + # Highlight Keywords + for k in KEYWORDS: + result_text = find_and_replace(result_text, k, "\x1b[34m"+k+"\x1b[0m") + print(result_text) \ No newline at end of file From d4218003a5d842c2a38c9806187dbe00f7c9714e Mon Sep 17 00:00:00 2001 From: Zhen Guo Date: Wed, 28 Apr 2021 08:40:45 +0800 Subject: [PATCH 15/23] KWIC-highlight all selected word --- src/static/js/main.js | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/src/static/js/main.js b/src/static/js/main.js index aefcc54..0c1738d 100644 --- a/src/static/js/main.js +++ b/src/static/js/main.js @@ -5,6 +5,16 @@ function init(){ $('#clusterDiv1').hide(); } +// find all indexes of selected word(substr) in sentence(str) +function searchSubStr(str,subStr){ + var positions = new Array(); + var pos = str.indexOf(subStr); + while(pos>-1){ + positions.push(pos); + pos = str.indexOf(subStr,pos+1); + } + return positions; +} function findByTag(selWord, tag, rowResult, wordResultKWIC){ /* @@ -21,16 +31,21 @@ function findByTag(selWord, tag, rowResult, wordResultKWIC){ $('#clusterDiv1').show(); } for(i=1; i" - + "

    " + part1 + "" + part2 + "" + part3 + "

    " + - ""+i+""+ - ""; + var allIndexes = searchSubStr(wordResultKWIC[i-1].toLowerCase(), selWord.toLowerCase()); + //var wordIndex = wordResultKWIC[i-1].toLowerCase().indexOf(selWord.toLowerCase()); + var ulcontent = "
  • "; + if(allIndexes.length > 0){ + wordResultKWIC[i-1] + } + for(let i=0; i < allIndexes.length; i++){ + var part1 = wordResultKWIC[i-1].slice(0,i) + var part2 = wordResultKWIC[i-1].slice(i, i + selWord.length) + var part3 = wordResultKWIC[i-1].slice(i + selWord.length, wordResultKWIC[i-1].length) + var j = i + selWord.length + var ulcontent = ulcontent + part1 + "" + part2 + "" + part3; + } + ulcontent += "

    "+i+"" + "
  • "; ulControl.append(ulcontent); - } } @@ -40,3 +55,5 @@ function findByTag(selWord, tag, rowResult, wordResultKWIC){ + + From a9d64b52aebd7c5fa8b282b3e9dc35ec6ff13e6d Mon Sep 17 00:00:00 2001 From: Zhen Guo Date: Thu, 29 Apr 2021 00:07:52 +0800 Subject: [PATCH 16/23] KWIC for selected word when appearing mulitple times --- src/static/js/main.js | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/static/js/main.js b/src/static/js/main.js index 0c1738d..48e16f7 100644 --- a/src/static/js/main.js +++ b/src/static/js/main.js @@ -35,17 +35,19 @@ function findByTag(selWord, tag, rowResult, wordResultKWIC){ //var wordIndex = wordResultKWIC[i-1].toLowerCase().indexOf(selWord.toLowerCase()); var ulcontent = "
  • "; if(allIndexes.length > 0){ - wordResultKWIC[i-1] - } - for(let i=0; i < allIndexes.length; i++){ - var part1 = wordResultKWIC[i-1].slice(0,i) - var part2 = wordResultKWIC[i-1].slice(i, i + selWord.length) - var part3 = wordResultKWIC[i-1].slice(i + selWord.length, wordResultKWIC[i-1].length) - var j = i + selWord.length - var ulcontent = ulcontent + part1 + "" + part2 + "" + part3; + var startIndex = 0; + for(let j=0; j < allIndexes.length; j++){ + var part1 = wordResultKWIC[i-1].slice(startIndex,allIndexes[j]) + var part2 = wordResultKWIC[i-1].slice(allIndexes[j], allIndexes[j] + selWord.length) + startIndex = allIndexes[j] + selWord.length + ulcontent = ulcontent + part1 + "" + part2 + ""; + } + if(startIndex < wordResultKWIC[i-1].length){ + ulcontent = ulcontent + wordResultKWIC[i-1].slice(startIndex, wordResultKWIC[i-1].length) + } + ulcontent += "

    "+i+"" + "
  • "; + ulControl.append(ulcontent); } - ulcontent += "

    "+i+"" + ""; - ulControl.append(ulcontent); } } From 789d4b62683708af332c84415ec1fe10317fb790 Mon Sep 17 00:00:00 2001 From: Zhen Guo Date: Fri, 30 Apr 2021 01:38:38 +0800 Subject: [PATCH 17/23] KWIC3 --- src/service.py | 11 ++++++-- src/static/js/main.js | 61 +++++++++++++++++++++++++++++-------------- src/train/KWIC.py | 17 ++++++++++-- src/util.py | 28 ++++++++++++++++++++ 4 files changed, 94 insertions(+), 23 deletions(-) diff --git a/src/service.py b/src/service.py index 9fd1a50..3e3ebc6 100644 --- a/src/service.py +++ b/src/service.py @@ -24,6 +24,8 @@ from src.train.cluster import Evaluator import re from src.train.KWIC import keywords_in_context, find_and_replace +from src.util import get_keyword_window, kwic_show + try: store_data = StoreData(db_config['user'], @@ -202,10 +204,15 @@ def kwic(self, selword: str, sentence_with_pos: list): sents_origin = sentTuple[2] for sent in sents_origin: - result_text = keywords_in_context(sent, [selword]) + # result_text = keywords_in_context(sent, [selword]) # Highlight Keywords # result_text = find_and_replace(result_text, selword, "\x1b[34m" + selword + "\x1b[0m") - sents_kwic.append(result_text) + # sents_kwic.append(result_text) + window_words = get_keyword_window(selword, sent.split(" ")) + result_text = kwic_show(window_words, selword) + if result_text: + print(result_text) + sents_kwic.append(result_text) return result diff --git a/src/static/js/main.js b/src/static/js/main.js index 48e16f7..92f2644 100644 --- a/src/static/js/main.js +++ b/src/static/js/main.js @@ -16,6 +16,42 @@ function searchSubStr(str,subStr){ return positions; } +//function findByTag(selWord, tag, rowResult, wordResultKWIC){ +// /* +// selWord: selected word +// rowResult: sentences +// tag: POS +// +// */ +// $("#tagInput1").attr("value",tag); +// var ulControl = $('#sentencesGroup'); +// ulControl.find("li").remove(); +// if(wordResultKWIC.length > 0){ +// $('#labelId1').show(); +// $('#clusterDiv1').show(); +// } +// for(i=1; i

    "; +// if(allIndexes.length > 0){ +// var startIndex = 0; +// for(let j=0; j < allIndexes.length; j++){ +// var part1 = wordResultKWIC[i-1].slice(startIndex,allIndexes[j]) +// var part2 = wordResultKWIC[i-1].slice(allIndexes[j], allIndexes[j] + selWord.length) +// startIndex = allIndexes[j] + selWord.length +// ulcontent = ulcontent + part1 + "" + part2 + ""; +// } +// if(startIndex < wordResultKWIC[i-1].length){ +// ulcontent = ulcontent + wordResultKWIC[i-1].slice(startIndex, wordResultKWIC[i-1].length) +// } +// ulcontent += "

    "+i+"" + ""; +// ulControl.append(ulcontent); +// } +// } +// +//} + function findByTag(selWord, tag, rowResult, wordResultKWIC){ /* selWord: selected word @@ -30,26 +66,15 @@ function findByTag(selWord, tag, rowResult, wordResultKWIC){ $('#labelId1').show(); $('#clusterDiv1').show(); } + + outstr = '
    '
         for(i=1; i 

    "; - if(allIndexes.length > 0){ - var startIndex = 0; - for(let j=0; j < allIndexes.length; j++){ - var part1 = wordResultKWIC[i-1].slice(startIndex,allIndexes[j]) - var part2 = wordResultKWIC[i-1].slice(allIndexes[j], allIndexes[j] + selWord.length) - startIndex = allIndexes[j] + selWord.length - ulcontent = ulcontent + part1 + "" + part2 + ""; - } - if(startIndex < wordResultKWIC[i-1].length){ - ulcontent = ulcontent + wordResultKWIC[i-1].slice(startIndex, wordResultKWIC[i-1].length) - } - ulcontent += "

    "+i+"" + ""; - ulControl.append(ulcontent); - } + outstr += wordResultKWIC[i-1] + outstr += '
    ' } + outstr += '
    ' + ulControl.append(outstr); } @@ -57,5 +82,3 @@ function findByTag(selWord, tag, rowResult, wordResultKWIC){ - - diff --git a/src/train/KWIC.py b/src/train/KWIC.py index 0e94c67..82ae5ab 100644 --- a/src/train/KWIC.py +++ b/src/train/KWIC.py @@ -231,6 +231,7 @@ def find_and_replace(text, find_str, replacement_str): # replace (cut the original word out and insert the replacement) text = text[:start] + replacement_str + text[start+len(find_str):] + prettyPrintKWIC(text) offset = start + len(replacement_str) start = text[offset:].find(find_str) @@ -238,6 +239,18 @@ def find_and_replace(text, find_str, replacement_str): return text +def prettyPrintKWIC(kwic): + n = len(kwic) + keyindex = n // 2 + width = 1 + + outstring = ' '.join(kwic[:keyindex]).rjust(width*keyindex) + outstring += str(kwic[keyindex]).center(len(kwic[keyindex])+6) + outstring += ' '.join(kwic[(keyindex+1):]) + # print(outstring) + return outstring + + if __name__ == "__main__": """ Text = Sentence which needs to be shrinked @@ -251,8 +264,8 @@ def find_and_replace(text, find_str, replacement_str): 'Arriving at the Douro, Wellesley was unable to cross the river because Soult\'s army had either destroyed or moved all the boats to the northern bank.'] KEYWORDS = ['bank'] for TEXT in TEXTs: - result_text = keywords_in_context(TEXT, KEYWORDS) + result_text = keywords_in_context(TEXT, KEYWORDS, max_words=3, sep="") # Highlight Keywords for k in KEYWORDS: - result_text = find_and_replace(result_text, k, "\x1b[34m"+k+"\x1b[0m") + result_text = find_and_replace(result_text, k, k) print(result_text) \ No newline at end of file diff --git a/src/util.py b/src/util.py index 79eeec8..6944328 100644 --- a/src/util.py +++ b/src/util.py @@ -90,3 +90,31 @@ def get_keyword_window(sel_word: str, words_of_sentence: List, length=5) -> List return words_of_sentence[index - length // 2: index + length // 2 + 1] if length % 2 \ else words_of_sentence[index - length // 2 + 1: index + length // 2 + 1] + + +def kwic_show(words_of_sentence, sel_word, sum_sent_length=60, key_word_space=1): + sent = ' '.join(words_of_sentence) + key_index = sent.lower().index(sel_word.lower()) + if key_index != -1: + pre_kwic = sent[:key_index].rjust(sum_sent_length//2) + key_kwic = key_word_space*' ' + sel_word + key_word_space*' ' + post_kwic = sent[key_index+len(sel_word):] + sel_word_kwic = pre_kwic + key_kwic + post_kwic + return sel_word_kwic + return None + + +if __name__ == "__main__": + """ + Text = Sentence which needs to be shrinked + Keyword = Searched word + """ + texts = [ + 'In 222 BC, the Romans besieged Acerrae, an Insubre fortification on the right bank of the River Adda between Cremona and Laus Pompeia (Lodi Vecchio).', + 'A spokesman for the bank said "We will be compensating customers who did not receive full services from Affinion, and providing our apology."', + 'One of the first fully functional direct banks in the United States was the Security First Network bank (SFNB), which was launched in October 1995', + 'At the same time, internet-only banks or "virtual banks" appeared.', + 'Arriving at the Douro, Wellesley was unable to cross the river because Soult\'s army had either destroyed or moved all the boats to the northern bank.'] + for text in texts: + result = get_keyword_window('bank', text.split(' ')) + kwic_show(result, 'bank') \ No newline at end of file From 164a6ec049744a9e9f7c6773550fe571d7f25388 Mon Sep 17 00:00:00 2001 From: Zhen Guo Date: Mon, 10 May 2021 08:22:04 +0800 Subject: [PATCH 18/23] udp-kwic2 --- src/app.py | 2 +- src/util.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/app.py b/src/app.py index f4e8434..5c11cfa 100644 --- a/src/app.py +++ b/src/app.py @@ -104,4 +104,4 @@ def cluster(): if __name__ == '__main__': - app.run(port=3000, debug=True) + app.run(port=3000, host='0.0.0.0') diff --git a/src/util.py b/src/util.py index 6944328..004cff7 100644 --- a/src/util.py +++ b/src/util.py @@ -14,8 +14,8 @@ # database config # cofig for local database db_config = {'user': 'root', - 'password': 'root@123', - 'db_host': 'localhost', + 'password': 'LhxGz102231', + 'db_host': '192.144.171.233', 'db_name': 'psd_project'} # language and corresponding file path of corpus From 4e9618df398bcb12060506f305e6dd5f1c790920 Mon Sep 17 00:00:00 2001 From: W J Evans IV <70174172+hvkone@users.noreply.github.com> Date: Mon, 10 May 2021 02:20:30 -0500 Subject: [PATCH 19/23] Add or update the Azure App Service build and deployment workflow config --- .../workflows/release-v1.0_psdwordfinder.yml | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 .github/workflows/release-v1.0_psdwordfinder.yml diff --git a/.github/workflows/release-v1.0_psdwordfinder.yml b/.github/workflows/release-v1.0_psdwordfinder.yml new file mode 100644 index 0000000..200ebed --- /dev/null +++ b/.github/workflows/release-v1.0_psdwordfinder.yml @@ -0,0 +1,42 @@ +# Docs for the Azure Web Apps Deploy action: https://go.microsoft.com/fwlink/?linkid=2134798 +# More GitHub Actions for Azure: https://go.microsoft.com/fwlink/?linkid=2135048 + +name: Azure App Service - psdwordfinder(Production), Build and deploy Python app + +on: + push: + branches: + - release-v1.0 + +jobs: + build-and-deploy: + runs-on: windows-latest + + steps: + # checkout the repo + - name: 'Checkout Github Action' + uses: actions/checkout@master + + + - name: Set up Python version + uses: actions/setup-python@v1 + with: + python-version: '3.6' + + - name: Install Python dependencies + run: | + python -m venv env + .\env\Scripts\activate + pip install -r requirements.txt + + - name: Zip the application files + run: Compress-Archive .\* app.zip + + - name: Run Azure webapp deploy action using publish profile credentials + uses: azure/webapps-deploy@v2 + with: + app-name: psdwordfinder + slot-name: Production + publish-profile: ${{ secrets.AZUREAPPSERVICE_PUBLISHPROFILE_D5D80263DF9F47EDAABE89081ACFE5AF }} + package: '.\app.zip' + From c6eb2d22d675778eaed8a1c6de03589a7757cb22 Mon Sep 17 00:00:00 2001 From: W J Evans IV <70174172+hvkone@users.noreply.github.com> Date: Mon, 10 May 2021 02:28:47 -0500 Subject: [PATCH 20/23] Remove the Azure App Service build and deployment workflow config --- .../workflows/release-v1.0_psdwordfinder.yml | 42 ------------------- 1 file changed, 42 deletions(-) delete mode 100644 .github/workflows/release-v1.0_psdwordfinder.yml diff --git a/.github/workflows/release-v1.0_psdwordfinder.yml b/.github/workflows/release-v1.0_psdwordfinder.yml deleted file mode 100644 index 200ebed..0000000 --- a/.github/workflows/release-v1.0_psdwordfinder.yml +++ /dev/null @@ -1,42 +0,0 @@ -# Docs for the Azure Web Apps Deploy action: https://go.microsoft.com/fwlink/?linkid=2134798 -# More GitHub Actions for Azure: https://go.microsoft.com/fwlink/?linkid=2135048 - -name: Azure App Service - psdwordfinder(Production), Build and deploy Python app - -on: - push: - branches: - - release-v1.0 - -jobs: - build-and-deploy: - runs-on: windows-latest - - steps: - # checkout the repo - - name: 'Checkout Github Action' - uses: actions/checkout@master - - - - name: Set up Python version - uses: actions/setup-python@v1 - with: - python-version: '3.6' - - - name: Install Python dependencies - run: | - python -m venv env - .\env\Scripts\activate - pip install -r requirements.txt - - - name: Zip the application files - run: Compress-Archive .\* app.zip - - - name: Run Azure webapp deploy action using publish profile credentials - uses: azure/webapps-deploy@v2 - with: - app-name: psdwordfinder - slot-name: Production - publish-profile: ${{ secrets.AZUREAPPSERVICE_PUBLISHPROFILE_D5D80263DF9F47EDAABE89081ACFE5AF }} - package: '.\app.zip' - From e0d354c43467c780f0dcb2329eb918319f2a706e Mon Sep 17 00:00:00 2001 From: W J Evans IV <70174172+hvkone@users.noreply.github.com> Date: Mon, 10 May 2021 02:31:42 -0500 Subject: [PATCH 21/23] Add or update the Azure App Service build and deployment workflow config --- .../workflows/release-v1.0_psdwordfinder.yml | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 .github/workflows/release-v1.0_psdwordfinder.yml diff --git a/.github/workflows/release-v1.0_psdwordfinder.yml b/.github/workflows/release-v1.0_psdwordfinder.yml new file mode 100644 index 0000000..efd482a --- /dev/null +++ b/.github/workflows/release-v1.0_psdwordfinder.yml @@ -0,0 +1,38 @@ +# Docs for the Azure Web Apps Deploy action: https://go.microsoft.com/fwlink/?linkid=2134798 +# More GitHub Actions for Azure: https://go.microsoft.com/fwlink/?linkid=2135048 + +name: Azure App Service - psdwordfinder(Production), Build and deploy Python app + +on: + push: + branches: + - release-v1.0 + +jobs: + build-and-deploy: + runs-on: ubuntu-latest + + steps: + # checkout the repo + - name: 'Checkout Github Action' + uses: actions/checkout@master + + + - name: Set up Python version + uses: actions/setup-python@v1 + with: + python-version: '3.8' + + - name: Build using AppService-Build + uses: azure/appservice-build@v2 + with: + platform: python + platform-version: '3.8' + + - name: Run Azure webapp deploy action using publish profile credentials + uses: azure/webapps-deploy@v2 + with: + app-name: psdwordfinder + slot-name: Production + publish-profile: ${{ secrets.AZUREAPPSERVICE_PUBLISHPROFILE_BF7EE335A4184D289264B99B2441A8C1 }} + From b5b959f6bc1673ebad243fb4fc2a65a183bb4fb7 Mon Sep 17 00:00:00 2001 From: W J Evans IV <70174172+hvkone@users.noreply.github.com> Date: Mon, 10 May 2021 09:23:33 -0400 Subject: [PATCH 22/23] Remove the Azure App Service build and deployment workflow config --- .../workflows/release-v1.0_psdwordfinder.yml | 38 ------------------- 1 file changed, 38 deletions(-) delete mode 100644 .github/workflows/release-v1.0_psdwordfinder.yml diff --git a/.github/workflows/release-v1.0_psdwordfinder.yml b/.github/workflows/release-v1.0_psdwordfinder.yml deleted file mode 100644 index efd482a..0000000 --- a/.github/workflows/release-v1.0_psdwordfinder.yml +++ /dev/null @@ -1,38 +0,0 @@ -# Docs for the Azure Web Apps Deploy action: https://go.microsoft.com/fwlink/?linkid=2134798 -# More GitHub Actions for Azure: https://go.microsoft.com/fwlink/?linkid=2135048 - -name: Azure App Service - psdwordfinder(Production), Build and deploy Python app - -on: - push: - branches: - - release-v1.0 - -jobs: - build-and-deploy: - runs-on: ubuntu-latest - - steps: - # checkout the repo - - name: 'Checkout Github Action' - uses: actions/checkout@master - - - - name: Set up Python version - uses: actions/setup-python@v1 - with: - python-version: '3.8' - - - name: Build using AppService-Build - uses: azure/appservice-build@v2 - with: - platform: python - platform-version: '3.8' - - - name: Run Azure webapp deploy action using publish profile credentials - uses: azure/webapps-deploy@v2 - with: - app-name: psdwordfinder - slot-name: Production - publish-profile: ${{ secrets.AZUREAPPSERVICE_PUBLISHPROFILE_BF7EE335A4184D289264B99B2441A8C1 }} - From b0cd260dda7e291e14bc9c2d784d66f2e3dd5c3d Mon Sep 17 00:00:00 2001 From: W J Evans IV <70174172+hvkone@users.noreply.github.com> Date: Mon, 10 May 2021 13:22:48 -0500 Subject: [PATCH 23/23] Add or update the App Service deployment workflow configuration from Azure Portal. --- .../workflows/release-v1.0_psdwordfinder.yml | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 .github/workflows/release-v1.0_psdwordfinder.yml diff --git a/.github/workflows/release-v1.0_psdwordfinder.yml b/.github/workflows/release-v1.0_psdwordfinder.yml new file mode 100644 index 0000000..d46d59c --- /dev/null +++ b/.github/workflows/release-v1.0_psdwordfinder.yml @@ -0,0 +1,62 @@ +# Docs for the Azure Web Apps Deploy action: https://github.com/Azure/webapps-deploy +# More GitHub Actions for Azure: https://github.com/Azure/actions +# More info on Python, GitHub Actions, and Azure App Service: https://aka.ms/python-webapps-actions + +name: Build and deploy Python app to Azure Web App - psdwordfinder + +on: + push: + branches: + - release-v1.0 + workflow_dispatch: + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + + - name: Set up Python version + uses: actions/setup-python@v1 + with: + python-version: '3.8' + + - name: Create and start virtual environment + run: | + python -m venv venv + source venv/bin/activate + + - name: Install dependencies + run: pip install -r requirements.txt + + # Optional: Add step to run tests here (PyTest, Django test suites, etc.) + + - name: Upload artifact for deployment jobs + uses: actions/upload-artifact@v2 + with: + name: python-app + path: | + . + !venv/ + + deploy: + runs-on: ubuntu-latest + needs: build + environment: + name: 'production' + url: ${{ steps.deploy-to-webapp.outputs.webapp-url }} + + steps: + - name: Download artifact from build job + uses: actions/download-artifact@v2 + with: + name: python-app + path: . + + - name: 'Deploy to Azure Web App' + uses: azure/webapps-deploy@v2 + with: + app-name: 'psdwordfinder' + slot-name: 'production' + publish-profile: ${{ secrets.AzureAppService_PublishProfile_fba125a4de7c454cbe8f4c98c4017480 }} \ No newline at end of file