hvkone · harisalam · Apr 23, 2021 · Apr 23, 2021 · Apr 23, 2021 · Apr 23, 2021
diff --git a/.github/workflows/release-v1.0_psdwordfinder.yml b/.github/workflows/release-v1.0_psdwordfinder.yml
@@ -0,0 +1,62 @@
+# Docs for the Azure Web Apps Deploy action: https://github.com/Azure/webapps-deploy
+# More GitHub Actions for Azure: https://github.com/Azure/actions
+# More info on Python, GitHub Actions, and Azure App Service: https://aka.ms/python-webapps-actions
+
+name: Build and deploy Python app to Azure Web App - psdwordfinder
+
+on:
+  push:
+    branches:
+      - release-v1.0
+  workflow_dispatch:
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Set up Python version
+      uses: actions/setup-python@v1
+      with:
+        python-version: '3.8'
+
+    # Each `run` step starts in a new shell, so an activated virtual environment
+    # does not carry over to later steps; create, activate and install in one step.
+    - name: Create virtual environment and install dependencies
+      run: |
+        python -m venv venv
+        source venv/bin/activate
+        pip install -r requirements.txt
+
+    # Optional: Add step to run tests here (PyTest, Django test suites, etc.)
+
+    - name: Upload artifact for deployment jobs
+      uses: actions/upload-artifact@v2
+      with:
+        name: python-app
+        path: |
+          .
+          !venv/
+
+  deploy:
+    runs-on: ubuntu-latest
+    needs: build
+    environment:
+      name: 'production'
+      url: ${{ steps.deploy-to-webapp.outputs.webapp-url }}
+
+    steps:
+      - name: Download artifact from build job
+        uses: actions/download-artifact@v2
+        with:
+          name: python-app
+          path: .
+
+      - name: 'Deploy to Azure Web App'
+        # The job's `environment.url` reads steps.deploy-to-webapp.outputs.webapp-url,
+        # which only resolves when this step carries the matching id.
+        id: deploy-to-webapp
+        uses: azure/webapps-deploy@v2
+        with:
+          app-name: 'psdwordfinder'
+          slot-name: 'production'
+          publish-profile: ${{ secrets.AzureAppService_PublishProfile_fba125a4de7c454cbe8f4c98c4017480 }}
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,5 @@
 .idea
 input
 __pycache__
+psd_project.sql
+log/
diff --git a/README.md b/README.md
@@ -222,8 +222,7 @@ Right now I found our repository has a problem considerable us to pay enough att
 
 such as file path of train corpus, the file path of cluster model, the file path of database config. These file paths cannot be pushed to our base repository! 
 
-We should think of a nice way to solve this issue. And I have an idea. We should maintain a common file relative path and all data files and config data should be put inside it. Also, there's another important thing to remember: don't
-push these corpus and pre-train models to our base repository. We should maintain a common remote disk to store and then open and share a link to provide everyone in our group to use.
+We should find a clean way to solve this issue, and I have an idea: maintain a common relative path and put all data files and config data inside it. Also, there's another important thing to remember: don't push the corpora and pre-trained models to our base repository. Instead, we should store them on a shared remote disk and share a link so that everyone in our group can use them.
 
 I have created a file named input, there are three files inside it: corpus, udpipemodel, and word2vecmodel. All files in them are hosted at 
 
@@ -233,10 +232,36 @@ password: td3e
 downloading them and put them on root directory of wordfiner folder
 
 
-### sprint 5
+### Features
 
-1、database: we should build a remote DB @Willie
-2、word2vec: two methods of doing that @Zhen 
-3、we should label every sentence and show all sentences with a label to the cluster web interfaces @all
+Beta version supports features:
 
+1. Supports queries in 10+ languages
+2. Select a language, enter a word, and see the parts of speech that the word can take
+3. Click a part of speech of the looked-up word to show all the corresponding examples
+4. Examples are shown in KWIC (keyword in context) format
+5. Supports choosing the number of clusters
+6. Click a cluster sentence to get the examples containing the word
+7. Showing the examples for all words is supported
 
+
+
+Update features:
+
+1. KWIC, in the middle of the line
+
+2. Currently only part of each sentence is shown; it would be better to show the whole sentence when it is clicked.
+
+   <a href="">a point on the bank hidden by brush where </a>
+
+3. In the cluster web interface, we should group the sentences by cluster label and sort them.
+
+4. .gitignore files 
+
+5. French clustering 3: 
+
+   ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)
+
+   Chinese
+
+6. There are bugs in the cluster function.
diff --git a/src/app.py b/src/app.py
@@ -5,14 +5,15 @@
 from src.train.result_model import TResult
 from src.train.store import StoreData
 from src.util import language_dict, language_list, db_config, word2vec_language
-from src.service import AppService
+from src.service import AppService, AppContext
 from flask import Flask, render_template, request, redirect, url_for, flash
 
-
 app = Flask(__name__)
+# NOTE(review): this signing key is hard-coded in source control. Flask uses
+# secret_key to sign session data (which flash() relies on), so it should be
+# loaded from deployment configuration instead -- TODO confirm and rotate.
+app.secret_key = b'_5#y2L"F4Q8z\n\xec]/'
 
 # TODO: need to change with the selection different language
-appService = AppService()
+app_service = AppService()
+app_context = AppContext
 
 
 @app.route('/')
@@ -40,12 +41,16 @@ def find():
         language_id = request.form['sellanguage']
         sel_word = request.form['selword']
         language_name = language_dict[language_id]
-        if not appService.udt_pre_model:
-            appService.config_udpipe(language_name)
-        appService.find_service(language_name, sel_word)
+        app_context.sel_word = sel_word
+        app_context.sel_language = language_name
+        if not app_service.udt_pre_model:
+            app_service.config_udpipe(language_name)
+        app_service.find_service(language_name, sel_word)
+        sel_result_kwic = app_service.kwic(sel_word, app_service.sel_result)
+        app_context.sel_result_kwic = sel_result_kwic
     return render_template('result.html', input_data={"language_name": language_name,
                                                       "sel_word": sel_word,
-                                                      "sel_result": appService.sel_result})
+                                                      "sel_result": sel_result_kwic})
 
 
 @app.route('/find2', methods=['POST'])
@@ -54,12 +59,16 @@ def find2():
     if request.method == 'POST':
         language_name = request.form['sellanguage']
         sel_word = request.form['selword']
-        if not appService.udt_pre_model:
-            appService.config_udpipe(language_name)
-        appService.find_service(language_name, sel_word)
+        app_context.sel_word = sel_word
+        app_context.sel_language = language_name
+        if not app_service.udt_pre_model:
+            app_service.config_udpipe(language_name)
+        app_service.find_service(language_name, sel_word)
+        sel_result_kwic = app_service.kwic(sel_word, app_service.sel_result)
+        app_context.sel_result_kwic = sel_result_kwic
     return render_template('result.html', input_data={"language_name": language_name,
                                                       "sel_word": sel_word,
-                                                      "sel_result": appService.sel_result})
+                                                      "sel_result": sel_result_kwic})
 
 
 @app.route('/cluster', methods=['POST'])
@@ -75,17 +84,24 @@ def cluster():
         language_name = request.form['languageName']
         cluster_number = request.form['clusterNumber']
         sel_tag = request.form['tagInput1']
-        cluster_input_sentence = appService.pos_dict[sel_tag]
-        if not appService.udt_pre_model:
-            appService.config_udpipe(language_name)
+        # TODO: clicking the button of return previous page then clicking cluster button causes a bug
+        cluster_input_sentence = app_service.pos_dict[sel_tag]
+        if not app_service.udt_pre_model:
+            app_service.config_udpipe(language_name)
         cluster_model_file = word2vec_language[language_name]
-        cluster_result, rec_cluster_result = appService.cluster_sentences(
+        cluster_result, rec_cluster_result, sentences, best_labels = app_service.cluster_sentences(
             language_name, cluster_model_file, cluster_input_sentence, cluster_number)
+        if not cluster_result:
+            flash("invalid input to cluster number")
+            return render_template('result.html', input_data={"language_name": language_name,
+                                                              "sel_word": app_context.sel_word,
+                                                              "sel_result": app_context.sel_result_kwic})
         return render_template('cluster.html',
                                cluster_number=cluster_number,
                                cluster_result=cluster_result,
-                               rec_cluster_result=rec_cluster_result)
+                               rec_cluster_result=rec_cluster_result,
+                               sentences_with_labels=zip(sentences, best_labels))
 
 
 if __name__ == '__main__':
-    app.run(port=3000, debug=True)
+    app.run(port=3000, host='0.0.0.0')
diff --git a/src/service.py b/src/service.py
@@ -13,10 +13,19 @@
 
 from src.train.result_model import TResult
 from src.train.store import StoreData
-from src.util import *
+from src.util import (language_dict,
+                      language_list,
+                      db_config,
+                      corpus_language,
+                      udpipe_language,
+                      get_keyword_window)
 from src.train.train_cluster import load_model
 from src.train.train_model import UdpipeTrain
 from src.train.cluster import Evaluator
+import re
+from src.train.KWIC import keywords_in_context, find_and_replace
+from src.util import get_keyword_window, kwic_show
+
 
 try:
     store_data = StoreData(db_config['user'],
@@ -110,14 +119,14 @@ def cluster_sentences(self, language_name: str, save_path: str, sentences: List[
         n_clusters = int(n_clusters)
         if n_clusters <= 0:
             print("Parameter is Invalid")
-            return
+            return [None]*4
         if n_clusters > len(sentences):
             # TODO add log
             print('number of cluster bigger than sentences count')
-            return
+            return [None]*4
         if len(self.sel_result) <= 0:
             print('no sentence')
-            return
+            return [None]*4
         # first loading model
         word2vec_model = load_model(save_path)
         # second geting vectors for one sentence
@@ -176,7 +185,36 @@ def cluster_sentences(self, language_name: str, save_path: str, sentences: List[
         if no_n_input:
             examples = recommend_sentences
 
-        return examples, recommend_sentences
+        return examples, recommend_sentences, sentences, best_labels
+
+    def kwic(self, selword: str, sentence_with_pos: list):
+        """
+        :param: selword
+        :param: sentenceWithPOS
+
+        sentence_with_pos examples:
+        [("NOUN", "bank", ["I go to the bank", "The house lies the right of the river bank"]),
+        ("VERB", "bank", ["I banked in a slot"])
+        """
+        # This is similar to sentenceWithPOS but processed after KWIC
+        result = []
+        for sentTuple in sentence_with_pos:
+            sents_kwic = []
+            result.append((sentTuple[0], sentTuple[1], sentTuple[2], sents_kwic))
+
+            sents_origin = sentTuple[2]
+            for sent in sents_origin:
+                # result_text = keywords_in_context(sent, [selword])
+                # Highlight Keywords
+                # result_text = find_and_replace(result_text, selword, "\x1b[34m" + selword + "\x1b[0m")
+                # sents_kwic.append(result_text)
+                window_words = get_keyword_window(selword, sent.split(" "))
+                result_text = kwic_show(window_words, selword)
+                if result_text:
+                    print(result_text)
+                    sents_kwic.append(result_text)
+
+        return result
 
     def _get_examples(self, sentences: List[str], best_labels, n_clusters: int):
         tmp_labels, examples = [], []
@@ -196,6 +234,12 @@ def _get_examples(self, sentences: List[str], best_labels, n_clusters: int):
         return examples
 
 
+class AppContext(object):
+    """Holder for the state of the most recent search.
+
+    All attributes are declared at class level and start as ``None``.
+    NOTE(review): if callers assign to the class itself rather than to an
+    instance, the values are shared process-wide across requests -- confirm
+    that this is intended.
+    """
+    sel_language = None  # language name of the last query
+    sel_word = None  # word searched for in the last query
+    sel_result_kwic = None  # KWIC-formatted result of the last query
+
+
 if __name__ == "__main__":
     # get word vector for one sentence
     language_name = 'English'

diff --git a/src/static/js/main.js b/src/static/js/main.js
@@ -5,33 +5,76 @@ function init(){
     $('#clusterDiv1').hide();
 }
 
+// find all indexes of selected word(substr) in sentence(str)
+function searchSubStr(str,subStr){
+    var positions = new Array();
+    var pos = str.indexOf(subStr);
+    while(pos>-1){
+        positions.push(pos);
+        pos = str.indexOf(subStr,pos+1);
+    }
+    return positions;
+}
+
+//function findByTag(selWord, tag, rowResult, wordResultKWIC){
+//  /*
+//    selWord: selected word
+//    rowResult: sentences
+//    tag: POS
+//
+//  */
+//    $("#tagInput1").attr("value",tag);
+//    var ulControl = $('#sentencesGroup');
+//    ulControl.find("li").remove();
+//    if(wordResultKWIC.length > 0){
+//        $('#labelId1').show();
+//        $('#clusterDiv1').show();
+//    }
+//    for(i=1; i<wordResultKWIC.length+1; i++){
+//       var allIndexes = searchSubStr(wordResultKWIC[i-1].toLowerCase(), selWord.toLowerCase());
+//       //var wordIndex = wordResultKWIC[i-1].toLowerCase().indexOf(selWord.toLowerCase());
+//       var ulcontent = "<li class=\"list-group-item d-flex justify-content-between align-items-center\"> <p>";
+//       if(allIndexes.length > 0){
+//           var startIndex = 0;
+//           for(let j=0; j < allIndexes.length; j++){
+//               var part1 = wordResultKWIC[i-1].slice(startIndex,allIndexes[j])
+//               var part2 = wordResultKWIC[i-1].slice(allIndexes[j], allIndexes[j] + selWord.length)
+//               startIndex = allIndexes[j] + selWord.length
+//               ulcontent = ulcontent + part1 + "<strong class=\"text-success\">" + part2 + "</strong>";
+//            }
+//            if(startIndex < wordResultKWIC[i-1].length){
+//               ulcontent = ulcontent + wordResultKWIC[i-1].slice(startIndex, wordResultKWIC[i-1].length)
+//            }
+//            ulcontent += "</p> <span class=\"badge badge-primary badge-pill\">"+i+"</span>" + "</li>";
+//            ulControl.append(ulcontent);
+//        }
+//    }
+//
+//}
 
-function findByTag(selWord, tag, rowResult){
+function findByTag(selWord, tag, rowResult, wordResultKWIC){
   /*
     selWord: selected word
     rowResult: sentences
     tag: POS
+
   */
     $("#tagInput1").attr("value",tag);
     var ulControl = $('#sentencesGroup');
     ulControl.find("li").remove();
-    var rowResult1 = rowResult;
-    if(rowResult1.length > 0){
+    if(wordResultKWIC.length > 0){
         $('#labelId1').show();
         $('#clusterDiv1').show();
     }
-    for(i=1; i<rowResult1.length+1; i++){
-       var wordIndex = rowResult1[i-1].toLowerCase().indexOf(selWord.toLowerCase());
-       var part1 = rowResult1[i-1].slice(0,wordIndex)
-       var part2 = rowResult1[i-1].slice(wordIndex, wordIndex + selWord.length+1)
-       var part3 = rowResult1[i-1].slice(wordIndex + selWord.length+1, rowResult1[i-1].length)
-       var ulcontent = "<li class=\"list-group-item d-flex justify-content-between align-items-center\">"
-                      + "<p>" + part1 + "<strong class=\"text-success\">" + part2 + "</strong>" + part3 + "</p>" +
-                      "<span class=\"badge badge-primary badge-pill\">"+i+"</span>"+
-                      "</li>";
-        ulControl.append(ulcontent);
+
+    outstr = '<pre>'
+    for(i=1; i<wordResultKWIC.length+1; i++){
+        outstr += wordResultKWIC[i-1]
+        outstr += '<br />'
     }
 
+    outstr += '</pre>'
+    ulControl.append(outstr);
 }
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,3 +2,5 @@ @@
     .idea
     input
     __pycache__
+    psd_project.sql
+    log/