6 changes: 4 additions & 2 deletions requirements.txt
@@ -1,10 +1,12 @@
requests==2.25.1
nltk==3.5
numpy==1.20.1
numpy~=1.19.2
beautifulsoup4==4.9.3
corpy==0.3.0
Flask==1.1.2
gensim==3.8.3
pymysql==1.0.2
scikit_learn==0.24.1
pysolr-3.9.0
pysolr-3.9.0
mysql~=5.7.24
scikit-learn~=0.24.1
Akhilraj03-Github marked this conversation as resolved.
3 changes: 1 addition & 2 deletions src/app.py
@@ -79,8 +79,7 @@ def cluster():
if not appService.udt_pre_model:
appService.config_udpipe(language_name)
cluster_model_file = word2vec_language[language_name]
cluster_result, rec_cluster_result = appService.cluster_sentences(
language_name, cluster_model_file, cluster_input_sentence, cluster_number)
cluster_result, rec_cluster_result = appService.cluster_sentences(language_name, cluster_model_file,cluster_input_sentence, cluster_number)
return render_template('cluster.html',
cluster_number=cluster_number,
cluster_result=cluster_result,
176 changes: 87 additions & 89 deletions src/databaseClustering.py
@@ -1,4 +1,4 @@
import mysql.connector
import pymysql.connector
from mysql.connector import errorcode
from datetime import datetime
import pandas as pd
@@ -10,115 +10,113 @@
import string



def train_model(language_name, corpus_path, save_path):

model = gensim.models.Word2Vec(sentences=corpus_path,
size=150,
window=8,
min_count=2,
workers=2,
iter=10)
model.save(save_path + language_name)
print('Save succeed')
model = gensim.models.Word2Vec(sentences=corpus_path,
size=150,
window=8,
min_count=2,
workers=2,
iter=10)
model.save(save_path + language_name)
print('Save succeed')


def load_model(save_path) -> gensim.models.Word2Vec:
filename = save_path
model = gensim.models.Word2Vec.load(filename)
print('Loading succeed')
for index, word in enumerate(model.wv.index2word):
if index == 5:
break
vec = ",".join(map(lambda i: str(i), model.wv[word]))
print(f"word #{index}/{len(model.wv.index2word)} is {word}, vec = {vec}")
return model
filename = save_path
model = gensim.models.Word2Vec.load(filename)
print('Loading succeed')
for index, word in enumerate(model.wv.index2word):
if index == 5:
break
vec = ",".join(map(lambda i: str(i), model.wv[word]))
print(f"word #{index}/{len(model.wv.index2word)} is {word}, vec = {vec}")
return model


def database():
db = mysql.connector.connect(
host='localhost',
user='root',
password='root',
database='psd_project'
)
mycursor = db.cursor()
query_info = ("SELECT sentence FROM english_sentences")
mycursor.execute(query_info)
sentences_df = pd.DataFrame(mycursor.fetchall(), columns=['Sentences'])

return sentences_df


db = mysql.connector.connect(
host='localhost',
user='root',
password='root',
database='psd_project'
)
mycursor = db.cursor()
query_info = ("SELECT sentence FROM english_sentences")
mycursor.execute(query_info)
sentences_df= pd.DataFrame(mycursor.fetchall(), columns=['Sentences'])

return sentences_df

def textProcessing(text):
no_stop =[words for words in text.split() if words.lower() not in string.punctuation]
no_stop = [words for words in text.split() if words.lower() not in string.punctuation]
return no_stop

def cluster_sentences(language_name: str, save_path: str, sentences: List[str], n_clusters: int) :

n_clusters = int(n_clusters)
print("clusters are ",n_clusters)
if n_clusters <=0:
print("Parameter is Invalid")
return
if n_clusters > len(sentences):
# TODO add log
print('number of cluster bigger than sentences count')
return
# first loading model
word2vec_model = load_model(save_path)
# second geting vectors for one sentence
sent_vectors = []
default_dimn = 100
# iterator to sentence
for word1 in sentences:
print(word1)
word_vectors = []
for words in word1:
if words in word2vec_model.wv:
word_vectors.append(word2vec_model.wv[words])
else: # not in dict, fill 0
word_vectors.append([0] * default_dimn)

to_array = np.array(word_vectors)
sent_vectors.append(to_array.mean(axis=0).tolist())
kmeans = KMeans(n_clusters=n_clusters,random_state=0).fit(sent_vectors)
labels = kmeans.labels_
tmp_labels,examples = [],[]
for sent,label in zip(sentences,labels):
if label not in tmp_labels:
tmp_labels.append(label)
examples.append(sent)
if len(examples) == n_clusters:
break
# add bottom logic for cluster
if len(examples) < n_clusters:
for sent in sentences:
if sent not in examples:
examples.append(sent)
if len(examples) >= n_clusters:
break

return examples

def cluster_sentences(language_name: str, save_path: str, sentences: List[str], n_clusters: int):
n_clusters = int(n_clusters)
print("clusters are ", n_clusters)
if n_clusters <= 0:
print("Parameter is Invalid")
return
if n_clusters > len(sentences):
# TODO add log
print('number of cluster bigger than sentences count')
return
# first loading model
word2vec_model = load_model(save_path)
# second geting vectors for one sentence
sent_vectors = []
default_dimn = 100
# iterator to sentence
for word1 in sentences:
print(word1)
word_vectors = []
for words in word1:

if words in word2vec_model.wv:
word_vectors.append(word2vec_model.wv[words])
else: # not in dict, fill 0
word_vectors.append([0] * default_dimn)

to_array = np.array(word_vectors)
sent_vectors.append(to_array.mean(axis=0).tolist())
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(sent_vectors)
labels = kmeans.labels_
tmp_labels, examples = [], []
for sent, label in zip(sentences, labels):
if label not in tmp_labels:
tmp_labels.append(label)
examples.append(sent)
if len(examples) == n_clusters:
break
# add bottom logic for cluster
if len(examples) < n_clusters:
for sent in sentences:
if sent not in examples:
examples.append(sent)
if len(examples) >= n_clusters:
break

return examples


a = database()
file_path= r'C:\Users\haris\Desktop\wordFinder\word2vec'
file_path = r'C:\Users\haris\Desktop\wordFinder\word2vec'
file_path = file_path + 'English'
load_model(file_path)
print('All done')

c=a['Sentences'].apply(textProcessing)
c = a['Sentences'].apply(textProcessing)

# get word vector for one sentence
language_name = 'English'
sentences = [
'Tohru shows great loyalty to whoever he stands by, even back to the time when he was an Enforcer for the Dark Hand.',
'The Earth Demon, Dai Gui resembles a large minotaur(with the face of a guardian lion) with great strength.',
'Al Mulock was the great-grandson of Sir William Mulock(1843–1944), the former Canadian Postmaster - General.',
'Though his surviving images are scarce, his importance to the early history of photography in Asia is great.']
'Tohru shows great loyalty to whoever he stands by, even back to the time when he was an Enforcer for the Dark Hand.',
'The Earth Demon, Dai Gui resembles a large minotaur(with the face of a guardian lion) with great strength.',
'Al Mulock was the great-grandson of Sir William Mulock(1843–1944), the former Canadian Postmaster - General.',
'Though his surviving images are scarce, his importance to the early history of photography in Asia is great.']

cluster_result = cluster_sentences(langage_name, file_path,c,3)
cluster_result = cluster_sentences(language_name, file_path, c, 3)
print("two examples sentences: \n")
print(cluster_result)

12 changes: 6 additions & 6 deletions src/service.py
@@ -79,7 +79,7 @@ def database(self):
db_host=db_config['db_host'],
db_name=db_config['db_name'])
self.cursor = self.store_data.db_connect().cursor()
query_info = "SELECT sentence FROM english_sentences"
query_info = "SELECT sentence FROM English_sentences"
self.cursor.execute(query_info)
sentences_df = pd.DataFrame(self.cursor.fetchall(), columns=['Sentences'])
return sentences_df
@@ -90,7 +90,7 @@ def clusteringData(self):
db_host=db_config['db_host'],
db_name=db_config['db_name'])
self.cursor = self.store_data.db_connect().cursor()
query_info = "SELECT sentence FROM english_sentences"
query_info = "SELECT sentence FROM English_sentences"
self.cursor.execute(query_info)
sentences_dataframe = pd.DataFrame(self.cursor.fetchall(), columns=['Sentences'])
return sentences_dataframe
@@ -129,7 +129,7 @@ def cluster_sentences(self, language_name: str, save_path: str, sentences: List[
words = self.udt_pre_model.word_segmentation(sent)
word_vectors = []
# iterator to word
window_words = get_keyword_window(self.sel_result[0][0], words, 5)
window_words = get_keyword_window(self.sel_result[0][0], words, 10)
for word in window_words:
if word in word2vec_model.wv:
word_vectors.append(word2vec_model.wv[word])
@@ -207,9 +207,9 @@ def _get_examples(self, sentences: List[str], best_labels, n_clusters: int):
'Though his surviving images are scarce, his importance to the early history of photography in Asia is great.']

# first loading udpipe to segement word for each sentence
udt_english = UdpipeTrain(language_list[1],
r'C:\Users\haris\Desktop\wordFinder\english-ewt-ud-2.5-191206.udpipe',
r'C:\Users\haris\Desktop\wordFinder\haris.txt')
# udt_english = UdpipeTrain(language_list[1],
# r'C:\Users\haris\Desktop\wordFinder\english-ewt-ud-2.5-191206.udpipe',
# r'C:\Users\haris\Desktop\wordFinder\haris.txt')
Collaborator: This line is commented out — are you keeping it as a safe reference?

cluster_result = AppService().config_udpipe(language_name).cluster_sentences(language_name, sentences, 2)
print("two examples sentences: \n")
1 change: 1 addition & 0 deletions src/templates/cluster.html
@@ -31,6 +31,7 @@ <h3>Well done!</h3> After clustering, you get {{cluster_number}} example
{% for cluster_sentence in cluster_result %}
<li class="list-group-item d-flex justify-content-between align-items-center">
{{cluster_sentence}}
# Add KWIC Functinality
Collaborator: I think someone needs to add the KWIC functionality code here (a rough sketch of one possible approach follows after this hunk).
</li>
{% endfor %}
{% endif %}
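
Following up on the KWIC comment above: here is a rough, hypothetical sketch, not part of this PR, of one way a keyword-in-context string could be built in Python before the sentence is handed to cluster.html. The kwic helper, its keyword argument, and the default window size are all assumptions; the existing get_keyword_window helper used in src/service.py may already cover part of this.

# Hypothetical helper, not part of the PR: builds a keyword-in-context
# (KWIC) string for one clustered sentence. The keyword and window size
# are assumed inputs; adapt to however the app tracks the searched keyword.
from typing import List


def kwic(sentence: str, keyword: str, window: int = 5) -> str:
    """Return `keyword` bracketed, with up to `window` words of context per side."""
    words: List[str] = sentence.split()
    lowered = [w.lower().strip('.,;:!?()') for w in words]
    if keyword.lower() not in lowered:
        return sentence  # keyword not found: fall back to the full sentence
    i = lowered.index(keyword.lower())
    left = words[max(0, i - window):i]
    right = words[i + 1:i + 1 + window]
    return ' '.join(left + ['[' + words[i] + ']'] + right)


# Example:
# kwic('Tohru shows great loyalty to whoever he stands by.', 'great', 3)
# -> 'Tohru shows [great] loyalty to whoever'

The view could then pass the pre-built strings to the template and render them in place of {{cluster_sentence}}.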