Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
20 changes: 10 additions & 10 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
requests==2.25.1
nltk==3.5
numpy~=1.19.2
beautifulsoup4==4.9.3
corpy==0.3.0
Flask==1.1.2
gensim==3.8.3
pymysql==1.0.2
pysolr-3.9.0
mysql~=5.7.24
requests==2.25.1
nltk==3.5
numpy~=1.19.2
beautifulsoup4==4.9.3
corpy==0.3.0
Flask==1.1.2
gensim==3.8.3
pymysql==1.0.2
pysolr==3.9.0
mysql~=5.7.24
scikit-learn~=0.24.1
Comment thread
Akhilraj03-Github marked this conversation as resolved.
16 changes: 3 additions & 13 deletions src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,11 @@
from src.service import AppService
from flask import Flask, render_template, request, redirect, url_for, flash


app = Flask(__name__)

# TODO: this needs to change when a different language is selected
appService = AppService()


@app.route('/')
def index():
"""
Expand All @@ -23,7 +21,6 @@ def index():
"""
return render_template('index.html')


@app.route('/find', methods=['POST'])
def find():
"""
Expand All @@ -45,7 +42,6 @@ def find():
"sel_word": sel_word,
"sel_result": appService.sel_result})


@app.route('/find2', methods=['POST'])
def find2():
language_name, sel_word = None, None
Expand All @@ -59,7 +55,6 @@ def find2():
"sel_word": sel_word,
"sel_result": appService.sel_result})


@app.route('/cluster', methods=['POST'])
def cluster():
"""
Expand All @@ -76,13 +71,8 @@ def cluster():
if not appService.udt_pre_model:
appService.config_udpipe(language_name)
cluster_model_file = word2vec_language[language_name]
cluster_result, rec_cluster_result = appService.cluster_sentences(
language_name, cluster_model_file, cluster_input_sentence, cluster_number)
return render_template('cluster.html',
cluster_number=cluster_number,
cluster_result=cluster_result,
rec_cluster_result=rec_cluster_result)

cluster_result, rec_cluster_result = appService.cluster_sentences(language_name, cluster_model_file,cluster_input_sentence, cluster_number)
return render_template('cluster.html',cluster_number=cluster_number,cluster_result=cluster_result,rec_cluster_result=rec_cluster_result)

if __name__ == '__main__':
app.run(port=3000, debug=True)
app.run(port=3000, debug=True)
170 changes: 93 additions & 77 deletions src/databaseClustering.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import mysql.connector
import pymysql.connector
from mysql.connector import errorcode
from datetime import datetime
import pandas as pd
Expand All @@ -11,32 +11,44 @@
from util import db_config



def train_model(language_name, corpus_path, save_path):
    """Train a Word2Vec model for one language and save it to disk.

    Args:
        language_name: Name of the language. It is appended verbatim to
            ``save_path`` to build the output file name.
        corpus_path: Either the path of a corpus file or an iterable of
            already tokenised sentences (lists of words).
        save_path: Prefix of the output file. No path separator is
            inserted between it and ``language_name``.
    """
    # Keyword names (``size``, ``iter``) follow the gensim 3.x API that
    # requirements.txt pins.
    hyper_params = dict(size=150, window=8, min_count=2, workers=2, iter=10)

    if isinstance(corpus_path, str):
        # Iterating a plain string yields single characters, so a path
        # must not be passed as ``sentences``; ``corpus_file`` reads the
        # file instead (one whitespace-separated sentence per line).
        model = gensim.models.Word2Vec(corpus_file=corpus_path, **hyper_params)
    else:
        model = gensim.models.Word2Vec(sentences=corpus_path, **hyper_params)

    model.save(save_path + language_name)
    print('Save succeed')


def load_model(save_path) -> gensim.models.Word2Vec:
    """Load a saved Word2Vec model and print a short vocabulary preview.

    Args:
        save_path: Path of the model file to load.

    Returns:
        The loaded model.
    """
    model = gensim.models.Word2Vec.load(save_path)
    print('Loading succeed')
    # Preview only the first five vocabulary entries with their vectors.
    for index, word in enumerate(model.wv.index2word[:5]):
        vec = ",".join(str(component) for component in model.wv[word])
        print(f"word #{index}/{len(model.wv.index2word)} is {word}, vec = {vec}")
    return model

def database():

filename = save_path
model = gensim.models.Word2Vec.load(filename)
print('Loading succeed')
for index, word in enumerate(model.wv.index2word):
if index == 5:
break
vec = ",".join(map(lambda i: str(i), model.wv[word]))
print(f"word #{index}/{len(model.wv.index2word)} is {word}, vec = {vec}")
return model


# def database():
# db = mysql.connector.connect(
# host='localhost',
# user='root',
# password='root',
# database='psd_project'
# )
# mycursor = db.cursor()
# query_info = ("SELECT sentence FROM english_sentences")
# mycursor.execute(query_info)
# sentences_df = pd.DataFrame(mycursor.fetchall(), columns=['Sentences'])

# return sentences_df

# Either the code on lines 37-48 or the code on lines 52-63 needs to be present.
db = mysql.connector.connect(
host=db_config['host'],
user=db_config['user'],
Expand All @@ -50,76 +62,80 @@ def database():

return sentences_df


def textProcessing(text):
    """Split ``text`` on whitespace and drop punctuation-only tokens.

    Args:
        text: The sentence to tokenise.

    Returns:
        The tokens of ``text`` in their original order and casing, minus
        every token that consists solely of ``string.punctuation``
        characters.
    """
    punctuation = set(string.punctuation)
    # Test character by character. A substring test against
    # ``string.punctuation`` only matches runs that happen to appear
    # contiguously in that constant, so tokens such as "..." or "--"
    # would slip through.
    return [token for token in text.split()
            if not all(char in punctuation for char in token)]

def cluster_sentences(language_name: str, save_path: str, sentences: List[str], n_clusters: int):
    """Pick example sentences, one per K-Means cluster of sentence vectors.

    Every sentence is embedded as the mean of its word vectors. Words
    missing from the model contribute a zero vector. The embeddings are
    clustered with K-Means, and the first sentence seen for each cluster
    label is returned as that cluster's example.

    Args:
        language_name: Not used by this function.
        save_path: Path handed to ``load_model``.
        sentences: Sentences to cluster. Each item is either a string
            (split on whitespace here) or a sequence of tokens.
        n_clusters: Number of clusters; converted with ``int()``.

    Returns:
        A list of at most ``n_clusters`` items taken from ``sentences``,
        or ``None`` when ``n_clusters`` is not positive or is larger than
        the number of sentences.
    """
    n_clusters = int(n_clusters)
    print("clusters are ", n_clusters)
    if n_clusters <= 0:
        print("Parameter is Invalid")
        return None
    if n_clusters > len(sentences):
        # TODO add log
        print('number of cluster bigger than sentences count')
        return None

    # First, load the model.
    word2vec_model = load_model(save_path)
    vectors = word2vec_model.wv
    # The zero-fill must have the model's own dimensionality; a fixed
    # width that differs from it produces ragged arrays when averaged.
    zero_vector = np.zeros(vectors.vector_size)

    # Second, build one vector per sentence.
    sent_vectors = []
    for sentence in sentences:
        print(sentence)
        # Iterating a raw string would look up single characters.
        tokens = sentence.split() if isinstance(sentence, str) else sentence
        word_vectors = [vectors[token] if token in vectors else zero_vector
                        for token in tokens]
        if word_vectors:
            sent_vectors.append(np.mean(word_vectors, axis=0).tolist())
        else:
            # A sentence without tokens has no mean; use the zero vector
            # rather than feeding NaN to K-Means.
            sent_vectors.append(zero_vector.tolist())

    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(sent_vectors)
    labels = kmeans.labels_

    seen_labels, examples = set(), []
    for sentence, label in zip(sentences, labels):
        if label not in seen_labels:
            seen_labels.add(label)
            examples.append(sentence)
        if len(examples) == n_clusters:
            break

    # Top up with not-yet-chosen sentences when fewer distinct labels
    # than requested clusters were observed.
    if len(examples) < n_clusters:
        for sentence in sentences:
            if sentence not in examples:
                examples.append(sentence)
            if len(examples) >= n_clusters:
                break

    return examples


# Script section: everything below runs top to bottom when the module is
# executed or imported.

# Fetch the sentences from the database as a DataFrame.
a = database()

# file_path = r'C:\Users\haris\Desktop\wordFinder\word2vec'
# file_path = file_path + 'English'

file_path = './corpus/word2vecmodel/'
language_name = 'english'
file_path = file_path + language_name
load_model(file_path)
print('All done')

# Tokenise every database sentence.
c = a['Sentences'].apply(textProcessing)

# get word vector for one sentence
# NOTE: this hard-coded sample is not passed to cluster_sentences below;
# the tokenised database sentences in ``c`` are.
sentences = [
    'Tohru shows great loyalty to whoever he stands by, even back to the time when he was an Enforcer for the Dark Hand.',
    'The Earth Demon, Dai Gui resembles a large minotaur(with the face of a guardian lion) with great strength.',
    'Al Mulock was the great-grandson of Sir William Mulock(1843–1944), the former Canadian Postmaster - General.',
    'Though his surviving images are scarce, his importance to the early history of photography in Asia is great.']

# Keep the requested cluster count and the printed message in sync.
n_examples = 3
cluster_result = cluster_sentences(language_name, file_path, c, n_examples)
print(f"{n_examples} example sentences: \n")
print(cluster_result)

15 changes: 10 additions & 5 deletions src/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def database(self):
db_config['host'],
db_config['database'])
self.cursor = self.store_data.db_connect().cursor()
query_info = "SELECT sentence FROM english_sentences"
query_info = "SELECT sentence FROM English_sentences"
self.cursor.execute(query_info)
sentences_df = pd.DataFrame(self.cursor.fetchall(), columns=['Sentences'])
return sentences_df
Expand All @@ -91,7 +91,7 @@ def clusteringData(self):
db_config['host'],
db_config['database'])
self.cursor = self.store_data.db_connect().cursor()
query_info = "SELECT sentence FROM english_sentences"
query_info = "SELECT sentence FROM English_sentences"
self.cursor.execute(query_info)
sentences_dataframe = pd.DataFrame(self.cursor.fetchall(), columns=['Sentences'])
return sentences_dataframe
Expand Down Expand Up @@ -130,7 +130,7 @@ def cluster_sentences(self, language_name: str, save_path: str, sentences: List[
words = self.udt_pre_model.word_segmentation(sent)
word_vectors = []
# iterator to word
window_words = get_keyword_window(self.sel_result[0][0], words, 5)
window_words = get_keyword_window(self.sel_result[0][0], words, 10)
for word in window_words:
if word in word2vec_model.wv:
word_vectors.append(word2vec_model.wv[word])
Expand Down Expand Up @@ -210,13 +210,18 @@ def _get_examples(self, sentences: List[str], best_labels, n_clusters: int):
]
save_path = './/corpus//english//'
# first loading udpipe to segement word for each sentence
# udt_english = UdpipeTrain(language_list[1],
# r'C:\Users\haris\Desktop\wordFinder\english-ewt-ud-2.5-191206.udpipe',
# r'C:\Users\haris\Desktop\wordFinder\haris.txt')
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These lines are commented out — are you keeping them as a reference?



udt_english = UdpipeTrain(language_list[1],
r'.//corpus//udpipemodel//english.udpipe',
r'.//corpus//english//135-0.txt')

cluster_result = AppService().config_udpipe(language_name).cluster_sentences(language_name, save_path, sentences='3', n_clusters=2)
'''
#cluster_result = AppService().config_udpipe(language_name).cluster_sentences(language_name, save_path, sentences='3', n_clusters=2)
# '''

cluster_result = AppService().config_udpipe(language_name).cluster_sentences(language_name, sentences, 2)
print("two examples sentences: \n")
print(cluster_result)
1 change: 1 addition & 0 deletions src/templates/cluster.html
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ <h3>Well done!</h3> After clustering, you get {{cluster_number}} example
{% for cluster_sentence in cluster_result %}
<li class="list-group-item d-flex justify-content-between align-items-center">
{{cluster_sentence}}
{# TODO: add KWIC functionality #}
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think someone needs to add the KWIC functionality code here.

</li>
{% endfor %}
{% endif %}
Expand Down
Loading