6 changes: 4 additions & 2 deletions requirements.txt
@@ -1,10 +1,12 @@
requests==2.25.1
nltk==3.5
numpy==1.20.1
numpy~=1.19.2
beautifulsoup4==4.9.3
corpy==0.3.0
Flask==1.1.2
gensim==3.8.3
pymysql==1.0.2
scikit_learn==0.24.1
pysolr-3.9.0
pysolr-3.9.0
mysql~=5.7.24
scikit-learn~=0.24.1
Akhilraj03-Github marked this conversation as resolved.
3 changes: 1 addition & 2 deletions src/app.py
@@ -79,8 +79,7 @@ def cluster():
if not appService.udt_pre_model:
appService.config_udpipe(language_name)
cluster_model_file = word2vec_language[language_name]
cluster_result, rec_cluster_result = appService.cluster_sentences(
language_name, cluster_model_file, cluster_input_sentence, cluster_number)
cluster_result, rec_cluster_result = appService.cluster_sentences(language_name, cluster_model_file,cluster_input_sentence, cluster_number)
return render_template('cluster.html',
cluster_number=cluster_number,
cluster_result=cluster_result,
176 changes: 87 additions & 89 deletions src/databaseClustering.py
@@ -1,4 +1,4 @@
import mysql.connector
import pymysql.connector
from mysql.connector import errorcode
from datetime import datetime
import pandas as pd
@@ -10,115 +10,113 @@
import string



def train_model(language_name, corpus_path, save_path):

model = gensim.models.Word2Vec(sentences=corpus_path,
size=150,
window=8,
min_count=2,
workers=2,
iter=10)
model.save(save_path + language_name)
print('Save succeed')
model = gensim.models.Word2Vec(sentences=corpus_path,
size=150,
window=8,
min_count=2,
workers=2,
iter=10)
model.save(save_path + language_name)
print('Save succeed')


def load_model(save_path) -> gensim.models.Word2Vec:
filename = save_path
model = gensim.models.Word2Vec.load(filename)
print('Loading succeed')
for index, word in enumerate(model.wv.index2word):
if index == 5:
break
vec = ",".join(map(lambda i: str(i), model.wv[word]))
print(f"word #{index}/{len(model.wv.index2word)} is {word}, vec = {vec}")
return model
filename = save_path
model = gensim.models.Word2Vec.load(filename)
print('Loading succeed')
for index, word in enumerate(model.wv.index2word):
if index == 5:
break
vec = ",".join(map(lambda i: str(i), model.wv[word]))
print(f"word #{index}/{len(model.wv.index2word)} is {word}, vec = {vec}")
return model


def database():
db = mysql.connector.connect(
host='localhost',
user='root',
password='root',
database='psd_project'
)
mycursor = db.cursor()
query_info = ("SELECT sentence FROM english_sentences")
mycursor.execute(query_info)
sentences_df = pd.DataFrame(mycursor.fetchall(), columns=['Sentences'])

return sentences_df


db = mysql.connector.connect(
host='localhost',
user='root',
password='root',
database='psd_project'
)
mycursor = db.cursor()
query_info = ("SELECT sentence FROM english_sentences")
mycursor.execute(query_info)
sentences_df= pd.DataFrame(mycursor.fetchall(), columns=['Sentences'])

return sentences_df

def textProcessing(text):
no_stop =[words for words in text.split() if words.lower() not in string.punctuation]
no_stop = [words for words in text.split() if words.lower() not in string.punctuation]
return no_stop

def cluster_sentences(language_name: str, save_path: str, sentences: List[str], n_clusters: int) :

n_clusters = int(n_clusters)
print("clusters are ",n_clusters)
if n_clusters <=0:
print("Parameter is Invalid")
return
if n_clusters > len(sentences):
# TODO add log
print('number of cluster bigger than sentences count')
return
# first loading model
word2vec_model = load_model(save_path)
# second geting vectors for one sentence
sent_vectors = []
default_dimn = 100
# iterator to sentence
for word1 in sentences:
print(word1)
word_vectors = []
for words in word1:
if words in word2vec_model.wv:
word_vectors.append(word2vec_model.wv[words])
else: # not in dict, fill 0
word_vectors.append([0] * default_dimn)

to_array = np.array(word_vectors)
sent_vectors.append(to_array.mean(axis=0).tolist())
kmeans = KMeans(n_clusters=n_clusters,random_state=0).fit(sent_vectors)
labels = kmeans.labels_
tmp_labels,examples = [],[]
for sent,label in zip(sentences,labels):
if label not in tmp_labels:
tmp_labels.append(label)
examples.append(sent)
if len(examples) == n_clusters:
break
# add bottom logic for cluster
if len(examples) < n_clusters:
for sent in sentences:
if sent not in examples:
examples.append(sent)
if len(examples) >= n_clusters:
break

return examples

def cluster_sentences(language_name: str, save_path: str, sentences: List[str], n_clusters: int):
n_clusters = int(n_clusters)
print("clusters are ", n_clusters)
if n_clusters <= 0:
print("Parameter is Invalid")
return
if n_clusters > len(sentences):
# TODO add log
print('number of cluster bigger than sentences count')
return
# first loading model
word2vec_model = load_model(save_path)
# second geting vectors for one sentence
sent_vectors = []
default_dimn = 100
# iterator to sentence
for word1 in sentences:
print(word1)
word_vectors = []
for words in word1:

if words in word2vec_model.wv:
word_vectors.append(word2vec_model.wv[words])
else: # not in dict, fill 0
word_vectors.append([0] * default_dimn)

to_array = np.array(word_vectors)
sent_vectors.append(to_array.mean(axis=0).tolist())
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(sent_vectors)
labels = kmeans.labels_
tmp_labels, examples = [], []
for sent, label in zip(sentences, labels):
if label not in tmp_labels:
tmp_labels.append(label)
examples.append(sent)
if len(examples) == n_clusters:
break
# add bottom logic for cluster
if len(examples) < n_clusters:
for sent in sentences:
if sent not in examples:
examples.append(sent)
if len(examples) >= n_clusters:
break

return examples


a = database()
file_path= r'C:\Users\haris\Desktop\wordFinder\word2vec'
file_path = r'C:\Users\haris\Desktop\wordFinder\word2vec'
file_path = file_path + 'English'
load_model(file_path)
print('All done')

c=a['Sentences'].apply(textProcessing)
c = a['Sentences'].apply(textProcessing)

# get word vector for one sentence
language_name = 'English'
sentences = [
'Tohru shows great loyalty to whoever he stands by, even back to the time when he was an Enforcer for the Dark Hand.',
'The Earth Demon, Dai Gui resembles a large minotaur(with the face of a guardian lion) with great strength.',
'Al Mulock was the great-grandson of Sir William Mulock(1843–1944), the former Canadian Postmaster - General.',
'Though his surviving images are scarce, his importance to the early history of photography in Asia is great.']
'Tohru shows great loyalty to whoever he stands by, even back to the time when he was an Enforcer for the Dark Hand.',
'The Earth Demon, Dai Gui resembles a large minotaur(with the face of a guardian lion) with great strength.',
'Al Mulock was the great-grandson of Sir William Mulock(1843–1944), the former Canadian Postmaster - General.',
'Though his surviving images are scarce, his importance to the early history of photography in Asia is great.']

cluster_result = cluster_sentences(langage_name, file_path,c,3)
cluster_result = cluster_sentences(language_name, file_path, c, 3)
print("two examples sentences: \n")
print(cluster_result)

12 changes: 6 additions & 6 deletions src/service.py
@@ -79,7 +79,7 @@ def database(self):
db_host=db_config['db_host'],
db_name=db_config['db_name'])
self.cursor = self.store_data.db_connect().cursor()
query_info = "SELECT sentence FROM english_sentences"
query_info = "SELECT sentence FROM English_sentences"
self.cursor.execute(query_info)
sentences_df = pd.DataFrame(self.cursor.fetchall(), columns=['Sentences'])
return sentences_df
@@ -90,7 +90,7 @@ def clusteringData(self):
db_host=db_config['db_host'],
db_name=db_config['db_name'])
self.cursor = self.store_data.db_connect().cursor()
query_info = "SELECT sentence FROM english_sentences"
query_info = "SELECT sentence FROM English_sentences"
self.cursor.execute(query_info)
sentences_dataframe = pd.DataFrame(self.cursor.fetchall(), columns=['Sentences'])
return sentences_dataframe
@@ -129,7 +129,7 @@ def cluster_sentences(self, language_name: str, save_path: str, sentences: List[
words = self.udt_pre_model.word_segmentation(sent)
word_vectors = []
# iterator to word
window_words = get_keyword_window(self.sel_result[0][0], words, 5)
window_words = get_keyword_window(self.sel_result[0][0], words, 10)
for word in window_words:
if word in word2vec_model.wv:
word_vectors.append(word2vec_model.wv[word])
@@ -207,9 +207,9 @@ def _get_examples(self, sentences: List[str], best_labels, n_clusters: int):
'Though his surviving images are scarce, his importance to the early history of photography in Asia is great.']

# first loading udpipe to segement word for each sentence
udt_english = UdpipeTrain(language_list[1],
r'C:\Users\haris\Desktop\wordFinder\english-ewt-ud-2.5-191206.udpipe',
r'C:\Users\haris\Desktop\wordFinder\haris.txt')
# udt_english = UdpipeTrain(language_list[1],
# r'C:\Users\haris\Desktop\wordFinder\english-ewt-ud-2.5-191206.udpipe',
# r'C:\Users\haris\Desktop\wordFinder\haris.txt')
Collaborator: This line is commented out — are you keeping it as a safe reference?

cluster_result = AppService().config_udpipe(language_name).cluster_sentences(language_name, sentences, 2)
print("two examples sentences: \n")
1 change: 1 addition & 0 deletions src/templates/cluster.html
@@ -31,6 +31,7 @@ <h3>Well done!</h3> After clustering, you get {{cluster_number}} example
{% for cluster_sentence in cluster_result %}
<li class="list-group-item d-flex justify-content-between align-items-center">
{{cluster_sentence}}
# Add KWIC Functinality
Collaborator: I think someone needs to add the KWIC functionality code here (a rough sketch of one possible approach follows after this hunk).
</li>
{% endfor %}
{% endif %}
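
Following up on the KWIC comment above: here is a rough, hypothetical sketch, not part of this PR, of one way a keyword-in-context string could be built in Python before the sentence is handed to cluster.html. The kwic helper, its keyword argument, and the default window size are all assumptions; the existing get_keyword_window helper used in src/service.py may already cover part of this.

# Hypothetical helper, not part of the PR: builds a keyword-in-context
# (KWIC) string for one clustered sentence. The keyword and window size
# are assumed inputs; adapt to however the app tracks the searched keyword.
from typing import List


def kwic(sentence: str, keyword: str, window: int = 5) -> str:
    """Return `keyword` bracketed, with up to `window` words of context per side."""
    words: List[str] = sentence.split()
    lowered = [w.lower().strip('.,;:!?()') for w in words]
    if keyword.lower() not in lowered:
        return sentence  # keyword not found: fall back to the full sentence
    i = lowered.index(keyword.lower())
    left = words[max(0, i - window):i]
    right = words[i + 1:i + 1 + window]
    return ' '.join(left + ['[' + words[i] + ']'] + right)


# Example:
# kwic('Tohru shows great loyalty to whoever he stands by.', 'great', 3)
# -> 'Tohru shows [great] loyalty to whoever'

The view could then pass the pre-built strings to the template and render them in place of {{cluster_sentence}}.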