Skip to content

Commit 5b6ee2f

Browse files
committed
finished up the backend
1 parent d06189d commit 5b6ee2f

File tree

2 files changed

+219
-0
lines changed

2 files changed

+219
-0
lines changed
+217
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
class QuesAnsVirtualAssistant:
    """
    Placeholder sketch of the question-answering assistant.

    The data it is meant to manage is a table of the form

        id | Question | Answer

    which is represented programmatically as

        {
            id: {question: ..., answer: ...}
        }

    The class defines no attributes or methods of its own.
    """
14+
15+
16+
17+
18+
19+
20+
21+
22+
23+
import sqlite3
24+
import json
25+
import pandas as pd
26+
import sklearn
27+
from sklearn.feature_extraction.text import TfidfVectorizer
28+
29+
class QuestionAnswerVirtualAssistant:
    """
    Used for automatic question-answering

    It works by building a reverse index that maps each word to the ids
    of the indexed questions containing that word. To answer a user
    question we look up the words it contains, take the intersection of
    their id lists, rank the surviving questions to pick the best fit,
    then return the answer stored alongside that question.

    All words are lowercased both when indexing and when searching, so
    matching is case-insensitive.
    """

    def __init__(self):
        """
        Returns - None
        Input - None
        ----------
        - Initialize database. we use sqlite3
        - Check if the tables exist, if not create them
        - maintain a class level access to the database
          connection object
        """
        # NOTE: the `autocommit` keyword of sqlite3.connect was added in
        # Python 3.12; every statement below is committed immediately.
        self.conn = sqlite3.connect("virtualassistant.sqlite3", autocommit=True)
        cur = self.conn.cursor()
        res = cur.execute("SELECT name FROM sqlite_master WHERE name='IdToQuesAns'")
        tables_exist = res.fetchone()

        if not tables_exist:
            self.conn.execute("CREATE TABLE IdToQuesAns(id INTEGER PRIMARY KEY, question TEXT, answer TEXT)")
            self.conn.execute('CREATE TABLE WordToId (name TEXT, value TEXT)')
            # The whole reverse index lives in a single row as a JSON object.
            cur.execute("INSERT INTO WordToId VALUES (?, ?)", ("index", "{}",))

    def index_question_answer(self, question, answer):
        """
        Returns - str: the literal "index successful"
        Input - str: question, str: answer
        ----------
        Indexes the question and answer. It does this by performing two
        operations - add the question and answer to IdToQuesAns, then
        add the words of the question to WordToId
        - lowercases the question and answer
        - passes them to the method that adds them to IdToQuesAns
        - retrieves the id of the inserted ques-answer
        - adds that id under every word of the lowercased question in
          the reverse index WordToId, unless it is already listed there
        """
        # Index the same lowercased text that is stored, so lookups and
        # stored questions agree on case.
        question = question.lower()
        row_id = self._add_to_IdToQuesAns(question, answer.lower())

        cur = self.conn.cursor()
        raw_index = cur.execute("SELECT value FROM WordToId WHERE name='index'").fetchone()[0]
        reverse_idx = json.loads(raw_index)

        for word in question.split():
            ids_for_word = reverse_idx.setdefault(word, [])
            if row_id not in ids_for_word:
                ids_for_word.append(row_id)

        cur.execute("UPDATE WordToId SET value = (?) WHERE name='index'", (json.dumps(reverse_idx),))
        return "index successful"

    def _add_to_IdToQuesAns(self, question, answer):
        """
        Returns - int: the id of the inserted row
        Input - str: question, str: answer
        ---------
        - use the class-level connection object to insert the question
          and answer into the db
        - retrieve and return the row id of the inserted row
        """
        cur = self.conn.cursor()
        res = cur.execute("INSERT INTO IdToQuesAns (question, answer) VALUES (?, ?)", (question, answer,))
        return res.lastrowid

    def find_questions(self, user_input):
        """
        Returns - list: (id, question, answer) rows of the indexed questions
                  that contain every indexed word of the user input, or an
                  empty list when there is no such question
        Input - str: a string of words called `user_input`, expected to be a question
        ---------
        - retrieve the reverse index
        - lowercase the user input and, for each of its words that is
          present in the index, collect the ids of the questions
          containing it (words absent from the index are ignored)
        - intersect those id lists
        - return the result of _find_questions_with_idx for the common ids
        """
        cur = self.conn.cursor()
        raw_index = cur.execute("SELECT value FROM WordToId WHERE name='index'").fetchone()[0]
        reverse_idx = json.loads(raw_index)

        # The index holds lowercased words, so lowercase the query as well.
        terms = user_input.lower().split()
        id_lists = [reverse_idx[term] for term in terms if term in reverse_idx]

        if not id_lists:  # none of the words has been indexed
            return []

        common_ids = set(id_lists[0])
        for ids in id_lists[1:]:
            common_ids.intersection_update(ids)

        if not common_ids:  # no single question contains all the words
            return []

        return self._find_questions_with_idx(common_ids)

    def _find_questions_with_idx(self, idxs):
        """
        Returns - list: the (id, question, answer) rows with the given idxs
        Input - iterable of idxs
        ---------
        - use the class-level connection object to retrieve the rows that
          have an id in the input idxs.
        - retrieve and return these rows as a list
        """
        idxs = list(idxs)
        cur = self.conn.cursor()
        # One "?" placeholder per id keeps the query parameterized.
        placeholders = ','.join(['?'] * len(idxs))
        sql = "SELECT id, question, answer FROM IdToQuesAns WHERE id in ({seq})".format(seq=placeholders)
        return cur.execute(sql, idxs).fetchall()

    def find_most_matched_question(self, user_input, corpus):
        """
        Returns - list: [score, most_matching_question]
        Input - user_input, and list of matching questions called corpus
        ---------
        - compute tfidf weights over the corpus
        - score each question as the sum of the weights of the user input
          words it contains
        - return the highest scoring question together with its score
          (the earliest question wins a tie)
        Raises ValueError when corpus is empty.
        """
        if not corpus:
            raise ValueError("corpus must contain at least one question")

        vectorizer = TfidfVectorizer()
        tfidf_scores = vectorizer.fit_transform(corpus)
        tfidf_frame = pd.DataFrame(tfidf_scores.toarray(), columns=vectorizer.get_feature_names_out())
        # {word: {row position in corpus: weight}}
        tfidf_dict = tfidf_frame.to_dict()

        scored = [[0, question] for question in corpus]
        # TfidfVectorizer lowercases its features by default, so the user
        # words must be lowercased to be found among them.
        for term in user_input.lower().split():
            if term in tfidf_dict:
                weights = tfidf_dict[term]
                for row, entry in enumerate(scored):
                    entry[0] += weights[row]

        return max(scored, key=lambda entry: entry[0])

    def provide_answer(self, user_input):
        """
        Returns - str: the answer to the user_input, or None when no
                  indexed question matches
        Input - str: user_input
        ---------
        - use the user_input to get the list of matching questions
        - create a corpus which is a list of all matching questions
        - create a question_map that maps questions to their respective answers
        - use the user_input and corpus to find the most matching question
        - return the answer that matches that question from the question_map
        """
        matching_questions = self.find_questions(user_input)
        if not matching_questions:
            # Nothing to rank; an empty corpus cannot be vectorized.
            return None

        corpus = [question for (_row_id, question, _answer) in matching_questions]
        question_map = {question: answer for (_row_id, question, answer) in matching_questions}
        _score, most_matching_question = self.find_most_matched_question(user_input, corpus)
        return question_map[most_matching_question]
194+
195+
196+
if __name__ == "__main__":
    # Demo: index a few FAQ entries, then ask two questions against them.
    assistant = QuestionAnswerVirtualAssistant()

    assistant.index_question_answer(
        "What are the different types of competitions available on Kaggle",
        "Types of Competitions Kaggle Competitions are designed to provide challenges for competitors",
    )

    # Only this indexing call has its status message printed.
    status = assistant.index_question_answer(
        "How to form, manage, and disband teams in a competition",
        "Everyone that competes in a Competition does so as a team. A team is a group of one or more users",
    )
    print(status)

    assistant.index_question_answer(
        "What is Data Leakage",
        "Data Leakage is the presence of unexpected additional information in the training data",
    )
    assistant.index_question_answer(
        "How does Kaggle handle cheating",
        "Cheating is not taken lightly on Kaggle. We monitor our compliance account",
    )

    for query in ("state Kaggle cheating policy", "Tell me what is data leakage"):
        print(assistant.provide_answer(query))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
pandas
2+
scikit-learn

0 commit comments

Comments
 (0)