1
class QuesAnsVirtualAssistant:
    """
    Placeholder sketch of a question-answering assistant.

    The intended storage is a three-column table:

        id | Question | Answer

    which maps onto a nested dictionary keyed by id:

        {
            id: {question: ..., answer: ...}
        }

    No behaviour is implemented on this class.
    """
14
+
15
+
16
+
17
+
18
+
19
+
20
+
21
+
22
+
23
+ import sqlite3
24
+ import json
25
+ import pandas as pd
26
+ import sklearn
27
+ from sklearn .feature_extraction .text import TfidfVectorizer
28
+
29
class QuestionAnswerVirtualAssistant:
    """
    Used for automatic question-answering

    It works by building a reverse index that maps each (lowercased)
    word of an indexed question to the ids of the questions containing
    it. To answer a user question we look up the ids for every known
    word in it, intersect those id sets, rank the surviving questions
    with TF-IDF and return the answer stored for the best-ranked one.

    Storage is an SQLite database with two tables:
      - IdToQuesAns(id, question, answer)
      - WordToId(name, value), holding a single row named 'index' whose
        value is the JSON-encoded reverse index {word: [id, ...]}
    """

    def __init__(self, db_path="virtualassistant.sqlite3"):
        """
        Returns - None
        Input - str: path of the SQLite database file (":memory:" gives
                a throw-away in-memory database)
        ----------
        - open the database and keep the connection on the instance
        - create the two tables if they do not exist yet
        - seed the empty reverse index row if it is missing
        """
        # isolation_level=None puts the connection in autocommit mode on
        # every supported Python version (the `autocommit=` keyword only
        # exists from Python 3.12 onwards).
        self.conn = sqlite3.connect(db_path, isolation_level=None)
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS IdToQuesAns"
            "(id INTEGER PRIMARY KEY, question TEXT, answer TEXT)"
        )
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS WordToId (name TEXT, value TEXT)"
        )
        # Seed the index row independently of table creation so that a
        # half-initialised database is repaired instead of crashing later.
        seeded = self.conn.execute(
            "SELECT 1 FROM WordToId WHERE name='index'"
        ).fetchone()
        if seeded is None:
            self.conn.execute(
                "INSERT INTO WordToId VALUES (?, ?)", ("index", "{}")
            )

    def close(self):
        """
        Returns - None
        Input - None
        ----------
        Close the underlying database connection.
        """
        self.conn.close()

    @staticmethod
    def _tokenize(text):
        """
        Returns - list[str]: lowercased, whitespace-separated words
        Input - str: any text
        ----------
        Single place that defines how text becomes index terms, so that
        indexing and querying always agree.
        """
        return text.lower().split()

    def _load_reverse_index(self):
        """
        Returns - dict: the reverse index {word: [id, ...]}
        Input - None
        ----------
        Read and decode the JSON reverse index stored in WordToId.
        """
        row = self.conn.execute(
            "SELECT value FROM WordToId WHERE name='index'"
        ).fetchone()
        return json.loads(row[0]) if row else {}

    def index_question_answer(self, question, answer):
        """
        Returns - str: the literal "index successful"
        Input - str: question, str: answer
        ----------
        - store the lowercased question and answer in IdToQuesAns
        - add the id of the new row under every word of the question in
          the reverse index (words are lowercased so that look-ups are
          case-insensitive and agree with the stored question text)
        - write the updated reverse index back to WordToId
        """
        row_id = self._add_to_IdToQuesAns(question.lower(), answer.lower())
        reverse_idx = self._load_reverse_index()
        for word in self._tokenize(question):
            ids = reverse_idx.setdefault(word, [])
            if row_id not in ids:
                ids.append(row_id)
        self.conn.execute(
            "UPDATE WordToId SET value = (?) WHERE name='index'",
            (json.dumps(reverse_idx),),
        )
        return "index successful"

    def _add_to_IdToQuesAns(self, question, answer):
        """
        Returns - int: the id of the inserted row
        Input - str: question, str: answer
        ---------
        - insert the question/answer pair into IdToQuesAns
        - return the row id SQLite assigned to it
        """
        res = self.conn.execute(
            "INSERT INTO IdToQuesAns (question, answer) VALUES (?, ?)",
            (question, answer),
        )
        return res.lastrowid

    def find_questions(self, user_input):
        """
        Returns - list[tuple]: (id, question, answer) rows of the indexed
                  questions that contain every *known* word of the input;
                  empty list when nothing matches
        Input - str: user_input, expected to be a question
        ---------
        - load the reverse index
        - collect the id list of every input word present in the index
          (words that were never indexed are ignored)
        - intersect the id lists and fetch the matching rows
        """
        reverse_idx = self._load_reverse_index()
        id_lists = [
            reverse_idx[term]
            for term in self._tokenize(user_input)
            if term in reverse_idx
        ]
        if not id_lists:  # none of the words has been indexed
            return []

        common_ids = set(id_lists[0]).intersection(*id_lists[1:])
        if not common_ids:  # no single question contains all known words
            return []

        return self._find_questions_with_idx(common_ids)

    def _find_questions_with_idx(self, idxs):
        """
        Returns - list[tuple]: (id, question, answer) rows, ordered by id
        Input - iterable of row ids
        ---------
        Fetch the rows of IdToQuesAns whose id is in `idxs`.
        """
        idxs = list(idxs)
        if not idxs:
            return []
        # Only the '?' placeholders are formatted into the statement; the
        # ids themselves are passed as bound parameters.
        placeholders = ",".join("?" * len(idxs))
        sql = (
            "SELECT id, question, answer FROM IdToQuesAns "
            "WHERE id in ({seq}) ORDER BY id".format(seq=placeholders)
        )
        return self.conn.execute(sql, idxs).fetchall()

    def find_most_matched_question(self, user_input, corpus):
        """
        Returns - list: [score, question] for the best-scoring question
                  (the earliest one wins a tie)
        Input - str: user_input, list[str]: candidate questions (corpus)
        Raises - ValueError: if corpus is empty (TfidfVectorizer itself
                 also raises ValueError when it cannot build a vocabulary
                 from the corpus)
        ---------
        - fit TF-IDF on the corpus
        - score each question as the sum of its TF-IDF weights for the
          words of the user input
        - return the highest-scoring [score, question] pair
        """
        if not corpus:
            raise ValueError("corpus must contain at least one question")

        vectorizer = TfidfVectorizer()
        weights = vectorizer.fit_transform(corpus).toarray()
        vocabulary = vectorizer.vocabulary_  # term -> column index
        # TfidfVectorizer lowercases by default, so the query terms must be
        # lowercased as well or mixed-case words would never score.
        columns = [
            vocabulary[term]
            for term in self._tokenize(user_input)
            if term in vocabulary
        ]
        scored = [
            [float(sum(row[col] for col in columns)), question]
            for row, question in zip(weights, corpus)
        ]
        return max(scored, key=lambda pair: pair[0])

    def provide_answer(self, user_input):
        """
        Returns - str | None: the stored answer of the best-matching
                  question, or None when no indexed question matches
        Input - str: user_input
        ---------
        - find the candidate questions for the user input
        - rank them and pick the best-matching question
        - return the answer stored for that question
        """
        matching_questions = self.find_questions(user_input)
        if not matching_questions:
            # Nothing to rank; the vectorizer would fail on an empty corpus.
            return None

        corpus = [question for _, question, _ in matching_questions]
        question_map = {
            question: answer for _, question, answer in matching_questions
        }
        _, most_matching_question = self.find_most_matched_question(
            user_input, corpus
        )
        return question_map[most_matching_question]
194
+
195
+
196
if __name__ == "__main__":
    # Small demo: index four FAQ entries, then ask two questions.
    assistant = QuestionAnswerVirtualAssistant()

    # (question, answer, echo_status) - only the second entry prints the
    # status string returned by the indexing call.
    faq_entries = (
        (
            "What are the different types of competitions available on Kaggle",
            "Types of Competitions Kaggle Competitions are designed to provide challenges for competitors",
            False,
        ),
        (
            "How to form, manage, and disband teams in a competition",
            "Everyone that competes in a Competition does so as a team. A team is a group of one or more users",
            True,
        ),
        (
            "What is Data Leakage",
            "Data Leakage is the presence of unexpected additional information in the training data",
            False,
        ),
        (
            "How does Kaggle handle cheating",
            "Cheating is not taken lightly on Kaggle. We monitor our compliance account",
            False,
        ),
    )
    for faq_question, faq_answer, echo_status in faq_entries:
        status = assistant.index_question_answer(faq_question, faq_answer)
        if echo_status:
            print(status)

    for query in ("state Kaggle cheating policy", "Tell me what is data leakage"):
        print(assistant.provide_answer(query))
0 commit comments