Commit 3e889b5
first commit
committed May 11, 2021 (0 parents)

9 files changed: +2575 -0 lines
corpus/artificial_intelligence.txt: +418 lines (large diff not rendered)

corpus/machine_learning.txt: +262 lines (large diff not rendered)

corpus/natural_language_processing.txt: +113 lines (large diff not rendered)

corpus/neural_network.txt: +509 lines (large diff not rendered)

corpus/probability.txt: +836 lines (large diff not rendered)

corpus/python.txt: +252 lines (large diff not rendered)

questions.py: +176 lines

@@ -0,0 +1,176 @@
import math
import os
import string
import sys

import nltk

FILE_MATCHES = 1
SENTENCE_MATCHES = 1

def main():

    # Check command-line arguments
    if len(sys.argv) != 2:
        sys.exit("Usage: python questions.py corpus")

    # Calculate IDF values across files
    files = load_files(sys.argv[1])
    file_words = {
        filename: tokenize(files[filename])
        for filename in files
    }
    file_idfs = compute_idfs(file_words)

    askquestion(file_idfs, file_words, files)

def askquestion(file_idfs, file_words, files):
    # Answer one query per iteration, looping until the user
    # interrupts (Ctrl-C / EOF).
    while True:

        # Prompt user for query
        query = set(tokenize(input("Query: ")))

        # Determine top file matches according to TF-IDF
        filenames = top_files(query, file_words, file_idfs, n=FILE_MATCHES)

        # Extract sentences from top files
        sentences = dict()
        for filename in filenames:
            for passage in files[filename].split("\n"):
                for sentence in nltk.sent_tokenize(passage):
                    tokens = tokenize(sentence)
                    if tokens:
                        sentences[sentence] = tokens

        # Compute IDF values across sentences
        idfs = compute_idfs(sentences)

        # Determine top sentence matches and print them
        matches = top_sentences(query, sentences, idfs, n=SENTENCE_MATCHES)
        for match in matches:
            print(match)

def load_files(directory):
    """
    Given a directory name, return a dictionary mapping the filename of each
    `.txt` file inside that directory to the file's contents as a string.
    """
    files = {}

    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if os.path.isfile(filepath) and filename.endswith(".txt"):
            with open(filepath, encoding="utf8") as f:
                files[filename] = f.read()

    return files

def tokenize(document):
    """
    Given a document (represented as a string), return a list of all of the
    words in that document, in order.

    Process the document by converting all words to lowercase, and removing
    any punctuation or English stopwords.
    """
    stop_words = nltk.corpus.stopwords.words("english")

    words = nltk.word_tokenize(document.lower())

    return [
        word for word in words
        if word not in string.punctuation and word not in stop_words
    ]

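# Example (illustrative, not in the original file):
#   tokenize("Python is a programming language.")
# with NLTK's default tokenizer and English stopword list would yield
# roughly ['python', 'programming', 'language'] -- "is" and "a" are
# stopwords and the trailing "." is punctuation.
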
def compute_idfs(documents):
    """
    Given a dictionary of `documents` that maps names of documents to a list
    of words, return a dictionary that maps words to their IDF values.

    Any word that appears in at least one of the documents should be in the
    resulting dictionary.
    """
    num_documents = len(documents)

    # Count, for each word, how many documents contain it.
    doc_frequency = {}
    for words in documents.values():
        for word in set(words):
            doc_frequency[word] = doc_frequency.get(word, 0) + 1

    # IDF(word) = ln(number of documents / documents containing the word)
    return {
        word: math.log(num_documents / count)
        for word, count in doc_frequency.items()
    }

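# Worked example (illustrative): with
#   documents = {"a.txt": ["python", "code"], "b.txt": ["python", "math"]}
# "python" appears in 2 of 2 documents, so IDF("python") = ln(2/2) = 0,
# while "code" and "math" each appear in 1 of 2, so IDF = ln(2/1) ~ 0.693.
# A word that appears everywhere carries no discriminating weight.
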
def top_files(query, files, idfs, n):
    """
    Given a `query` (a set of words), `files` (a dictionary mapping names of
    files to a list of their words), and `idfs` (a dictionary mapping words
    to their IDF values), return a list of the filenames of the `n` top
    files that match the query, ranked according to tf-idf.
    """
    # Start every file at a score of zero.
    tfidf = {file: 0 for file in files}

    # For each query word, add term frequency * IDF to every file
    # whose word list contains the word.
    for word in query:
        for file, words in files.items():
            if word in words:
                tfidf[file] += idfs[word] * words.count(word)

    # Sort files by score, highest first, and keep the top `n` names.
    querymatch = sorted(tfidf.items(), key=lambda x: x[1], reverse=True)

    return [file for file, score in querymatch][:n]

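# Scoring example (illustrative): for query {"python"}, a file whose word
# list contains "python" three times scores 3 * IDF("python"); a file that
# never mentions any query word keeps its initial score of 0.
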
def top_sentences(query, sentences, idfs, n):
    """
    Given a `query` (a set of words), `sentences` (a dictionary mapping
    sentences to a list of their words), and `idfs` (a dictionary mapping words
    to their IDF values), return a list of the `n` top sentences that match
    the query, ranked according to idf. If there are ties, preference should
    be given to sentences that have a higher query term density.
    """
    sentencesvalues = {}

    for sentence, words in sentences.items():
        querywords = query.intersection(words)

        # Sum the IDF values of the query words this sentence contains.
        value = sum(idfs[word] for word in querywords)

        # Query term density: the fraction of the sentence's words that
        # are also query words.
        numwordsinquery = sum(word in querywords for word in words)
        query_term_density = numwordsinquery / len(words)

        sentencesvalues[sentence] = {
            'idf': value,
            'qtd': query_term_density,
        }

    # Rank by summed IDF first, breaking ties by query term density.
    ranked_sentences = sorted(
        sentencesvalues.items(),
        key=lambda x: (x[1]['idf'], x[1]['qtd']),
        reverse=True,
    )

    return [sentence for sentence, score in ranked_sentences][:n]

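# Tie-break example (illustrative): two sentences with equal 'idf' sums,
# say {'idf': 0.9, 'qtd': 0.50} vs {'idf': 0.9, 'qtd': 0.25}, compare on
# 'qtd' next, so the sentence with higher query term density ranks first.
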
if __name__ == "__main__":
    main()
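
A quick way to sanity-check the ranking pipeline is to call the functions
directly on a toy corpus. The snippet below is a hypothetical sketch, not
part of this commit; it assumes questions.py is importable from the working
directory and that nltk is installed (per requirements.txt). It only
exercises compute_idfs and top_files, so no NLTK data downloads are needed.

# sanity_check.py (hypothetical helper, not in this commit)
from questions import compute_idfs, top_files

docs = {
    "python.txt": ["python", "dynamically", "typed", "language"],
    "probability.txt": ["probability", "measures", "likelihood"],
}

idfs = compute_idfs(docs)
# "python" appears in 1 of 2 documents, so idfs["python"] = ln(2) ~ 0.693

print(top_files({"python", "language"}, docs, idfs, n=1))
# Expected: ['python.txt'] -- the only file containing any query word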

readme.md: +8 lines

@@ -0,0 +1,8 @@
# AI to parse documents and answer questions using tf-idf values

### Usage:

- git clone https://github.com/prithvijitguha/QuestionAI.git
- pip3 install -r requirements.txt
- python questions.py corpus
- At the `Query:` prompt, type a question about the corpus
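
An illustrative session (the answer line is a placeholder, not captured
program output; the sentence printed depends on the corpus):

$ python questions.py corpus
Query: what is a neural network
<the single best-matching sentence from the corpus is printed here>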

requirements.txt: +1 line

@@ -0,0 +1 @@
nltk
