basic_ir.py
import math
from collections import defaultdict, Counter

import pre_process as pp

'''
Basic vector space model (VSM) IR over the sentences of a document.
For a BM25 alternative, try bm25_ir.py.
'''
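# Scoring used below: each sentence s is a vector of tf-idf weights,
#     tfidf(t, s) = (count(t, s) / len(s)) * ln(M / df(t)),
# L2-normalised per sentence, where M is the number of sentences and df(t)
# is the number of sentences containing t. Worked example (made-up numbers):
# with M = 3 sentences and a term occurring twice in a 10-token sentence but
# in no other sentence (df = 1), tfidf = 2/10 * ln(3) ~= 0.2197 before
# normalisation.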
# term -> list of [sentence_id, normalised tf-idf weight] postings
vsm_inverted_index = defaultdict(list)


# Collect term frequencies for one sentence of the document
def extract_term_freqs(sentence):
    # bag-of-words representation
    tfs = Counter()
    sentence = pp.process_sen(sentence, True)
    for token in sentence:
        # lemmatize words; compare this to the result of stemming with the
        # PorterStemmer:
        # tfs[stemmer.stem(token.lower())] += 1
        tfs[pp.lemmatize(token.lower())] += 1
    return tfs
# Compute document frequencies for each term (each sentence is treated as
# a "document" here)
def compute_doc_freqs(doc_term_freq):
    dfs = Counter()
    for tfs in doc_term_freq.values():
        for term in tfs.keys():
            dfs[term] += 1
    return dfs
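# For example (hypothetical input): given two sentences with term-frequency
# Counters {'cat': 2, 'sat': 1} and {'cat': 1, 'mat': 1}, compute_doc_freqs
# returns Counter({'cat': 2, 'sat': 1, 'mat': 1}) - 'cat' occurs in both
# sentences, so its document frequency is 2 regardless of its raw counts.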
# Build the VSM inverted index for one Wiki article
def train(doc):
    vsm_inverted_index.clear()
    # Use extract_term_freqs and compute_doc_freqs to turn the article into
    # per-sentence term frequencies and per-term document frequencies.
    doc_term_freqs = {}
    for sen_id, sentence in enumerate(doc):
        doc_term_freqs[sen_id] = extract_term_freqs(sentence)
    M = len(doc_term_freqs)
    doc_freqs = compute_doc_freqs(doc_term_freqs)
    # Create the VSM inverted index.
    for sen_id, term_freqs in doc_term_freqs.items():
        N = sum(term_freqs.values())
        length = 0.0
        # Find tf*idf values and accumulate the sum of squares.
        tfidf_values = []
        for term, count in term_freqs.items():
            tfidf = float(count) / N * math.log(M / float(doc_freqs[term]))
            tfidf_values.append((term, tfidf))
            length += tfidf ** 2
        # Normalise each sentence vector by its length and insert into the
        # index. A sentence whose terms all occur in every sentence gets a
        # zero-length vector; skip it to avoid dividing by zero.
        length = length ** 0.5
        if length == 0:
            continue
        for term, tfidf in tfidf_values:
            # Note the inversion of the indexing: term -> (sen_id, score).
            vsm_inverted_index[term].append([sen_id, tfidf / length])
    # Ensure posting lists are sorted by sentence id.
    for postings in vsm_inverted_index.values():
        postings.sort()
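# After train(doc) the index maps each lemma to its postings, e.g. (with
# purely illustrative ids and weights):
#     vsm_inverted_index['cat'] -> [[0, 0.61], [4, 0.23]]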
# Query the VSM inverted index: rank sentences by the summed normalised
# tf-idf weights of the query terms they contain
def query_vsm(query, k=1):
    accumulator = Counter()
    for term in query:
        # Use .get() rather than indexing: indexing the defaultdict would
        # silently insert an empty posting list for every unseen term.
        postings = vsm_inverted_index.get(pp.lemmatize(term.lower()), [])
        for sen_id, weight in postings:
            accumulator[sen_id] += weight
    # Return the full ranking, best first; use accumulator.most_common(k)
    # instead if only the top k results are wanted.
    return sorted(accumulator.items(), key=lambda item: item[1], reverse=True)
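

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the module proper). It
# assumes pre_process.process_sen tokenises a raw sentence string and
# pre_process.lemmatize maps a token to its lemma; the sentences below are
# made-up stand-ins for the sentences of a Wiki article.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    article = [
        "The cat sat on the mat.",
        "A dog chased the cat across the yard.",
        "The mat was red.",
    ]
    train(article)
    # Rank the article's sentences against a tokenised query.
    for sen_id, score in query_vsm(["cat", "mat"]):
        print(sen_id, round(score, 4), article[sen_id])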