-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathtermTF.go
171 lines (147 loc) · 3.9 KB
/
termTF.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
package query
import (
"fmt"
"sort"
)
type TermTFQuery struct {
docId int32
cursor int
postings []int32
blocks []block
currentBlockIndex int
currentBlock block
term string
idf float32 // XXX: unnormalized idf
boost float32
freqBits int32
freqMask int32
}
// Splits the postings list into chunks that are binary searched and inside each
// chunk linearly searching for next advance() Basic []int32{} that the whole
// interface works on top. The Score is TF*IDF you have to specify how many bits
// from the docID are actually term frequency e.g if you want to store the
// frequency in 4 bits then document id 999 with term frequency 2 for this
// specific term could be stored as (999 << 4) | 2, usually you just store the
// floored sqrt(frequency), so 3-4 bits are enough. it is zero based, so 0 is
// frequency 1
//
// if you dont know totalDocumentsInIndex, which could be the case sometimes, pass any constant > 0
// WARNING: the query *can not* be reused
// WARNING: the query it not thread safe
func TermTF(totalDocumentsInIndex int, freqBits int32, t string, postings []int32) *TermTFQuery {
q := &TermTFQuery{
term: t,
cursor: -1,
postings: postings,
docId: NOT_READY,
currentBlock: block{maxIdx: 0, maxDoc: NOT_READY},
idf: computeIDF(totalDocumentsInIndex, len(postings)),
boost: 1,
freqBits: freqBits,
freqMask: (1 << freqBits) - 1,
}
if len(postings) == 0 {
q.idf = 0
return q
}
chunkSize := TERM_CHUNK_SIZE
q.blocks = make([]block, ((len(postings) + chunkSize - 1) / chunkSize)) // ceil
blockIndex := 0
for i := 0; i < len(postings); i += chunkSize {
minIdx := i
maxIdx := (minIdx + chunkSize) - 1
if maxIdx >= len(postings)-1 {
maxIdx = len(postings) - 1
}
q.blocks[blockIndex] = block{
maxDoc: postings[maxIdx] >> q.freqBits,
maxIdx: maxIdx,
}
blockIndex++
}
return q
}
func (t *TermTFQuery) GetDocId() int32 {
return t.docId
}
func (t *TermTFQuery) Cost() int {
return len(t.postings) - t.cursor
}
func (t *TermTFQuery) String() string {
return fmt.Sprintf("%s/%.2f", t.term, t.idf)
}
func (t *TermTFQuery) Score() float32 {
if t.docId == NO_MORE {
return 0
}
tf := float32(1 + (t.postings[t.cursor] & t.freqMask))
return tf * t.idf * t.boost
}
func (t *TermTFQuery) findBlock(target int32) int32 {
if len(t.blocks) == 0 {
return NO_MORE
}
if len(t.blocks)-t.currentBlockIndex < 32 {
for i := t.currentBlockIndex; i < len(t.blocks); i++ {
current := t.blocks[i]
if target <= current.maxDoc {
t.currentBlockIndex = i
t.currentBlock = current
return target
}
}
return NO_MORE
}
found := sort.Search(len(t.blocks)-t.currentBlockIndex, func(i int) bool {
current := t.blocks[i+t.currentBlockIndex]
return target <= current.maxDoc
}) + t.currentBlockIndex
if found < len(t.blocks) {
t.currentBlockIndex = found
t.currentBlock = t.blocks[found]
return target
}
return NO_MORE
}
func (t *TermTFQuery) Advance(target int32) int32 {
if target > t.currentBlock.maxDoc {
if t.findBlock(target) == NO_MORE {
t.docId = NO_MORE
return NO_MORE
}
}
if t.cursor < 0 {
t.cursor = 0
}
t.docId = NO_MORE
for i := t.cursor; i <= t.currentBlock.maxIdx; i++ {
x := t.postings[i] >> t.freqBits
if x >= target {
t.cursor = i
t.docId = x
return x
}
}
// invariant, can not happen because at this point we will be within the block
// panic?
return t.docId
}
func (t *TermTFQuery) Next() int32 {
t.cursor++
if t.cursor >= len(t.postings) {
t.docId = NO_MORE
} else {
t.docId = t.postings[t.cursor] >> t.freqBits
}
return t.docId
}
func (t *TermTFQuery) SetBoost(b float32) Query {
t.boost = b
return t
}
func (t *TermTFQuery) PayloadDecode(p Payload) {
panic("unsupported")
}
func (t *TermTFQuery) AddSubQuery(Query) Query {
panic("unsupported")
}