31
31
import org .apache .lucene .util .Bits ;
32
32
import org .apache .lucene .util .StringHelper ;
33
33
import org .apache .lucene .util .UnicodeUtil ;
34
+ import org .apache .lucene .util .automaton .fst .Builder ;
35
+ import org .apache .lucene .util .automaton .fst .BytesRefFSTEnum ;
36
+ import org .apache .lucene .util .automaton .fst .FST ;
37
+ import org .apache .lucene .util .automaton .fst .PositiveIntOutputs ;
38
+ import org .apache .lucene .util .automaton .fst .PairOutputs ;
34
39
35
40
import java .io .IOException ;
36
41
import java .util .Comparator ;
37
42
import java .util .Map ;
38
- import java .util .Set ;
39
43
import java .util .HashMap ;
40
- import java .util .TreeMap ;
41
- import java .util .SortedMap ;
42
- import java .util .Iterator ;
43
44
44
45
class SimpleTextFieldsReader extends FieldsProducer {
45
46
@@ -116,73 +117,39 @@ public TermsEnum terms() throws IOException {
116
117
private class SimpleTextTermsEnum extends TermsEnum {
117
118
private final IndexInput in ;
118
119
private final boolean omitTF ;
119
- private BytesRef current ;
120
120
private int docFreq ;
121
121
private long docsStart ;
122
122
private boolean ended ;
123
- private final TreeMap <BytesRef ,TermData > allTerms ;
124
- private Iterator <Map .Entry <BytesRef ,TermData >> iter ;
123
+ private final BytesRefFSTEnum <PairOutputs .Pair <Long ,Long >> fstEnum ;
125
124
126
- public SimpleTextTermsEnum (TreeMap < BytesRef , TermData > allTerms , boolean omitTF ) throws IOException {
125
+ public SimpleTextTermsEnum (FST < PairOutputs . Pair < Long , Long >> fst , boolean omitTF ) throws IOException {
127
126
this .in = (IndexInput ) SimpleTextFieldsReader .this .in .clone ();
128
- this .allTerms = allTerms ;
129
127
this .omitTF = omitTF ;
130
- iter = allTerms . entrySet (). iterator ( );
128
+ fstEnum = new BytesRefFSTEnum < PairOutputs . Pair < Long , Long >>( fst );
131
129
}
132
130
133
131
public SeekStatus seek (BytesRef text , boolean useCache /* ignored */ ) throws IOException {
134
-
135
- final SortedMap <BytesRef ,TermData > tailMap = allTerms .tailMap (text );
136
132
137
- if (tailMap .isEmpty ()) {
138
- current = null ;
133
+ fstEnum .reset ();
134
+ //System.out.println("seek to text=" + text.utf8ToString());
135
+ final BytesRefFSTEnum .InputOutput <PairOutputs .Pair <Long ,Long >> result = fstEnum .advance (text );
136
+ if (result == null ) {
137
+ //System.out.println(" end");
139
138
return SeekStatus .END ;
140
139
} else {
141
- current = tailMap .firstKey ();
142
- final TermData td = tailMap .get (current );
143
- docsStart = td .docsStart ;
144
- docFreq = td .docFreq ;
145
- iter = tailMap .entrySet ().iterator ();
146
- assert iter .hasNext ();
147
- iter .next ();
148
- if (current .equals (text )) {
149
- return SeekStatus .FOUND ;
150
- } else {
151
- return SeekStatus .NOT_FOUND ;
152
- }
153
- }
154
-
155
- /*
156
- if (current != null) {
157
- final int cmp = current.compareTo(text);
158
- if (cmp == 0) {
159
- return SeekStatus.FOUND;
160
- } else if (cmp > 0) {
161
- ended = false;
162
- in.seek(fieldStart);
163
- }
164
- } else {
165
- ended = false;
166
- in.seek(fieldStart);
167
- }
140
+ //System.out.println(" got text=" + term.utf8ToString());
141
+ PairOutputs .Pair <Long ,Long > pair = result .output ;
142
+ docsStart = pair .output1 ;
143
+ docFreq = pair .output2 .intValue ();
168
144
169
- // Naive!! This just scans... would be better to do
170
- // up-front scan to build in-RAM index
171
- BytesRef b;
172
- while((b = next()) != null) {
173
- final int cmp = b.compareTo(text);
174
- if (cmp == 0) {
175
- ended = false;
145
+ if (result .input .equals (text )) {
146
+ //System.out.println(" match docsStart=" + docsStart);
176
147
return SeekStatus .FOUND ;
177
- } else if (cmp > 0) {
178
- ended = false ;
148
+ } else {
149
+ //System.out.println(" not match docsStart=" + docsStart) ;
179
150
return SeekStatus .NOT_FOUND ;
180
151
}
181
152
}
182
- current = null;
183
- ended = true;
184
- return SeekStatus.END;
185
- */
186
153
}
187
154
188
155
@ Override
@@ -192,56 +159,20 @@ public void cacheCurrentTerm() {
192
159
@ Override
193
160
public BytesRef next () throws IOException {
194
161
assert !ended ;
195
-
196
- if (iter .hasNext ()) {
197
- Map .Entry <BytesRef ,TermData > ent = iter .next ();
198
- current = ent .getKey ();
199
- TermData td = ent .getValue ();
200
- docFreq = td .docFreq ;
201
- docsStart = td .docsStart ;
202
- return current ;
162
+ final BytesRefFSTEnum .InputOutput <PairOutputs .Pair <Long ,Long >> result = fstEnum .next ();
163
+ if (result != null ) {
164
+ final PairOutputs .Pair <Long ,Long > pair = result .output ;
165
+ docsStart = pair .output1 ;
166
+ docFreq = pair .output2 .intValue ();
167
+ return result .input ;
203
168
} else {
204
- current = null ;
205
- return null ;
206
- }
207
-
208
- /*
209
- readLine(in, scratch);
210
- if (scratch.equals(END) || scratch.startsWith(FIELD)) {
211
- ended = true;
212
- current = null;
213
169
return null ;
214
- } else {
215
- assert scratch.startsWith(TERM): "got " + scratch.utf8ToString();
216
- docsStart = in.getFilePointer();
217
- final int len = scratch.length - TERM.length;
218
- if (len > scratch2.length) {
219
- scratch2.grow(len);
220
- }
221
- System.arraycopy(scratch.bytes, TERM.length, scratch2.bytes, 0, len);
222
- scratch2.length = len;
223
- current = scratch2;
224
- docFreq = 0;
225
- long lineStart = 0;
226
- while(true) {
227
- lineStart = in.getFilePointer();
228
- readLine(in, scratch);
229
- if (scratch.equals(END) || scratch.startsWith(FIELD) || scratch.startsWith(TERM)) {
230
- break;
231
- }
232
- if (scratch.startsWith(DOC)) {
233
- docFreq++;
234
- }
235
- }
236
- in.seek(lineStart);
237
- return current;
238
170
}
239
- */
240
171
}
241
172
242
173
@ Override
243
174
public BytesRef term () {
244
- return current ;
175
+ return fstEnum . current (). input ;
245
176
}
246
177
247
178
@ Override
@@ -512,10 +443,7 @@ private class SimpleTextTerms extends Terms {
512
443
private final String field ;
513
444
private final long termsStart ;
514
445
private final boolean omitTF ;
515
-
516
- // NOTE: horribly, horribly RAM consuming, but then
517
- // SimpleText should never be used in production
518
- private final TreeMap <BytesRef ,TermData > allTerms = new TreeMap <BytesRef ,TermData >();
446
+ private FST <PairOutputs .Pair <Long ,Long >> fst ;
519
447
520
448
private final BytesRef scratch = new BytesRef (10 );
521
449
@@ -527,6 +455,8 @@ public SimpleTextTerms(String field, long termsStart) throws IOException {
527
455
}
528
456
529
457
private void loadTerms () throws IOException {
458
+ PositiveIntOutputs posIntOutputs = PositiveIntOutputs .getSingleton (false );
459
+ Builder <PairOutputs .Pair <Long ,Long >> b = new Builder <PairOutputs .Pair <Long ,Long >>(FST .INPUT_TYPE .BYTE1 , 0 , 0 , true , new PairOutputs <Long ,Long >(posIntOutputs , posIntOutputs ));
530
460
IndexInput in = (IndexInput ) SimpleTextFieldsReader .this .in .clone ();
531
461
in .seek (termsStart );
532
462
final BytesRef lastTerm = new BytesRef (10 );
@@ -536,16 +466,14 @@ private void loadTerms() throws IOException {
536
466
readLine (in , scratch );
537
467
if (scratch .equals (END ) || scratch .startsWith (FIELD )) {
538
468
if (lastDocsStart != -1 ) {
539
- allTerms .put (new BytesRef (lastTerm ),
540
- new TermData (lastDocsStart , docFreq ));
469
+ b .add (lastTerm , new PairOutputs .Pair <Long ,Long >(lastDocsStart , Long .valueOf (docFreq )));
541
470
}
542
471
break ;
543
472
} else if (scratch .startsWith (DOC )) {
544
473
docFreq ++;
545
474
} else if (scratch .startsWith (TERM )) {
546
475
if (lastDocsStart != -1 ) {
547
- allTerms .put (new BytesRef (lastTerm ),
548
- new TermData (lastDocsStart , docFreq ));
476
+ b .add (lastTerm , new PairOutputs .Pair <Long ,Long >(lastDocsStart , Long .valueOf (docFreq )));
549
477
}
550
478
lastDocsStart = in .getFilePointer ();
551
479
final int len = scratch .length - TERM .length ;
@@ -557,11 +485,23 @@ private void loadTerms() throws IOException {
557
485
docFreq = 0 ;
558
486
}
559
487
}
488
+ fst = b .finish ();
489
+ /*
490
+ PrintStream ps = new PrintStream("out.dot");
491
+ fst.toDot(ps);
492
+ ps.close();
493
+ System.out.println("SAVED out.dot");
494
+ */
495
+ //System.out.println("FST " + fst.sizeInBytes());
560
496
}
561
497
562
498
@ Override
563
499
public TermsEnum iterator () throws IOException {
564
- return new SimpleTextTermsEnum (allTerms , omitTF );
500
+ if (fst != null ) {
501
+ return new SimpleTextTermsEnum (fst , omitTF );
502
+ } else {
503
+ return TermsEnum .EMPTY ;
504
+ }
565
505
}
566
506
567
507
@ Override
0 commit comments