Skip to content

Commit 994aaec

Browse files
committed
LUCENE-2792: add FST impl
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1044834 13f79535-47bb-0310-9956-ffa450edef68
1 parent c45253d commit 994aaec

22 files changed

+4288
-137
lines changed

Diff for: .hgignore

+2
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
syntax: glob
22
*/build/*
3+
*.class
4+

Diff for: lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java

+46-106
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,16 @@
3131
import org.apache.lucene.util.Bits;
3232
import org.apache.lucene.util.StringHelper;
3333
import org.apache.lucene.util.UnicodeUtil;
34+
import org.apache.lucene.util.automaton.fst.Builder;
35+
import org.apache.lucene.util.automaton.fst.BytesRefFSTEnum;
36+
import org.apache.lucene.util.automaton.fst.FST;
37+
import org.apache.lucene.util.automaton.fst.PositiveIntOutputs;
38+
import org.apache.lucene.util.automaton.fst.PairOutputs;
3439

3540
import java.io.IOException;
3641
import java.util.Comparator;
3742
import java.util.Map;
38-
import java.util.Set;
3943
import java.util.HashMap;
40-
import java.util.TreeMap;
41-
import java.util.SortedMap;
42-
import java.util.Iterator;
4344

4445
class SimpleTextFieldsReader extends FieldsProducer {
4546

@@ -116,73 +117,39 @@ public TermsEnum terms() throws IOException {
116117
private class SimpleTextTermsEnum extends TermsEnum {
117118
private final IndexInput in;
118119
private final boolean omitTF;
119-
private BytesRef current;
120120
private int docFreq;
121121
private long docsStart;
122122
private boolean ended;
123-
private final TreeMap<BytesRef,TermData> allTerms;
124-
private Iterator<Map.Entry<BytesRef,TermData>> iter;
123+
private final BytesRefFSTEnum<PairOutputs.Pair<Long,Long>> fstEnum;
125124

126-
public SimpleTextTermsEnum(TreeMap<BytesRef,TermData> allTerms, boolean omitTF) throws IOException {
125+
public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,Long>> fst, boolean omitTF) throws IOException {
127126
this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
128-
this.allTerms = allTerms;
129127
this.omitTF = omitTF;
130-
iter = allTerms.entrySet().iterator();
128+
fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,Long>>(fst);
131129
}
132130

133131
public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException {
134-
135-
final SortedMap<BytesRef,TermData> tailMap = allTerms.tailMap(text);
136132

137-
if (tailMap.isEmpty()) {
138-
current = null;
133+
fstEnum.reset();
134+
//System.out.println("seek to text=" + text.utf8ToString());
135+
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.advance(text);
136+
if (result == null) {
137+
//System.out.println(" end");
139138
return SeekStatus.END;
140139
} else {
141-
current = tailMap.firstKey();
142-
final TermData td = tailMap.get(current);
143-
docsStart = td.docsStart;
144-
docFreq = td.docFreq;
145-
iter = tailMap.entrySet().iterator();
146-
assert iter.hasNext();
147-
iter.next();
148-
if (current.equals(text)) {
149-
return SeekStatus.FOUND;
150-
} else {
151-
return SeekStatus.NOT_FOUND;
152-
}
153-
}
154-
155-
/*
156-
if (current != null) {
157-
final int cmp = current.compareTo(text);
158-
if (cmp == 0) {
159-
return SeekStatus.FOUND;
160-
} else if (cmp > 0) {
161-
ended = false;
162-
in.seek(fieldStart);
163-
}
164-
} else {
165-
ended = false;
166-
in.seek(fieldStart);
167-
}
140+
//System.out.println(" got text=" + term.utf8ToString());
141+
PairOutputs.Pair<Long,Long> pair = result.output;
142+
docsStart = pair.output1;
143+
docFreq = pair.output2.intValue();
168144

169-
// Naive!! This just scans... would be better to do
170-
// up-front scan to build in-RAM index
171-
BytesRef b;
172-
while((b = next()) != null) {
173-
final int cmp = b.compareTo(text);
174-
if (cmp == 0) {
175-
ended = false;
145+
if (result.input.equals(text)) {
146+
//System.out.println(" match docsStart=" + docsStart);
176147
return SeekStatus.FOUND;
177-
} else if (cmp > 0) {
178-
ended = false;
148+
} else {
149+
//System.out.println(" not match docsStart=" + docsStart);
179150
return SeekStatus.NOT_FOUND;
180151
}
181152
}
182-
current = null;
183-
ended = true;
184-
return SeekStatus.END;
185-
*/
186153
}
187154

188155
@Override
@@ -192,56 +159,20 @@ public void cacheCurrentTerm() {
192159
@Override
193160
public BytesRef next() throws IOException {
194161
assert !ended;
195-
196-
if (iter.hasNext()) {
197-
Map.Entry<BytesRef,TermData> ent = iter.next();
198-
current = ent.getKey();
199-
TermData td = ent.getValue();
200-
docFreq = td.docFreq;
201-
docsStart = td.docsStart;
202-
return current;
162+
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.next();
163+
if (result != null) {
164+
final PairOutputs.Pair<Long,Long> pair = result.output;
165+
docsStart = pair.output1;
166+
docFreq = pair.output2.intValue();
167+
return result.input;
203168
} else {
204-
current = null;
205-
return null;
206-
}
207-
208-
/*
209-
readLine(in, scratch);
210-
if (scratch.equals(END) || scratch.startsWith(FIELD)) {
211-
ended = true;
212-
current = null;
213169
return null;
214-
} else {
215-
assert scratch.startsWith(TERM): "got " + scratch.utf8ToString();
216-
docsStart = in.getFilePointer();
217-
final int len = scratch.length - TERM.length;
218-
if (len > scratch2.length) {
219-
scratch2.grow(len);
220-
}
221-
System.arraycopy(scratch.bytes, TERM.length, scratch2.bytes, 0, len);
222-
scratch2.length = len;
223-
current = scratch2;
224-
docFreq = 0;
225-
long lineStart = 0;
226-
while(true) {
227-
lineStart = in.getFilePointer();
228-
readLine(in, scratch);
229-
if (scratch.equals(END) || scratch.startsWith(FIELD) || scratch.startsWith(TERM)) {
230-
break;
231-
}
232-
if (scratch.startsWith(DOC)) {
233-
docFreq++;
234-
}
235-
}
236-
in.seek(lineStart);
237-
return current;
238170
}
239-
*/
240171
}
241172

242173
@Override
243174
public BytesRef term() {
244-
return current;
175+
return fstEnum.current().input;
245176
}
246177

247178
@Override
@@ -512,10 +443,7 @@ private class SimpleTextTerms extends Terms {
512443
private final String field;
513444
private final long termsStart;
514445
private final boolean omitTF;
515-
516-
// NOTE: horribly, horribly RAM consuming, but then
517-
// SimpleText should never be used in production
518-
private final TreeMap<BytesRef,TermData> allTerms = new TreeMap<BytesRef,TermData>();
446+
private FST<PairOutputs.Pair<Long,Long>> fst;
519447

520448
private final BytesRef scratch = new BytesRef(10);
521449

@@ -527,6 +455,8 @@ public SimpleTextTerms(String field, long termsStart) throws IOException {
527455
}
528456

529457
private void loadTerms() throws IOException {
458+
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
459+
Builder<PairOutputs.Pair<Long,Long>> b = new Builder<PairOutputs.Pair<Long,Long>>(FST.INPUT_TYPE.BYTE1, 0, 0, true, new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs));
530460
IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
531461
in.seek(termsStart);
532462
final BytesRef lastTerm = new BytesRef(10);
@@ -536,16 +466,14 @@ private void loadTerms() throws IOException {
536466
readLine(in, scratch);
537467
if (scratch.equals(END) || scratch.startsWith(FIELD)) {
538468
if (lastDocsStart != -1) {
539-
allTerms.put(new BytesRef(lastTerm),
540-
new TermData(lastDocsStart, docFreq));
469+
b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
541470
}
542471
break;
543472
} else if (scratch.startsWith(DOC)) {
544473
docFreq++;
545474
} else if (scratch.startsWith(TERM)) {
546475
if (lastDocsStart != -1) {
547-
allTerms.put(new BytesRef(lastTerm),
548-
new TermData(lastDocsStart, docFreq));
476+
b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
549477
}
550478
lastDocsStart = in.getFilePointer();
551479
final int len = scratch.length - TERM.length;
@@ -557,11 +485,23 @@ private void loadTerms() throws IOException {
557485
docFreq = 0;
558486
}
559487
}
488+
fst = b.finish();
489+
/*
490+
PrintStream ps = new PrintStream("out.dot");
491+
fst.toDot(ps);
492+
ps.close();
493+
System.out.println("SAVED out.dot");
494+
*/
495+
//System.out.println("FST " + fst.sizeInBytes());
560496
}
561497

562498
@Override
563499
public TermsEnum iterator() throws IOException {
564-
return new SimpleTextTermsEnum(allTerms, omitTF);
500+
if (fst != null) {
501+
return new SimpleTextTermsEnum(fst, omitTF);
502+
} else {
503+
return TermsEnum.EMPTY;
504+
}
565505
}
566506

567507
@Override

Diff for: lucene/src/java/org/apache/lucene/util/ArrayUtil.java

+27-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import java.util.Collection;
2121
import java.util.Comparator;
22+
import java.lang.reflect.Array;
2223

2324
/**
2425
* Methods for manipulating arrays.
@@ -392,7 +393,7 @@ public static int hashCode(char[] array, int start, int end) {
392393
}
393394

394395
/**
395-
* Returns hash of chars in range start (inclusive) to
396+
* Returns hash of bytes in range start (inclusive) to
396397
* end (inclusive)
397398
*/
398399
public static int hashCode(byte[] array, int start, int end) {
@@ -429,6 +430,31 @@ public static boolean equals(char[] left, int offsetLeft, char[] right, int offs
429430
return false;
430431
}
431432

433+
public static <T> T[] grow(T[] array, int minSize) {
434+
if (array.length < minSize) {
435+
@SuppressWarnings("unchecked") final T[] newArray =
436+
(T[]) Array.newInstance(array.getClass().getComponentType(), oversize(minSize, RamUsageEstimator.NUM_BYTES_OBJ_REF));
437+
System.arraycopy(array, 0, newArray, 0, array.length);
438+
return newArray;
439+
} else
440+
return array;
441+
}
442+
443+
public static <T> T[] grow(T[] array) {
444+
return grow(array, 1 + array.length);
445+
}
446+
447+
public static <T> T[] shrink(T[] array, int targetSize) {
448+
final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_OBJ_REF);
449+
if (newSize != array.length) {
450+
@SuppressWarnings("unchecked") final T[] newArray =
451+
(T[]) Array.newInstance(array.getClass().getComponentType(), newSize);
452+
System.arraycopy(array, 0, newArray, 0, newSize);
453+
return newArray;
454+
} else
455+
return array;
456+
}
457+
432458
// Since Arrays.equals doesn't implement offsets for equals
433459
/**
434460
* See if two array slices are the same.

Diff for: lucene/src/java/org/apache/lucene/util/IntsRef.java

+40-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
* existing int[].
2222
*
2323
* @lucene.internal */
24-
public final class IntsRef {
24+
public final class IntsRef implements Comparable<IntsRef> {
2525

2626
public int[] ints;
2727
public int offset;
@@ -81,6 +81,31 @@ public boolean intsEquals(IntsRef other) {
8181
}
8282
}
8383

84+
/** Signed int order comparison */
85+
public int compareTo(IntsRef other) {
86+
if (this == other) return 0;
87+
88+
final int[] aInts = this.ints;
89+
int aUpto = this.offset;
90+
final int[] bInts = other.ints;
91+
int bUpto = other.offset;
92+
93+
final int aStop = aUpto + Math.min(this.length, other.length);
94+
95+
while(aUpto < aStop) {
96+
int aInt = aInts[aUpto++];
97+
int bInt = bInts[bUpto++];
98+
if (aInt > bInt) {
99+
return 1;
100+
} else if (aInt < bInt) {
101+
return -1;
102+
}
103+
}
104+
105+
// One is a prefix of the other, or, they are equal:
106+
return this.length - other.length;
107+
}
108+
84109
public void copy(IntsRef other) {
85110
if (ints == null) {
86111
ints = new int[other.length];
@@ -97,4 +122,18 @@ public void grow(int newLength) {
97122
ints = ArrayUtil.grow(ints, newLength);
98123
}
99124
}
125+
126+
public String toString() {
127+
StringBuilder sb = new StringBuilder();
128+
sb.append('[');
129+
final int end = offset + length;
130+
for(int i=offset;i<end;i++) {
131+
if (i > offset) {
132+
sb.append(' ');
133+
}
134+
sb.append(Integer.toHexString(ints[i]));
135+
}
136+
sb.append(']');
137+
return sb.toString();
138+
}
100139
}

Diff for: lucene/src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java

+1-7
Original file line numberDiff line numberDiff line change
@@ -93,13 +93,7 @@ public synchronized byte[] getByteBlock() {
9393
@Override
9494
public synchronized void recycleByteBlocks(byte[][] blocks, int start, int end) {
9595
final int numBlocks = Math.min(maxBufferedBlocks - freeBlocks, end - start);
96-
final int size = freeBlocks + numBlocks;
97-
if (size >= freeByteBlocks.length) {
98-
final byte[][] newBlocks = new byte[ArrayUtil.oversize(size,
99-
RamUsageEstimator.NUM_BYTES_OBJ_REF)][];
100-
System.arraycopy(freeByteBlocks, 0, newBlocks, 0, freeBlocks);
101-
freeByteBlocks = newBlocks;
102-
}
96+
freeByteBlocks = ArrayUtil.grow(freeByteBlocks, freeBlocks + numBlocks);
10397
final int stop = start + numBlocks;
10498
for (int i = start; i < stop; i++) {
10599
freeByteBlocks[freeBlocks++] = blocks[i];

Diff for: lucene/src/java/org/apache/lucene/util/automaton/Automaton.java

+1-4
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@
4040
import java.util.Set;
4141

4242
import org.apache.lucene.util.ArrayUtil;
43-
import org.apache.lucene.util.RamUsageEstimator;
4443

4544
/**
4645
* Finite-state automaton with regular expression operations.
@@ -281,9 +280,7 @@ public State[] getNumberedStates() {
281280
worklist.add(t.to);
282281
t.to.number = upto;
283282
if (upto == numberedStates.length) {
284-
final State[] newArray = new State[ArrayUtil.oversize(1+upto, RamUsageEstimator.NUM_BYTES_OBJ_REF)];
285-
System.arraycopy(numberedStates, 0, newArray, 0, upto);
286-
numberedStates = newArray;
283+
numberedStates = ArrayUtil.grow(numberedStates);
287284
}
288285
numberedStates[upto] = t.to;
289286
upto++;

0 commit comments

Comments
 (0)