Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 25 additions & 5 deletions lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,9 @@ public class FSTCompiler<T> {

private final IntsRefBuilder lastInput = new IntsRefBuilder();

// indicates whether we are not yet to write the padding byte
private boolean paddingBytePending;

// NOTE: cutting this over to ArrayList instead loses ~6%
// in build performance on 9.8M Wikipedia terms; so we
// left this as an array:
Expand Down Expand Up @@ -164,15 +167,14 @@ private FSTCompiler(
boolean allowFixedLengthArcs,
DataOutput dataOutput,
float directAddressingMaxOversizingFactor,
int version)
throws IOException {
int version) {
this.allowFixedLengthArcs = allowFixedLengthArcs;
this.directAddressingMaxOversizingFactor = directAddressingMaxOversizingFactor;
this.version = version;
// pad: ensure no node gets address 0 which is reserved to mean
// the stop state w/ no arcs
dataOutput.writeByte((byte) 0);
// the stop state w/ no arcs. the actual byte will be written lazily
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm could we instead add a boolean paddingBytePending or so? Set it to true, here, then when the byte is lazily written, set it to false, write the byte, and increment numBytesWritten at that point? Otherwise it's sort of weird to increment numBytesWritten when we didn't actually write the byte yet?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

numBytesWritten is used for determine the address of the to be written nodes, so if we don't increment it here, it would mess up the address.

numBytesWritten++;
paddingBytePending = true;
this.dataOutput = dataOutput;
fst =
new FST<>(
Expand Down Expand Up @@ -344,7 +346,7 @@ public Builder<T> setVersion(int version) {
}

/** Creates a new {@link FSTCompiler}. */
public FSTCompiler<T> build() throws IOException {
public FSTCompiler<T> build() {
// create a default DataOutput if not specified
if (dataOutput == null) {
dataOutput = getOnHeapReaderWriter(15);
Expand Down Expand Up @@ -552,13 +554,27 @@ long addNode(FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
}

reverseScratchBytes();
// write the padding byte if needed
if (paddingBytePending) {
writePaddingByte();
}
scratchBytes.writeTo(dataOutput);
numBytesWritten += scratchBytes.getPosition();

nodeCount++;
return numBytesWritten - 1;
}

/**
* Write the padding byte, ensure no node gets address 0 which is reserved to mean the stop state
* w/ no arcs
*/
private void writePaddingByte() throws IOException {
assert paddingBytePending;
dataOutput.writeByte((byte) 0);
paddingBytePending = false;
}

private void writeLabel(DataOutput out, int v) throws IOException {
assert v >= 0 : "v=" + v;
if (fst.metadata.inputType == INPUT_TYPE.BYTE1) {
Expand Down Expand Up @@ -967,7 +983,11 @@ public FST<T> compile() throws IOException {
freezeTail(0);
if (root.numArcs == 0) {
if (fst.metadata.emptyOutput == null) {
// return null for completely empty FST which accepts nothing
return null;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe add a comment that this means a completely empty FST? Accepts nothing?

} else {
// we haven't written the padding byte so far, but the FST is still valid
writePaddingByte();
}
}

Expand Down