diff --git a/lucene/codecs/src/java/module-info.java b/lucene/codecs/src/java/module-info.java index 73f53fbf96b9..b7e75f5917c0 100644 --- a/lucene/codecs/src/java/module-info.java +++ b/lucene/codecs/src/java/module-info.java @@ -26,6 +26,8 @@ exports org.apache.lucene.codecs.simpletext; exports org.apache.lucene.codecs.uniformsplit; exports org.apache.lucene.codecs.uniformsplit.sharedterms; + exports org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + exports org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat, @@ -33,7 +35,10 @@ org.apache.lucene.codecs.memory.DirectPostingsFormat, org.apache.lucene.codecs.memory.FSTPostingsFormat, org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat, - org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat; + org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat, + org.apache.lucene.sandbox.codecs.lucene99.randomaccess + .Lucene99RandomAccessDictionaryPostingsFormat; provides org.apache.lucene.codecs.Codec with - org.apache.lucene.codecs.simpletext.SimpleTextCodec; + org.apache.lucene.codecs.simpletext.SimpleTextCodec, + org.apache.lucene.sandbox.codecs.lucene99.randomaccess.Lucene99RandomAccessTermDictCodec; } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java new file mode 100644 index 000000000000..269d1e4753ec --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.BitUtil; + +final class ByteArrayByteSlice implements ByteSlice { + private final byte[] bytes; + + ByteArrayByteSlice(byte[] bytes) { + this.bytes = bytes; + } + + @Override + public long size() { + return bytes.length; + } + + @Override + public void writeAll(DataOutput output) throws IOException { + output.writeBytes(bytes, bytes.length); + } + + @Override + public long getLong(long pos) { + return (long) BitUtil.VH_LE_LONG.get(bytes, (int) pos); + } + + @Override + public byte[] getBytes(long pos, int length) { + if (length == 0) { + return new byte[0]; + } + byte[] result = new byte[length]; + System.arraycopy(bytes, (int) pos, result, 0, length); + return result; + } + + @Override + public void readBytesTo(byte[] destination, long pos, int length) { + if (length == 0) { + return; + } + System.arraycopy(bytes, (int) pos, destination, 0, length); + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java new file mode 100644 index 000000000000..1a3a8a8f0f96 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; + +/** A slice of bytes */ +interface ByteSlice { + long size(); + + void writeAll(DataOutput output) throws IOException; + + long getLong(long pos) throws IOException; + + byte[] getBytes(long pos, int length) throws IOException; + + void readBytesTo(byte[] destination, long pos, int length) throws IOException; +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSliceProvider.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSliceProvider.java new file mode 100644 index 000000000000..7d18abc5e0a4 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSliceProvider.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; + +/** Factory of {@link ByteSlice} */ +@FunctionalInterface +interface ByteSliceProvider { + ByteSlice newByteSlice() throws IOException; +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java new file mode 100644 index 000000000000..4b616486cad0 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.util.IOUtils; + +/** + * Similar to {@link Lucene99PostingsFormat} but with a different term dictionary implementation. 
+ *
+ * @lucene.experimental
+ */
+public final class Lucene99RandomAccessDictionaryPostingsFormat extends PostingsFormat {
+  static final String TERM_DICT_META_HEADER_CODEC_NAME = "RandomAccessTermsDict";
+  static final String TERM_INDEX_HEADER_CODEC_NAME = "RandomAccessTermsDictIndex";
+  static final String TERM_DATA_META_HEADER_CODEC_NAME_PREFIX = "RandomAccessTermsDictTermDataMeta";
+  static final String TERM_DATA_HEADER_CODEC_NAME_PREFIX = "RandomAccessTermsDictTermData";
+
+  static final String TERM_DICT_META_INFO_EXTENSION = "tmeta";
+  static final String TERM_INDEX_EXTENSION = "tidx";
+  static final String TERM_DATA_META_EXTENSION_PREFIX = "tdm";
+  static final String TERM_DATA_EXTENSION_PREFIX = "tdd";
+
+  // Increment VERSION_CURRENT whenever the on-disk format changes
+  static final int VERSION_START = 0;
+  static final int VERSION_CURRENT = VERSION_START;
+
+  /** Creates {@code Lucene99RandomAccessDictionaryPostingsFormat} */
+  public Lucene99RandomAccessDictionaryPostingsFormat() {
+    super("Lucene99RandomAccess");
+  }
+
+  @Override
+  public String toString() {
+    return getName();
+  }
+
+  @Override
+  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
+    Lucene99PostingsWriter postingsWriter = new Lucene99PostingsWriter(state);
+    boolean success = false;
+    try {
+      FieldsConsumer ret = new Lucene99RandomAccessTermsWriter(state, postingsWriter);
+      success = true;
+      return ret;
+    } finally {
+      if (!success) {
+        IOUtils.closeWhileHandlingException(postingsWriter);
+      }
+    }
+  }
+
+  @Override
+  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
+    Lucene99PostingsReader postingsReader = new Lucene99PostingsReader(state);
+    boolean success = false;
+    try {
+      FieldsProducer ret = new Lucene99RandomAccessTermsReader(postingsReader, state);
+      success = true;
+      return ret;
+    } finally {
+      if (!success) {
+        IOUtils.closeWhileHandlingException(postingsReader);
+      }
+    }
+  }
+}
diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermDictCodec.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermDictCodec.java
new file mode 100644
index 000000000000..255da4ed80cb
--- /dev/null
+++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermDictCodec.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99Codec; +import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; + +/** + * A Codec that uses {@link Lucene99RandomAccessDictionaryPostingsFormat} on top of {@link + * Lucene99Codec} + */ +public class Lucene99RandomAccessTermDictCodec extends FilterCodec { + private final Lucene99RandomAccessDictionaryPostingsFormat lucene99RandomAccessPostingsFormat = + new Lucene99RandomAccessDictionaryPostingsFormat(); + + public Lucene99RandomAccessTermDictCodec() { + super("Lucene99RandomAccessTermDict", new Lucene99Codec()); + } + + @Override + public PostingsFormat postingsFormat() { + return new PerFieldPostingsFormat() { + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return lucene99RandomAccessPostingsFormat; + } + }; + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java new file mode 100644 index 000000000000..4079b0e5d779 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.lucene.sandbox.codecs.lucene99.randomaccess;
+
+import static org.apache.lucene.sandbox.codecs.lucene99.randomaccess.Lucene99RandomAccessDictionaryPostingsFormat.*;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.IOUtils;
+
+final class Lucene99RandomAccessTermsReader extends FieldsProducer {
+  private final Lucene99PostingsReader postingsReader;
+  private final SegmentReadState segmentReadState;
+
+  private final IndexFilesManager indexFilesManager;
+
+  private final HashMap<String, TermsImpl> perFieldTermDict;
+
+  private boolean closed;
+
+  Lucene99RandomAccessTermsReader(
+      Lucene99PostingsReader postingsReader, SegmentReadState segmentReadState) throws IOException {
+    this.postingsReader = postingsReader;
+    this.segmentReadState = segmentReadState;
+    this.perFieldTermDict = new HashMap<>();
+    boolean success = false;
+    IndexFilesManager tmpIndexFilesManager = null;
+    try {
+      boolean indexManagerInitSuccess = false;
+      try {
+        tmpIndexFilesManager = new IndexFilesManager();
+        this.indexFilesManager = tmpIndexFilesManager;
+        indexManagerInitSuccess = true;
+      } finally {
+        if (!indexManagerInitSuccess) {
+          IOUtils.closeWhileHandlingException(tmpIndexFilesManager);
+        }
+      }
+      int numFields = indexFilesManager.metaInfoIn.readVInt();
+      assert numFields > 0;
+      for (int i = 0; i < numFields; i++) {
+        RandomAccessTermsDict termsDict =
+            RandomAccessTermsDict.deserialize(
+                new RandomAccessTermsDict.IndexOptionsProvider() {
+                  @Override
+                  public IndexOptions getIndexOptions(int fieldNumber) {
+                    return segmentReadState.fieldInfos.fieldInfo(fieldNumber).getIndexOptions();
+                  }
+
+                  @Override
+                  public boolean hasPayloads(int fieldNumber) {
+                    return segmentReadState.fieldInfos.fieldInfo(fieldNumber).hasPayloads();
+                  }
+                },
+                indexFilesManager.metaInfoIn,
+                indexFilesManager.termIndexIn,
+                indexFilesManager);
+
+        if (termsDict.termsStats().size() > 0) {
+          FieldInfo fieldInfo =
+              segmentReadState.fieldInfos.fieldInfo(termsDict.termsStats().fieldNumber());
+          String fieldName = fieldInfo.name;
+          perFieldTermDict.put(fieldName, new TermsImpl(fieldInfo, termsDict, postingsReader));
+        }
+      }
+      success = true;
+    } finally {
+      if (!success) {
+        IOUtils.closeWhileHandlingException(this);
+      }
+    }
+  }
+
+  @Override
+  public void close() throws IOException {
+    if (closed) {
+      return;
+    }
+    try {
+      IOUtils.close(indexFilesManager, postingsReader);
+    } finally {
+      // The per-field term dictionary would be invalid once the underlying index files have been
+      // closed.
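+      // Clearing the map also makes terms(field) return null for any late caller, rather than
+      // handing out a Terms instance backed by already-closed inputs.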
+      closed = true;
+      perFieldTermDict.clear();
+    }
+  }
+
+  @Override
+  public void checkIntegrity() throws IOException {
+    // Integrity is already checked in indexFilesManager
+  }
+
+  @Override
+  public Iterator<String> iterator() {
+    return perFieldTermDict.keySet().iterator();
+  }
+
+  @Override
+  public Terms terms(String field) throws IOException {
+    return perFieldTermDict.get(field);
+  }
+
+  @Override
+  public int size() {
+    return perFieldTermDict.size();
+  }
+
+  class IndexFilesManager implements RandomAccessTermsDict.TermDataInputProvider, Closeable {
+    private IndexInput metaInfoIn;
+
+    private IndexInput termIndexIn;
+
+    private final HashMap<TermType, RandomAccessTermsDict.TermDataInput> termDataInputPerType;
+
+    private boolean closed;
+
+    private final ArrayList<IndexInput> openedInputs;
+
+    public IndexFilesManager() throws IOException {
+      termDataInputPerType = new HashMap<>();
+      openedInputs = new ArrayList<>();
+      metaInfoIn = initMetaInfoInput();
+      termIndexIn = initTermIndexInput();
+    }
+
+    private IndexInput initMetaInfoInput() throws IOException {
+      final IndexInput tmp = openAndChecksumIndexInputSafe(TERM_DICT_META_INFO_EXTENSION, false);
+      checkHeader(tmp, TERM_DICT_META_HEADER_CODEC_NAME);
+      postingsReader.init(tmp, segmentReadState);
+      postingsReader.checkIntegrity();
+      return tmp;
+    }
+
+    private IndexInput initTermIndexInput() throws IOException {
+      final IndexInput tmp = openAndChecksumIndexInputSafe(TERM_INDEX_EXTENSION, true);
+      checkHeader(tmp, TERM_INDEX_HEADER_CODEC_NAME);
+      return tmp;
+    }
+
+    private RandomAccessTermsDict.TermDataInput openTermDataInput(TermType termType)
+        throws IOException {
+      final IndexInput metaTmp;
+      final IndexInput dataTmp;
+      metaTmp =
+          openAndChecksumIndexInputSafe(TERM_DATA_META_EXTENSION_PREFIX + termType.getId(), true);
+      checkHeader(metaTmp, TERM_DATA_META_HEADER_CODEC_NAME_PREFIX + termType.getId());
+
+      dataTmp = openAndChecksumIndexInputSafe(TERM_DATA_EXTENSION_PREFIX + termType.getId(), true);
+      checkHeader(dataTmp, TERM_DATA_HEADER_CODEC_NAME_PREFIX + termType.getId());
+
+      return new RandomAccessTermsDict.TermDataInput(metaTmp, dataTmp);
+    }
+
+    /**
+     * Open an IndexInput for a segment local name. The IndexInput will be closed if any error
+     * happens during opening and verification.
+     */
+    private IndexInput openAndChecksumIndexInputSafe(
+        String segmentLocalName, boolean needRandomAccess) throws IOException {
+      String name =
+          IndexFileNames.segmentFileName(
+              segmentReadState.segmentInfo.name, segmentReadState.segmentSuffix, segmentLocalName);
+
+      boolean success = false;
+      IndexInput input = null;
+      try {
+        input =
+            segmentReadState.directory.openInput(
+                name, needRandomAccess ?
IOContext.LOAD : IOContext.READ); + openedInputs.add(input); + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(input, this); + } + } + CodecUtil.checksumEntireFile(input); + return input; + } + + private void checkHeader(IndexInput input, String headerName) throws IOException { + CodecUtil.checkIndexHeader( + input, + headerName, + Lucene99RandomAccessDictionaryPostingsFormat.VERSION_START, + Lucene99RandomAccessDictionaryPostingsFormat.VERSION_CURRENT, + segmentReadState.segmentInfo.getId(), + segmentReadState.segmentSuffix); + } + + @Override + public RandomAccessTermsDict.TermDataInput getTermDataInputForType(TermType termType) + throws IOException { + RandomAccessTermsDict.TermDataInput current = termDataInputPerType.get(termType); + if (current == null) { + current = openTermDataInput(termType); + termDataInputPerType.put(termType, current); + } + return current; + } + + @Override + public void close() throws IOException { + if (this.closed) { + return; + } + this.closed = true; + IOUtils.close(openedInputs); + } + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java new file mode 100644 index 000000000000..3fd7fdcf111c --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java @@ -0,0 +1,242 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.lucene.sandbox.codecs.lucene99.randomaccess;
+
+import static org.apache.lucene.sandbox.codecs.lucene99.randomaccess.Lucene99RandomAccessDictionaryPostingsFormat.*;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.NormsProducer;
+import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
+import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.RandomAccessTermsDictWriter.TermDataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.IOUtils;
+
+final class Lucene99RandomAccessTermsWriter extends FieldsConsumer {
+
+  private final SegmentWriteState segmentWriteState;
+
+  private final Lucene99PostingsWriter postingsWriter;
+
+  private final IndexFilesManager indexFilesManager;
+
+  private boolean closed;
+
+  public Lucene99RandomAccessTermsWriter(
+      SegmentWriteState segmentWriteState, Lucene99PostingsWriter postingsWriter)
+      throws IOException {
+    this.segmentWriteState = segmentWriteState;
+    this.postingsWriter = postingsWriter;
+    IndexFilesManager tmpIndexFilesManager = null;
+    boolean indexManagerInitSuccess = false;
+    try {
+      tmpIndexFilesManager = new IndexFilesManager();
+      this.indexFilesManager = tmpIndexFilesManager;
+      indexManagerInitSuccess = true;
+    } finally {
+      if (!indexManagerInitSuccess) {
+        IOUtils.closeWhileHandlingException(tmpIndexFilesManager, this);
+      }
+    }
+  }
+
+  @Override
+  public void write(Fields fields, NormsProducer norms) throws IOException {
+    HashMap<String, Terms> nonEmptyFields = new HashMap<>();
+    for (String field : fields) {
+      Terms terms = fields.terms(field);
+      if (terms != null) {
+        nonEmptyFields.put(field, terms);
+      }
+    }
+    boolean success = false;
+    try {
+      indexFilesManager.writeAllHeaders();
+      postingsWriter.init(indexFilesManager.metaInfoOut, segmentWriteState);
+      indexFilesManager.metaInfoOut.writeVInt(nonEmptyFields.size());
+
+      FixedBitSet docSeen = new FixedBitSet(segmentWriteState.segmentInfo.maxDoc());
+      for (var entry : nonEmptyFields.entrySet()) {
+        TermsEnum termsEnum = entry.getValue().iterator();
+        FieldInfo fieldInfo = segmentWriteState.fieldInfos.fieldInfo(entry.getKey());
+        RandomAccessTermsDictWriter termsDictWriter =
+            new RandomAccessTermsDictWriter(
+                fieldInfo.number,
+                fieldInfo.getIndexOptions(),
+                fieldInfo.hasPayloads(),
+                indexFilesManager.metaInfoOut,
+                indexFilesManager.termIndexOut,
+                indexFilesManager);
+        postingsWriter.setField(fieldInfo);
+
+        docSeen.clear();
+        while (true) {
+          BytesRef term = termsEnum.next();
+          if (term == null) {
+            break;
+          }
+
+          IntBlockTermState termState =
+              (IntBlockTermState) postingsWriter.writeTerm(term, termsEnum, docSeen, norms);
+          // TermState can be null
+          if (termState != null) {
+            termsDictWriter.add(term, termState);
+          }
+        }
+        termsDictWriter.finish(docSeen.cardinality());
+      }
+      indexFilesManager.writeAllFooters();
+      success = true;
+    } finally {
+      if (!success) {
+        IOUtils.closeWhileHandlingException(this);
+      }
+    }
+  }
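+
+  // An illustrative sketch (not part of this patch) of how this write path is driven, assuming
+  // the standard o.a.l.document/index classes; each segment flush ends up calling
+  // write(Fields, NormsProducer) above:
+  //
+  //   IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
+  //   config.setCodec(new Lucene99RandomAccessTermDictCodec());
+  //   try (IndexWriter writer = new IndexWriter(directory, config)) {
+  //     Document doc = new Document();
+  //     doc.add(new TextField("body", "random access term dictionary", Field.Store.NO));
+  //     writer.addDocument(doc);
+  //   }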
+
+  @Override
+  public void close() throws IOException {
+    if (closed) {
+      return;
+    }
+    IOUtils.close(indexFilesManager, postingsWriter);
+
+    closed = true;
+  }
+
+  /**
+   * Manages the index output files needed. It writes the index header on creation and the footer
+   * upon closing.
+   */
+  class IndexFilesManager implements RandomAccessTermsDictWriter.TermDataOutputProvider, Closeable {
+
+    private final IndexOutput metaInfoOut;
+
+    private final IndexOutput termIndexOut;
+
+    private final HashMap<TermType, TermDataOutput> termDataOutputPerType;
+
+    private boolean closed;
+
+    private final ArrayList<IndexOutput> openedOutputs;
+
+    public IndexFilesManager() throws IOException {
+      // populate the per-TermType term data outputs on-demand.
+      termDataOutputPerType = new HashMap<>();
+      openedOutputs = new ArrayList<>();
+      metaInfoOut = initMetaInfoOutput();
+      termIndexOut = initTermIndexOutput();
+    }
+
+    private IndexOutput initMetaInfoOutput() throws IOException {
+      return getIndexOutputSafe(TERM_DICT_META_INFO_EXTENSION);
+    }
+
+    private IndexOutput initTermIndexOutput() throws IOException {
+      return getIndexOutputSafe(TERM_INDEX_EXTENSION);
+    }
+
+    private TermDataOutput initTermDataOutput(TermType termType) throws IOException {
+      final IndexOutput metaTmp;
+      final IndexOutput dataTmp;
+      metaTmp = getIndexOutputSafe(TERM_DATA_META_EXTENSION_PREFIX + termType.getId());
+      writeHeader(metaTmp, TERM_DATA_META_HEADER_CODEC_NAME_PREFIX + termType.getId());
+
+      dataTmp = getIndexOutputSafe(TERM_DATA_EXTENSION_PREFIX + termType.getId());
+      writeHeader(dataTmp, TERM_DATA_HEADER_CODEC_NAME_PREFIX + termType.getId());
+
+      return new TermDataOutput(metaTmp, dataTmp);
+    }
+
+    /**
+     * Get an IndexOutput for a segment local name. The output will be closed if any error
+     * happens during creation.
+ */ + private IndexOutput getIndexOutputSafe(String segmentLocalName) throws IOException { + String name = + IndexFileNames.segmentFileName( + segmentWriteState.segmentInfo.name, + segmentWriteState.segmentSuffix, + segmentLocalName); + + boolean success = false; + IndexOutput output = null; + try { + output = segmentWriteState.directory.createOutput(name, segmentWriteState.context); + openedOutputs.add(output); + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(output); + } + } + return output; + } + + private void writeHeader(IndexOutput output, String headerName) throws IOException { + CodecUtil.writeIndexHeader( + output, + headerName, + Lucene99RandomAccessDictionaryPostingsFormat.VERSION_CURRENT, + segmentWriteState.segmentInfo.getId(), + segmentWriteState.segmentSuffix); + } + + private void writeAllHeaders() throws IOException { + writeHeader(metaInfoOut, TERM_DICT_META_HEADER_CODEC_NAME); + writeHeader(termIndexOut, TERM_INDEX_HEADER_CODEC_NAME); + } + + private void writeAllFooters() throws IOException { + for (var x : openedOutputs) { + CodecUtil.writeFooter(x); + } + } + + @Override + public TermDataOutput getTermDataOutputForType(TermType termType) throws IOException { + TermDataOutput current = termDataOutputPerType.get(termType); + if (current == null) { + current = initTermDataOutput(termType); + termDataOutputPerType.put(termType, current); + } + return current; + } + + @Override + public void close() throws IOException { + if (this.closed) { + return; + } + this.closed = true; + IOUtils.close(openedOutputs); + } + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java new file mode 100644 index 000000000000..845b0f22aed4 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.RandomAccessInput; + +final class RandomAccessInputByteSlice implements ByteSlice { + private final RandomAccessInput randomAccessInput; + + RandomAccessInputByteSlice(RandomAccessInput randomAccessInput) { + this.randomAccessInput = randomAccessInput; + } + + @Override + public long size() { + return randomAccessInput.length(); + } + + @Override + public void writeAll(DataOutput output) throws IOException { + for (long pos = 0; pos < randomAccessInput.length(); pos++) { + // For buffered inputs and outputs this should be fine. + output.writeByte(randomAccessInput.readByte(pos)); + } + } + + @Override + public long getLong(long pos) throws IOException { + return randomAccessInput.readLong(pos); + } + + @Override + public byte[] getBytes(long pos, int length) throws IOException { + if (length == 0) { + return new byte[0]; + } + byte[] result = new byte[length]; + randomAccessInput.readBytes(pos, result, 0, length); + return result; + } + + @Override + public void readBytesTo(byte[] destination, long pos, int length) throws IOException { + if (length == 0) { + return; + } + randomAccessInput.readBytes(pos, destination, 0, length); + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java new file mode 100644 index 000000000000..da48eb1f57e1 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.lucene.sandbox.codecs.lucene99.randomaccess;
+
+import java.io.IOException;
+import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.BytesRef;
+
+/** A term dictionary that offers random access to read a specific term */
+record RandomAccessTermsDict(
+    TermsStats termsStats,
+    TermsIndexPrimitive termsIndex,
+    TermDataReaderProvider termDataReaderProvider,
+    IndexOptions indexOptions) {
+
+  /** test only */
+  IntBlockTermState getTermState(BytesRef term) throws IOException {
+    TermsIndex.TypeAndOrd typeAndOrd = termsIndex.getTerm(term);
+    return termDataReaderProvider
+        .newReader()
+        .getTermState(typeAndOrd.termType(), typeAndOrd.ord(), indexOptions);
+  }
+
+  static RandomAccessTermsDict deserialize(
+      IndexOptionsProvider indexOptionsProvider,
+      DataInput metaInput,
+      DataInput termIndexInput,
+      TermDataInputProvider termDataInputProvider)
+      throws IOException {
+
+    // (1) deserialize field stats
+    TermsStats termsStats = TermsStats.deserialize(metaInput);
+    IndexOptions indexOptions = indexOptionsProvider.getIndexOptions(termsStats.fieldNumber());
+    boolean hasPayloads = indexOptionsProvider.hasPayloads(termsStats.fieldNumber());
+
+    // (2) deserialize terms index
+    TermsIndexPrimitive termsIndex = null;
+    if (termsStats.size() > 0) {
+      termsIndex =
+          TermsIndexPrimitive.deserialize(metaInput, termIndexInput, /* load off heap */ true);
+    }
+
+    // (3) deserialize all the term data by each TermType
+    // (3.1) number of unique TermType this field has
+    int numTermTypes = metaInput.readByte();
+
+    // (3.2) read per TermType
+    TermDataReaderProvider.Builder termDataReaderBuilder =
+        new TermDataReaderProvider.Builder(indexOptions, hasPayloads);
+    for (int i = 0; i < numTermTypes; i++) {
+      TermType termType = TermType.fromId(metaInput.readByte());
+      TermDataInput termDataInput = termDataInputProvider.getTermDataInputForType(termType);
+      termDataReaderBuilder.readOne(
+          termType, metaInput, termDataInput.metadataInput, termDataInput.dataInput);
+    }
+
+    return new RandomAccessTermsDict(
+        termsStats, termsIndex, termDataReaderBuilder.build(), indexOptions);
+  }
+
+  interface IndexOptionsProvider {
+
+    IndexOptions getIndexOptions(int fieldNumber);
+
+    boolean hasPayloads(int fieldNumber);
+  }
+
+  record TermDataInput(IndexInput metadataInput, IndexInput dataInput) {}
+
+  @FunctionalInterface
+  interface TermDataInputProvider {
+
+    TermDataInput getTermDataInputForType(TermType termType) throws IOException;
+  }
+}
diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java
new file mode 100644
index 000000000000..5002f81c03ea
--- /dev/null
+++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.sandbox.codecs.lucene99.randomaccess;
+
+import java.io.IOException;
+import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+
+/** Class to write the index files for one field. */
+final class RandomAccessTermsDictWriter {
+  /** Externally provided */
+  private final IndexOptions indexOptions;
+
+  private final boolean hasPayloads;
+  private final DataOutput metaOutput;
+  private final DataOutput indexOutput;
+
+  private final TermDataOutputProvider termDataOutputProvider;
+
+  /** Internal state below */
+  private final TermDataOutput[] termDataOutputPerType =
+      new TermDataOutput[TermType.NUM_TOTAL_TYPES];
+
+  private final TermsIndexBuilder termsIndexBuilder;
+
+  private final TermDataWriter[] termDataWriterPerType =
+      new TermDataWriter[TermType.NUM_TOTAL_TYPES];
+
+  private final TermStatsTracker termStatsTracker;
+
+  private BytesRefBuilder previousTerm;
+
+  RandomAccessTermsDictWriter(
+      int fieldNumber,
+      IndexOptions indexOptions,
+      boolean hasPayloads,
+      DataOutput metaOutput,
+      DataOutput indexOutput,
+      TermDataOutputProvider termDataOutputProvider)
+      throws IOException {
+    this.indexOptions = indexOptions;
+    this.hasPayloads = hasPayloads;
+    this.metaOutput = metaOutput;
+    this.indexOutput = indexOutput;
+    this.termDataOutputProvider = termDataOutputProvider;
+    this.termStatsTracker = new TermStatsTracker(fieldNumber);
+    this.termsIndexBuilder = new TermsIndexBuilder();
+  }
+
+  void add(BytesRef term, IntBlockTermState termState) throws IOException {
+    TermType termType = TermType.fromTermState(termState);
+    if (previousTerm == null) {
+      // first term, which is also the minimum term
+      termStatsTracker.setMinTerm(term);
+      previousTerm = new BytesRefBuilder();
+    }
+
+    /* There are interesting conventions to follow...
+     * <pre>
+     *     org.apache.lucene.index.CheckIndex$CheckIndexException:
+     *     field "id" hasFreqs is false, but TermsEnum.totalTermFreq()=0 (should be 1)
+     * </pre>
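+     *
+     * e.g. for a field indexed with DOCS only, a term with docFreq=3 must still report
+     * totalTermFreq=3 (one implied occurrence per posting), hence the adjustment below.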
+     */
+    // For fields that do not have freqs enabled, behave as if each posting has exactly one
+    // occurrence.
+    if (indexOptions.ordinal() < IndexOptions.DOCS_AND_FREQS.ordinal()) {
+      termState.totalTermFreq = termState.docFreq;
+    }
+
+    termStatsTracker.recordTerm(termState);
+    previousTerm.copyBytes(term);
+    termsIndexBuilder.addTerm(term, termType);
+    TermDataWriter termDataWriter = getTermDataWriterForType(termType);
+    termDataWriter.addTermState(termState);
+  }
+
+  private TermDataWriter getTermDataWriterForType(TermType termType) throws IOException {
+    if (termDataWriterPerType[termType.getId()] != null) {
+      return termDataWriterPerType[termType.getId()];
+    }
+
+    TermDataOutput termDataOutput = getTermDataOutput(termType);
+    TermDataWriter termDataWriter =
+        new TermDataWriter(
+            TermStateCodecImpl.getCodec(termType, indexOptions, hasPayloads),
+            termDataOutput.metadataOutput(),
+            termDataOutput.dataOutput());
+    termDataWriterPerType[termType.getId()] = termDataWriter;
+    return termDataWriter;
+  }
+
+  private TermDataOutput getTermDataOutput(TermType termType) throws IOException {
+    if (termDataOutputPerType[termType.getId()] == null) {
+      termDataOutputPerType[termType.getId()] =
+          termDataOutputProvider.getTermDataOutputForType(termType);
+    }
+    return termDataOutputPerType[termType.getId()];
+  }
+
+  void finish(int docCount) throws IOException {
+    // finish up TermsStats for this field
+    if (previousTerm != null) {
+      termStatsTracker.setMaxTerm(previousTerm.toBytesRef());
+    }
+    termStatsTracker.setDocCount(docCount);
+    TermsStats termsStats = termStatsTracker.finish();
+    // (1) Write field metadata
+    termsStats.serialize(metaOutput);
+
+    // (2) serialize the term index
+    termsIndexBuilder.build().serialize(metaOutput, indexOutput);
+
+    // (3) serialize information needed to decode per-TermType TermData
+    // (3.1) number of unique TermTypes this field has
+    int numTermTypesSeen = 0;
+    for (var termDataWriter : termDataWriterPerType) {
+      if (termDataWriter != null) {
+        numTermTypesSeen += 1;
+      }
+    }
+    metaOutput.writeByte((byte) numTermTypesSeen);
+
+    // (3.2) (termType, metadataLength, dataLength) for each TermData
+    for (int i = 0; i < termDataWriterPerType.length; i++) {
+      var termDataWriter = termDataWriterPerType[i];
+      if (termDataWriter != null) {
+        termDataWriter.finish();
+        metaOutput.writeByte((byte) i);
+        metaOutput.writeVLong(termDataWriter.getTotalMetaDataBytesWritten());
+        metaOutput.writeVLong(termDataWriter.getTotalDataBytesWritten());
+      }
+    }
+  }
+
+  record TermDataOutput(IndexOutput metadataOutput, IndexOutput dataOutput) {}
+
+  @FunctionalInterface
+  interface TermDataOutputProvider {
+
+    TermDataOutput getTermDataOutputForType(TermType termType) throws IOException;
+  }
+
+  static final class TermStatsTracker {
+    final int fieldNumber;
+    long size;
+    long sumTotalTermFreq;
+    long sumDocFreq;
+    int docCount;
+    BytesRef minTerm;
+    BytesRef maxTerm;
+
+    TermStatsTracker(int fieldNumber) {
+      this.fieldNumber = fieldNumber;
+    }
+
+    void recordTerm(IntBlockTermState termState) {
+      size += 1;
+      sumDocFreq += termState.docFreq;
+      if (termState.totalTermFreq > 0) {
+        sumTotalTermFreq += termState.totalTermFreq;
+      }
+    }
+
+    void setDocCount(int docCount) {
+      this.docCount = docCount;
+    }
+
+    void setMinTerm(BytesRef minTerm) {
+      this.minTerm = BytesRef.deepCopyOf(minTerm);
+    }
+
+    void setMaxTerm(BytesRef maxTerm) {
+      this.maxTerm = BytesRef.deepCopyOf(maxTerm);
+    }
+
+    TermsStats finish() {
+      return new TermsStats(
+          fieldNumber, size, sumTotalTermFreq, sumDocFreq, docCount, minTerm,
maxTerm); + } + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java new file mode 100644 index 000000000000..06cf69da9aa1 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpackerImpl; +import org.apache.lucene.util.BytesRef; + +/** + * Holds the bit-packed {@link IntBlockTermState} for a given {@link + * org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermType} + */ +record TermData(ByteSlice metadata, ByteSlice data) { + IntBlockTermState getTermState(TermStateCodec codec, long ord) throws IOException { + long blockId = ord / TermDataWriter.NUM_TERMS_PER_BLOCK; + long metadataStartPos = blockId * (codec.getMetadataBytesLength() + 8); + long dataStartPos = metadata.getLong(metadataStartPos); + BytesRef metadataBytesRef = + new BytesRef(metadata.getBytes(metadataStartPos + 8, codec.getMetadataBytesLength())); + + int numBitsPerRecord = codec.getNumBitsPerRecord(metadataBytesRef); + int dataBitIndex = numBitsPerRecord * ((int) (ord % TermDataWriter.NUM_TERMS_PER_BLOCK)); + int startBitIndex = dataBitIndex % 8; + int numBytesToRead = (startBitIndex + numBitsPerRecord) / 8; + if ((startBitIndex + numBitsPerRecord) % 8 > 0) { + numBytesToRead += 1; + } + BytesRef dataBytesRef = + new BytesRef(data.getBytes(dataStartPos + dataBitIndex / 8, numBytesToRead)); + + return codec.decodeAt(metadataBytesRef, dataBytesRef, BitUnpackerImpl.INSTANCE, startBitIndex); + } + + IntBlockTermState getTermStateWithBufferAndReuse( + TermStateCodec codec, + long ord, + byte[] metaDataBuffer, + byte[] dataBuffer, + IntBlockTermState reuse) + throws IOException { + long blockId = ord / TermDataWriter.NUM_TERMS_PER_BLOCK; + long metadataStartPos = blockId * (codec.getMetadataBytesLength() + 8); + long dataStartPos = metadata.getLong(metadataStartPos); + + int metadataLength = codec.getMetadataBytesLength(); + metadata.readBytesTo(metaDataBuffer, metadataStartPos + 8, metadataLength); + BytesRef metadataBytesRef = new BytesRef(metaDataBuffer, 0, metadataLength); + + int numBitsPerRecord = codec.getNumBitsPerRecord(metadataBytesRef); + int dataBitIndex = numBitsPerRecord * ((int) (ord % TermDataWriter.NUM_TERMS_PER_BLOCK)); + int startBitIndex = dataBitIndex % 8; + int numBytesToRead = (startBitIndex + numBitsPerRecord) / 8; + if 
((startBitIndex + numBitsPerRecord) % 8 > 0) {
+      numBytesToRead += 1;
+    }
+    data.readBytesTo(dataBuffer, dataStartPos + dataBitIndex / 8, numBytesToRead);
+    BytesRef dataBytesRef = new BytesRef(dataBuffer, 0, numBytesToRead);
+
+    codec.decodeAtWithReuse(
+        metadataBytesRef, dataBytesRef, BitUnpackerImpl.INSTANCE, startBitIndex, reuse);
+    return reuse;
+  }
+}
diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataProvider.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataProvider.java
new file mode 100644
index 000000000000..130094016c5d
--- /dev/null
+++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataProvider.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.sandbox.codecs.lucene99.randomaccess;
+
+import java.io.IOException;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.IndexInput;
+
+/** Factory class to produce instances of TermData */
+record TermDataProvider(ByteSliceProvider metadataProvider, ByteSliceProvider dataProvider) {
+  static TermDataProvider deserializeOnHeap(
+      DataInput metaInput, DataInput metadataInput, DataInput dataInput) throws IOException {
+    long metadataSize = metaInput.readVLong();
+    long dataSize = metaInput.readVLong();
+
+    if (metadataSize > Integer.MAX_VALUE) {
+      throw new IllegalArgumentException(
+          "Metadata size is too large to store on heap. Must be less than " + Integer.MAX_VALUE);
+    }
+    if (dataSize > Integer.MAX_VALUE) {
+      throw new IllegalArgumentException(
+          "Data size is too large to store on heap. Must be less than " + Integer.MAX_VALUE);
+    }
+
+    byte[] metadataBytes = new byte[(int) metadataSize];
+    byte[] dataBytes = new byte[(int) dataSize];
+
+    metadataInput.readBytes(metadataBytes, 0, metadataBytes.length);
+    dataInput.readBytes(dataBytes, 0, dataBytes.length);
+
+    return new TermDataProvider(
+        () -> new ByteArrayByteSlice(metadataBytes), () -> new ByteArrayByteSlice(dataBytes));
+  }
+
+  static TermDataProvider deserializeOffHeap(
+      DataInput metaInput, IndexInput metadataInput, IndexInput dataInput) throws IOException {
+    final long metadataSize = metaInput.readVLong();
+    final long dataSize = metaInput.readVLong();
+
+    final long metadataStart = metadataInput.getFilePointer();
+    final long dataStart = dataInput.getFilePointer();
+
+    metadataInput.skipBytes(metadataSize);
+    dataInput.skipBytes(dataSize);
+
+    return new TermDataProvider(
+        () ->
+            new RandomAccessInputByteSlice(
+                metadataInput.randomAccessSlice(metadataStart, metadataSize)),
+        () -> new RandomAccessInputByteSlice(dataInput.randomAccessSlice(dataStart, dataSize)));
+  }
+}
diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java
new file mode 100644
index 000000000000..7d66fcd6abc6
--- /dev/null
+++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java
@@ -0,0 +1,154 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.sandbox.codecs.lucene99.randomaccess;
+
+import java.io.IOException;
+import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.IndexInput;
+
+/** Factory class for {@link TermDataReader} which supports term lookup */
+final class TermDataReaderProvider {
+  private final TermDataProviderAndCodec[] termDataProviderAndCodecs;
+
+  /** TermDataReader can be reused by the same thread */
+  private final ThreadLocal<TermDataReader> termDataReaderReuse;
+
+  TermDataReaderProvider(TermDataProviderAndCodec[] termDataProviderAndCodecs) {
+    this.termDataProviderAndCodecs = termDataProviderAndCodecs;
+    termDataReaderReuse = new ThreadLocal<>();
+  }
+
+  TermDataReader newReader() throws IOException {
+    var existingReader = termDataReaderReuse.get();
+    if (existingReader != null) {
+      return existingReader;
+    }
+    var newReader = new TermDataReader();
+    termDataReaderReuse.set(newReader);
+    return newReader;
+  }
+
+  static class Builder {
+    final IndexOptions indexOptions;
+    final boolean hasPayloads;
+    final TermDataProviderAndCodec[] termDataProviderAndCodecs =
+        new TermDataProviderAndCodec[TermType.NUM_TOTAL_TYPES];
+
+    Builder(IndexOptions indexOptions, boolean hasPayloads) {
+      this.indexOptions = indexOptions;
+      this.hasPayloads = hasPayloads;
+    }
+
+    void readOne(
+        TermType termType, DataInput metaIn, IndexInput termMetadataIn, IndexInput termDataIn)
+        throws IOException {
+      TermDataProvider termDataProvider =
+          TermDataProvider.deserializeOffHeap(metaIn, termMetadataIn, termDataIn);
+      TermStateCodec codec = TermStateCodecImpl.getCodec(termType, indexOptions, hasPayloads);
+      termDataProviderAndCodecs[termType.getId()] =
+          new TermDataProviderAndCodec(termDataProvider, codec);
+    }
+
+    TermDataReaderProvider build() {
+      return new TermDataReaderProvider(termDataProviderAndCodecs);
+    }
+  }
+
+  record TermDataProviderAndCodec(TermDataProvider termDataProvider, TermStateCodec codec) {}
+
+  public class TermDataReader {
+    private TermData[] termDataPerType;
+
+    private byte[] metaDataBuffer;
+
+    private byte[] dataBuffer;
+
+    private IntBlockTermState reuse = new IntBlockTermState();
+
+    void maybeInitBuffer() {
+      if (metaDataBuffer == null || dataBuffer == null) {
+        int maxMetadataLengthSeen = 0;
+        int maxDataLengthSeen = 0;
+        for (int i = 0; i < termDataProviderAndCodecs.length; i++) {
+          if (termDataProviderAndCodecs[i] == null) {
+            continue;
+          }
+          var codec = termDataProviderAndCodecs[i].codec;
+          maxMetadataLengthSeen = Math.max(maxMetadataLengthSeen, codec.getMetadataBytesLength());
+          maxDataLengthSeen = Math.max(maxDataLengthSeen, codec.getMaximumRecordSizeInBytes());
+        }
+        metaDataBuffer = new byte[maxMetadataLengthSeen];
+        dataBuffer = new byte[maxDataLengthSeen];
+      }
+    }
+
+    TermData getTermData(int typeId) throws IOException {
+      if (termDataPerType == null) {
+        termDataPerType = new TermData[termDataProviderAndCodecs.length];
+      }
+      if (termDataPerType[typeId] == null) {
+        TermDataProvider termDataProvider = termDataProviderAndCodecs[typeId].termDataProvider;
+        termDataPerType[typeId] =
+            new TermData(
+                termDataProvider.metadataProvider().newByteSlice(),
+                termDataProvider.dataProvider().newByteSlice());
+      }
+      return termDataPerType[typeId];
+    }
+
+    IntBlockTermState getTermState(TermType termType, long ord, IndexOptions indexOptions)
+        throws IOException {
+      assert termDataProviderAndCodecs[termType.getId()] != null;
+
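+      // Lazily size the shared scratch buffers to the largest metadata/record lengths any
+      // per-TermType codec may need; they are then reused across lookups on this thread.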
+      maybeInitBuffer();
+
+      int typeId = termType.getId();
+      var codec = termDataProviderAndCodecs[typeId].codec;
+      var termData = getTermData(typeId);
+      IntBlockTermState termState =
+          termData.getTermStateWithBufferAndReuse(codec, ord, metaDataBuffer, dataBuffer, reuse);
+
+      // We need to fill in some default values for the term state
+      // in order to meet the expectations of the postings reader
+      if (termType.hasSingletonDoc()) {
+        termState.docFreq = 1;
+      }
+      if (termType.hasSkipData() == false) {
+        termState.skipOffset = -1;
+      }
+      if (termType.hasLastPositionBlockOffset() == false) {
+        termState.lastPosBlockOffset = -1;
+      }
+
+      /* There are interesting conventions to follow...
+       * <pre>
+       *     org.apache.lucene.index.CheckIndex$CheckIndexException:
+       *     field "id" hasFreqs is false, but TermsEnum.totalTermFreq()=0 (should be 1)
+       * </pre>
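+       *
+       * (The write side applies the same freq convention in RandomAccessTermsDictWriter#add.)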
+       */
+      // For fields that do not have freqs enabled, behave as if each posting has exactly one
+      // occurrence.
+      if (indexOptions.ordinal() < IndexOptions.DOCS_AND_FREQS.ordinal()) {
+        termState.totalTermFreq = termState.docFreq;
+      }
+
+      return termState;
+    }
+  }
+}
diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java
new file mode 100644
index 000000000000..d69c45de9abc
--- /dev/null
+++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.sandbox.codecs.lucene99.randomaccess;
+
+import java.io.IOException;
+import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
+import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.DataOutputBitPacker;
+import org.apache.lucene.store.DataOutput;
+
+/** Writes TermData to two separate {@link DataOutput}s: one for metadata, another for term data */
+final class TermDataWriter {
+  static final int NUM_TERMS_PER_BLOCK = 256;
+
+  private final TermStateCodec termStateCodec;
+
+  private final IntBlockTermStateBuffer buffer = new IntBlockTermStateBuffer(NUM_TERMS_PER_BLOCK);
+
+  private final DataOutput metadataOut;
+  private final DataOutputBitPacker dataOutputBitPacker;
+
+  private long totalMetaDataBytesWritten;
+
+  TermDataWriter(TermStateCodec termStateCodec, DataOutput metadataOut, DataOutput dataOut) {
+    this.termStateCodec = termStateCodec;
+    this.metadataOut = metadataOut;
+    this.dataOutputBitPacker = new DataOutputBitPacker(dataOut);
+  }
+
+  void addTermState(IntBlockTermState termState) throws IOException {
+    buffer.add(termState);
+    if (buffer.numUsed == NUM_TERMS_PER_BLOCK) {
+      writeBlock();
+    }
+  }
+
+  void finish() throws IOException {
+    if (buffer.numUsed > 0) {
+      writeBlock();
+    }
+  }
+
+  long getTotalMetaDataBytesWritten() {
+    return totalMetaDataBytesWritten;
+  }
+
+  long getTotalDataBytesWritten() {
+    return dataOutputBitPacker.getNumBytesWritten();
+  }
+
+  private void writeBlock() throws IOException {
+    metadataOut.writeLong(dataOutputBitPacker.getNumBytesWritten());
+    byte[] metadata =
+        termStateCodec.encodeBlockUpTo(buffer.elements, buffer.numUsed, dataOutputBitPacker);
+    metadataOut.writeBytes(metadata, metadata.length);
+    totalMetaDataBytesWritten += metadata.length + 8;
+    buffer.clear();
+  }
+
+  /** Acts like a minimal ArrayList, but provides access to the internal array */
+  static class IntBlockTermStateBuffer {
+    IntBlockTermState[] elements;
+    int numUsed;
+
+    IntBlockTermStateBuffer(int capacity) {
+      this.elements = new IntBlockTermState[capacity];
+    }
+
+    void
diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java new file mode 100644 index 000000000000..081b5917b3c4 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPacker; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpacker; +import org.apache.lucene.util.BytesRef; + +interface TermStateCodec { + + /** Get the number of bytes needed for the per-block metadata. */ + int getMetadataBytesLength(); + + /** Get the maximum span of a record, in bytes */ + int getMaximumRecordSizeInBytes(); + + /** Get the number of bits per data record within the block, based on the provided metadata. */ + int getNumBitsPerRecord(BytesRef metadataBytes); + + /** + * Encode the sequence of {@link IntBlockTermState}s with the given bitPacker into a block of + * bytes. + * + * @return the metadata associated with the encoded bytes + */ + default byte[] encodeBlock(IntBlockTermState[] inputs, BitPacker bitPacker) throws IOException { + return encodeBlockUpTo(inputs, inputs.length, bitPacker); + } + + /** + * Encode the first {@code upto} {@link IntBlockTermState}s with the given bitPacker into a + * block of bytes. + * + * @return the metadata associated with the encoded bytes + */ + byte[] encodeBlockUpTo(IntBlockTermState[] inputs, int upto, BitPacker bitPacker) + throws IOException; + + /** + * Decode an {@link IntBlockTermState} with the provided bit-unpacker, metadata byte slice and + * data byte slice, at the given index within an encoded block. + * + *

<p>Note: This method expects dataBytes that start at the beginning of the block. Also, dataBytes + * should contain enough bytes (though not necessarily the whole block) to decode the term state + * at `index`. + * + * @return the decoded term state + */ + IntBlockTermState decodeWithinBlock( + BytesRef metadataBytes, BytesRef dataBytes, BitUnpacker bitUnpacker, int index); + + /** + * Decode an {@link IntBlockTermState} with the provided bit-unpacker, metadata byte slice and + * data byte slice, starting at `startBitIndex`. + * + *

<p>Note: The dataBytes should contain enough bits to decode the term state. Passing more + * bytes than needed is fine; the excess bytes are simply not used. + * + *

<p>For example, say we want to decode a term state that contains values x, y and z, which take + * 18 bits in total: x takes 4 bits, y takes 4 bits and z takes 10 bits. + * + *

<p>Here is the visualization when we decode with startBitIndex=7: + * + * <pre>

+   *     Note: little-endian bit order
+   *     [x.......][zyyyyxxx][zzzzzzzz][.......z]
+   * </pre>
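+   *
+   * <p>For example, the three values could then be read back with the {@link BitUnpacker}
+   * from this package (a usage sketch, not code in this change):
+   *
+   * <pre>
+   *   long x = unpacker.unpack(dataBytes, 7, 4);   // bits 7-10
+   *   long y = unpacker.unpack(dataBytes, 11, 4);  // bits 11-14
+   *   long z = unpacker.unpack(dataBytes, 15, 10); // bits 15-24
+   * </pre>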
+ */ + IntBlockTermState decodeAt( + BytesRef metadataBytes, BytesRef dataBytes, BitUnpacker bitUnpacker, int startBitIndex); + + /** + * Like {@link TermStateCodec#decodeAt(BytesRef, BytesRef, BitUnpacker, int)} but with a + * caller-provided `IntBlockTermState` instead of returning a newly allocated one. + */ + void decodeAtWithReuse( + BytesRef metadataBytes, + BytesRef dataBytes, + BitUnpacker bitUnpacker, + int startBitIndex, + IntBlockTermState reuse); +}
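To make the relationship between the two decode entry points concrete, a small usage sketch (it assumes a `codec` instance plus the block's `metadataBytes`/`dataBytes` slices are at hand; `BitUnpackerImpl.INSTANCE` is the unpacker introduced later in this change):

    int bitsPerRecord = codec.getNumBitsPerRecord(metadataBytes);
    // the following two calls decode the same record:
    IntBlockTermState a =
        codec.decodeWithinBlock(metadataBytes, dataBytes, BitUnpackerImpl.INSTANCE, index);
    IntBlockTermState b =
        codec.decodeAt(metadataBytes, dataBytes, BitUnpackerImpl.INSTANCE, index * bitsPerRecord);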
diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java new file mode 100644 index 000000000000..8545cce8e8c3 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java @@ -0,0 +1,246 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; + +abstract class TermStateCodecComponent { + private final String name; + + TermStateCodecComponent(String name) { + this.name = name; + } + + @Override + public String toString() { + return "TermStateCodecComponent{" + "name='" + name + '\'' + '}'; + } + + static byte getBitWidth( + IntBlockTermState[] termStates, int upTo, TermStateCodecComponent component) { + assert termStates.length > 0; + assert upTo > 0 && upTo <= termStates.length; + + long maxValSeen = -1; + long referenceValue = + component.isMonotonicallyIncreasing() ? component.getTargetValue(termStates[0]) : 0; + + for (int i = 0; i < upTo; i++) { + var termState = termStates[i]; + maxValSeen = Math.max(maxValSeen, component.getTargetValue(termState) - referenceValue); + } + return (byte) (64 - Long.numberOfLeadingZeros(maxValSeen)); + } + + abstract boolean isMonotonicallyIncreasing(); + + abstract long getTargetValue(IntBlockTermState termState); + + abstract void setTargetValue(IntBlockTermState termState, long value); + + /** Below are the relevant IntBlockTermState components. */ + static final class SingletonDocId extends TermStateCodecComponent { + public static SingletonDocId INSTANCE = new SingletonDocId(); + + private SingletonDocId() { + super("SingletonDocId"); + } + + @Override + public boolean isMonotonicallyIncreasing() { + return false; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.singletonDocID; + } + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + assert value <= Integer.MAX_VALUE; + // A correct codec implementation does not change the value: + // after the encode/decode round-trip it should still be a valid int + termState.singletonDocID = (int) value; + } + } + + static final class DocFreq extends TermStateCodecComponent { + public static DocFreq INSTANCE = new DocFreq(); + + private DocFreq() { + super("DocFreq"); + } + + @Override + public boolean isMonotonicallyIncreasing() { + return false; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.docFreq; + } + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + assert value <= Integer.MAX_VALUE; + // A correct codec implementation does not change the value: + // after the encode/decode round-trip it should still be a valid int + termState.docFreq = (int) value; + } + } + + static final class TotalTermFreq extends TermStateCodecComponent { + public static TotalTermFreq INSTANCE = new TotalTermFreq(); + + private TotalTermFreq() { + super("TotalTermFreq"); + } + + @Override + public boolean isMonotonicallyIncreasing() { + return false; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.totalTermFreq; + } + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + termState.totalTermFreq = value; + } + } + + static final class DocStartFP extends TermStateCodecComponent { + public static DocStartFP INSTANCE = new DocStartFP(); + + private DocStartFP() { + super("DocStartFP"); + } + + @Override + public boolean isMonotonicallyIncreasing() { + return true; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.docStartFP; + } + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + termState.docStartFP = value; + } + } + + static final class PositionStartFP extends TermStateCodecComponent { + public static PositionStartFP INSTANCE = new PositionStartFP(); + + private PositionStartFP() { + super("PositionStartFP"); + } + + @Override + public boolean isMonotonicallyIncreasing() { + return true; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.posStartFP; + } + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + termState.posStartFP = value; + } + } + + static final class PayloadStartFP extends TermStateCodecComponent { + public static PayloadStartFP INSTANCE = new PayloadStartFP(); + + private
PayloadStartFP() { + super("PayloadStartFP"); + } + + @Override + public boolean isMonotonicallyIncreasing() { + return true; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.payStartFP; + } + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + termState.payStartFP = value; + } + } + + static final class SkipOffset extends TermStateCodecComponent { + public static SkipOffset INSTANCE = new SkipOffset(); + + private SkipOffset() { + super("SkipOffset"); + } + + @Override + public boolean isMonotonicallyIncreasing() { + return false; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.skipOffset; + } + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + termState.skipOffset = value; + } + } + + static final class LastPositionBlockOffset extends TermStateCodecComponent { + public static LastPositionBlockOffset INSTANCE = new LastPositionBlockOffset(); + + private LastPositionBlockOffset() { + super("LastPositionBlockOffset"); + } + + @Override + public boolean isMonotonicallyIncreasing() { + return false; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.lastPosBlockOffset; + } + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + termState.lastPosBlockOffset = value; + } + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java new file mode 100644 index 000000000000..15fa3cbd9dde --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java @@ -0,0 +1,250 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.DocFreq; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.DocStartFP; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.LastPositionBlockOffset; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.PayloadStartFP; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.PositionStartFP; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.SingletonDocId; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.SkipOffset; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.TotalTermFreq; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPacker; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpacker; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.util.BitUtil; +import org.apache.lucene.util.BytesRef; + +final class TermStateCodecImpl implements TermStateCodec { + private final TermStateCodecComponent[] components; + private final int metadataBytesLength; + + public TermStateCodecImpl(TermStateCodecComponent[] components) { + assert components.length > 0; + + this.components = components; + int metadataBytesLength = 0; + for (var component : components) { + metadataBytesLength += getMetadataLength(component); + } + this.metadataBytesLength = metadataBytesLength; + } + + @Override + public int getMaximumRecordSizeInBytes() { + // worst case: no compression at all, so each component takes 8 bytes; + // two extra bytes when the record occupies a partial byte at the start and at the end. + return components.length * 8 + 2; + } + + @Override + public int getMetadataBytesLength() { + return metadataBytesLength; + } + + @Override + public int getNumBitsPerRecord(BytesRef metadataBytes) { + int upto = metadataBytes.offset; + int totalBitsPerTermState = 0; + + for (var component : components) { + byte bitWidth = metadataBytes.bytes[upto++]; + if (component.isMonotonicallyIncreasing()) { + upto += 8; + } + totalBitsPerTermState += bitWidth; + } + + return totalBitsPerTermState; + } + + private static int getMetadataLength(TermStateCodecComponent component) { + // 1 byte for bitWidth; optionally 8 bytes more for the reference value + return 1 + (component.isMonotonicallyIncreasing() ? 8 : 0); + }
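+
+  // For example (illustrative, not an exhaustive list): a DOCS_AND_FREQS field whose term
+  // appears in more than one document and has no skip data gets the components
+  // [DocStartFP, DocFreq, TotalTermFreq]; with freqs disabled (DOCS only), the same term
+  // gets just [DocStartFP, DocFreq].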
+ public static TermStateCodecImpl getCodec( + TermType termType, IndexOptions indexOptions, boolean hasPayloads) { + assert indexOptions.ordinal() > IndexOptions.NONE.ordinal(); + // A term can't have skip data (i.e. more than one block's worth of docs) and a + // singleton doc at the same time! + assert !(termType.hasSkipData() && termType.hasSingletonDoc()); + + // Can't have payloads for index options lower than POSITIONS + assert indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal() + || !hasPayloads; + + ArrayList<TermStateCodecComponent> components = new ArrayList<>(); + // handle docs and docFreq + if (termType.hasSingletonDoc()) { + components.add(SingletonDocId.INSTANCE); + } else { + components.add(DocStartFP.INSTANCE); + components.add(DocFreq.INSTANCE); + } + // handle skip data + if (termType.hasSkipData()) { + components.add(SkipOffset.INSTANCE); + } + + // handle freq + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS.ordinal()) { + components.add(TotalTermFreq.INSTANCE); + } + // handle positions + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + components.add(PositionStartFP.INSTANCE); + if (hasPayloads) { + components.add(PayloadStartFP.INSTANCE); + } + if (termType.hasLastPositionBlockOffset()) { + components.add(LastPositionBlockOffset.INSTANCE); + } + } + // handle offsets; PayloadStartFP was already added above when the field has payloads + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal() + && !hasPayloads) { + components.add(PayloadStartFP.INSTANCE); + } + + return new TermStateCodecImpl(components.toArray(TermStateCodecComponent[]::new)); + } + + @Override + public String toString() { + return "TermStateCodecImpl{" + "components=" + Arrays.toString(components) + '}'; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + TermStateCodecImpl that = (TermStateCodecImpl) o; + return Arrays.equals(components, that.components); + } + + @Override + public int hashCode() { + return Arrays.hashCode(components); + } + + @Override + public byte[] encodeBlockUpTo(IntBlockTermState[] inputs, int upTo, BitPacker bitPacker) + throws IOException { + Metadata[] metadataPerComponent = getMetadataPerComponent(inputs, upTo); + byte[] metadataBytes = serializeMetadata(metadataPerComponent); + + // Encode inputs via the bitPacker + for (int i = 0; i < upTo; i++) { + encodeOne(bitPacker, inputs[i], metadataPerComponent); + } + bitPacker.flush(); + + return metadataBytes; + } + + private Metadata[] getMetadataPerComponent(IntBlockTermState[] inputs, int upTo) { + Metadata[] metadataPerComponent = new Metadata[components.length]; + for (int i = 0; i < components.length; i++) { + var component = components[i]; + byte bitWidth = TermStateCodecComponent.getBitWidth(inputs, upTo, component); + long referenceValue = + component.isMonotonicallyIncreasing() ?
component.getTargetValue(inputs[0]) : 0L; + metadataPerComponent[i] = new Metadata(bitWidth, referenceValue); + } + return metadataPerComponent; + } + + private byte[] serializeMetadata(Metadata[] metadataPerComponent) { + byte[] metadataBytes = new byte[this.metadataBytesLength]; + ByteArrayDataOutput dataOut = new ByteArrayDataOutput(metadataBytes); + + for (int i = 0; i < components.length; i++) { + var metadata = metadataPerComponent[i]; + dataOut.writeByte(metadata.bitWidth); + if (components[i].isMonotonicallyIncreasing()) { + dataOut.writeLong(metadata.referenceValue); + } + } + return metadataBytes; + } + + private void encodeOne( + BitPacker bitPacker, IntBlockTermState termState, Metadata[] metadataPerComponent) + throws IOException { + for (int i = 0; i < components.length; i++) { + var component = components[i]; + var metadata = metadataPerComponent[i]; + long valToEncode = component.getTargetValue(termState) - metadata.referenceValue; + bitPacker.add(valToEncode, metadata.bitWidth); + } + } + + @Override + public IntBlockTermState decodeWithinBlock( + BytesRef metadataBytes, BytesRef dataBytes, BitUnpacker bitUnpacker, int index) { + assert metadataBytes.length == this.metadataBytesLength; + + int startBitIndex = index * getNumBitsPerRecord(metadataBytes); + return decodeAt(metadataBytes, dataBytes, bitUnpacker, startBitIndex); + } + + @Override + public IntBlockTermState decodeAt( + BytesRef metadataBytes, BytesRef dataBytes, BitUnpacker bitUnpacker, int startBitIndex) { + + IntBlockTermState decoded = new IntBlockTermState(); + decodeAtWithReuse(metadataBytes, dataBytes, bitUnpacker, startBitIndex, decoded); + + return decoded; + } + + @Override + public void decodeAtWithReuse( + BytesRef metadataBytes, + BytesRef dataBytes, + BitUnpacker bitUnpacker, + int startBitIndex, + IntBlockTermState reuse) { + assert metadataBytes.length == this.metadataBytesLength; + + reuse.lastPosBlockOffset = -1; + reuse.skipOffset = -1; + reuse.singletonDocID = -1; + + int upto = metadataBytes.offset; + for (int i = 0; i < components.length; i++) { + var component = components[i]; + int bitWidth = metadataBytes.bytes[upto++]; + long val = bitUnpacker.unpack(dataBytes, startBitIndex, bitWidth); + if (component.isMonotonicallyIncreasing()) { + val += (long) BitUtil.VH_LE_LONG.get(metadataBytes.bytes, upto); + upto += 8; + } + component.setTargetValue(reuse, val); + startBitIndex += bitWidth; + } + } + + private record Metadata(byte bitWidth, long referenceValue) {} +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java new file mode 100644 index 000000000000..81e66540c08e --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.util.Objects; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; + +/** + * TermType holds the classification of a term, based on how its postings are written. + * + *

<p>It captures: 1) whether a term has a singleton docid (i.e. only one doc contains this term); + * 2) whether the term has skip data; 3) whether the term has a vInt-encoded position block. + */ +final class TermType { + private static final byte SINGLETON_DOC_MASK = (byte) 1; + + private static final byte HAS_SKIP_DATA_MASK = (byte) 1 << 1; + + private static final byte HAS_LAST_POSITION_BLOCK_OFFSET_MASK = (byte) 1 << 2; + + public static final int NUM_TOTAL_TYPES = 8; + + private final byte flag; + + private TermType(byte flag) { + this.flag = flag; + } + + int getId() { + assert this.flag >= 0 && this.flag < NUM_TOTAL_TYPES; + return this.flag; + } + + boolean hasSingletonDoc() { + return (this.flag & SINGLETON_DOC_MASK) > 0; + } + + boolean hasSkipData() { + return (this.flag & HAS_SKIP_DATA_MASK) > 0; + } + + boolean hasLastPositionBlockOffset() { + return (this.flag & HAS_LAST_POSITION_BLOCK_OFFSET_MASK) > 0; + } + + static TermType fromTermState(IntBlockTermState state) { + byte flag = 0; + if (state.singletonDocID != -1) { + flag |= SINGLETON_DOC_MASK; + } + if (state.skipOffset != -1) { + flag |= HAS_SKIP_DATA_MASK; + } + if (state.lastPosBlockOffset != -1) { + flag |= HAS_LAST_POSITION_BLOCK_OFFSET_MASK; + } + return new TermType(flag); + } + + static TermType fromId(int id) { + if (id < 0 || id >= NUM_TOTAL_TYPES) { + throw new IllegalArgumentException("id must be within range [0, 7]"); + } + return new TermType((byte) id); + } + + @Override + public int hashCode() { + return Objects.hashCode(this.flag); + } + + @Override + public boolean equals(Object that) { + return that instanceof TermType && ((TermType) that).flag == this.flag; + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java new file mode 100644 index 000000000000..aebcea20856c --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -0,0 +1,369 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.TransitionAccessor; +import org.apache.lucene.util.fst.BytesRefPrimitiveLongFSTEnum; +import org.apache.lucene.util.fst.PrimitiveLongFST; +import org.apache.lucene.util.fst.PrimitiveLongFSTIntersectEnum; + +final class TermsImpl extends Terms { + private final FieldInfo fieldInfo; + + private final RandomAccessTermsDict termsDict; + + private final Lucene99PostingsReader lucene99PostingsReader; + + public TermsImpl( + FieldInfo fieldInfo, + RandomAccessTermsDict termsDict, + Lucene99PostingsReader lucene99PostingsReader) { + this.fieldInfo = fieldInfo; + this.termsDict = termsDict; + this.lucene99PostingsReader = lucene99PostingsReader; + } + + @Override + public long size() throws IOException { + return termsDict.termsStats().size(); + } + + @Override + public long getSumTotalTermFreq() throws IOException { + return termsDict.termsStats().sumTotalTermFreq(); + } + + @Override + public long getSumDocFreq() throws IOException { + return termsDict.termsStats().sumDocFreq(); + } + + @Override + public int getDocCount() throws IOException { + return termsDict.termsStats().docCount(); + } + + @Override + public boolean hasFreqs() { + return fieldInfo.getIndexOptions().ordinal() >= IndexOptions.DOCS_AND_FREQS.ordinal(); + } + + @Override + public boolean hasOffsets() { + return fieldInfo.getIndexOptions().ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal(); + } + + @Override + public boolean hasPositions() { + return fieldInfo.getIndexOptions().ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal(); + } + + @Override + public boolean hasPayloads() { + return fieldInfo.hasPayloads(); + } + + @Override + public BytesRef getMin() throws IOException { + return termsDict.termsStats().minTerm(); + } + + @Override + public BytesRef getMax() throws IOException { + return termsDict.termsStats().maxTerm(); + } + + @Override + public TermsEnum iterator() throws IOException { + return new RandomAccessTermsEnum(); + } + + @Override + public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { + throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); + } + return new RandomAccessIntersectTermsEnum(compiled, startTerm); + } + + final class RandomAccessTermsEnum extends TermsEnum { + private AttributeSource attrs; + + private BytesRef term; + + private boolean isTermStateCurrent; + + private IntBlockTermState termState; + + private final BytesRefPrimitiveLongFSTEnum fstEnum; + + private BytesRefPrimitiveLongFSTEnum.InputOutput fstSeekState; + + // Only set when seekExact(term, state) is called, because that will update + // the termState but leave the fstSeekState out of sync. 
+ // We need to re-seek in next() calls to catch up to that term. + private boolean needReSeekInNext; + + private final TermDataReaderProvider.TermDataReader termDataReader; + + RandomAccessTermsEnum() throws IOException { + termState = (IntBlockTermState) lucene99PostingsReader.newTermState(); + fstEnum = new BytesRefPrimitiveLongFSTEnum(termsDict.termsIndex().primitiveLongFST()); + termDataReader = termsDict.termDataReaderProvider().newReader(); + } + + void updateTermStateIfNeeded() throws IOException { + if (!isTermStateCurrent && !needReSeekInNext) { + TermsIndex.TypeAndOrd typeAndOrd = TermsIndex.decodeLong(fstSeekState.output); + termState = + termDataReader.getTermState( + typeAndOrd.termType(), typeAndOrd.ord(), fieldInfo.getIndexOptions()); + isTermStateCurrent = true; + } + } + + @Override + public AttributeSource attributes() { + if (attrs == null) { + attrs = new AttributeSource(); + } + return attrs; + } + + @Override + public boolean seekExact(BytesRef text) throws IOException { + fstSeekState = fstEnum.seekExact(text); + term = fstSeekState == null ? null : fstSeekState.input; + isTermStateCurrent = false; + needReSeekInNext = false; + return term != null; + } + + @Override + public SeekStatus seekCeil(BytesRef text) throws IOException { + fstSeekState = fstEnum.seekCeil(text); + term = fstSeekState == null ? null : fstSeekState.input; + isTermStateCurrent = false; + needReSeekInNext = false; + if (term == null) { + return SeekStatus.END; + } + return text.equals(term) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND; + } + + @Override + public void seekExact(BytesRef target, TermState state) throws IOException { + if (!target.equals(term)) { + assert state instanceof IntBlockTermState; + termState.copyFrom(state); + term = BytesRef.deepCopyOf(target); + isTermStateCurrent = true; + needReSeekInNext = true; + } + } + + @Override + public BytesRef term() throws IOException { + return term; + } + + @Override + public int docFreq() throws IOException { + updateTermStateIfNeeded(); + return termState.docFreq; + } + + @Override + public long totalTermFreq() throws IOException { + updateTermStateIfNeeded(); + return termState.totalTermFreq; + } + + @Override + public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { + updateTermStateIfNeeded(); + return lucene99PostingsReader.postings(fieldInfo, termState, reuse, flags); + } + + @Override + public ImpactsEnum impacts(int flags) throws IOException { + updateTermStateIfNeeded(); + return lucene99PostingsReader.impacts(fieldInfo, termState, flags); + } + + @Override + public TermState termState() throws IOException { + updateTermStateIfNeeded(); + return termState.clone(); + } + + @Override + public BytesRef next() throws IOException { + if (needReSeekInNext) { + fstSeekState = fstEnum.seekExact(term); + assert fstSeekState != null; + } + fstSeekState = fstEnum.next(); + term = fstSeekState == null ? 
null : fstSeekState.input; + isTermStateCurrent = false; + needReSeekInNext = false; + return term; + } + + @Override + public long ord() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void seekExact(long ord) throws IOException { + throw new UnsupportedOperationException("By ord lookup not supported."); + } + } + + final class RandomAccessIntersectTermsEnum extends TermsEnum { + private AttributeSource attrs; + + private boolean isTermStateCurrent; + + private IntBlockTermState termState; + + private BytesRef term; + + private final PrimitiveLongFST fst; + + private final TermDataReaderProvider.TermDataReader termDataReader; + + private final PrimitiveLongFSTIntersectEnum fstFsaIntersectEnum; + + RandomAccessIntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) + throws IOException { + TransitionAccessor transitionAccessor = compiled.getTransitionAccessor(); + // assert transitionAccessor.getNumTransitions(0) == 1; + termState = (IntBlockTermState) lucene99PostingsReader.newTermState(); + fst = termsDict.termsIndex().primitiveLongFST(); + termDataReader = termsDict.termDataReaderProvider().newReader(); + fstFsaIntersectEnum = new PrimitiveLongFSTIntersectEnum(fst, compiled, startTerm); + } + + @Override + public BytesRef next() throws IOException { + if (fstFsaIntersectEnum.next()) { + term = fstFsaIntersectEnum.getTerm(); + isTermStateCurrent = false; + } else { + term = null; + } + return term; + } + + void updateTermStateIfNeeded() throws IOException { + if (!isTermStateCurrent) { + long fstOutput = fstFsaIntersectEnum.getFSTOutput(); + TermsIndex.TypeAndOrd typeAndOrd = TermsIndex.decodeLong(fstOutput); + termState = + termDataReader.getTermState( + typeAndOrd.termType(), typeAndOrd.ord(), fieldInfo.getIndexOptions()); + isTermStateCurrent = true; + } + } + + @Override + public AttributeSource attributes() { + if (attrs == null) { + attrs = new AttributeSource(); + } + return attrs; + } + + @Override + public boolean seekExact(BytesRef text) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public BytesRef term() throws IOException { + return term; + } + + @Override + public int docFreq() throws IOException { + updateTermStateIfNeeded(); + return termState.docFreq; + } + + @Override + public long totalTermFreq() throws IOException { + updateTermStateIfNeeded(); + return termState.totalTermFreq; + } + + @Override + public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { + updateTermStateIfNeeded(); + return lucene99PostingsReader.postings(fieldInfo, termState, reuse, flags); + } + + @Override + public ImpactsEnum impacts(int flags) throws IOException { + updateTermStateIfNeeded(); + return lucene99PostingsReader.impacts(fieldInfo, termState, flags); + } + + @Override + public TermState termState() throws IOException { + updateTermStateIfNeeded(); + return termState.clone(); + } + + @Override + public long ord() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void seekExact(long ord) throws IOException { + throw new UnsupportedOperationException("By ord lookup not supported."); + } + + @Override + public SeekStatus seekCeil(BytesRef text) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void seekExact(BytesRef target, TermState state) throws IOException { + throw new UnsupportedOperationException(); + } + } +} diff --git 
a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java new file mode 100644 index 000000000000..a802026f9cb2 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.OffHeapFSTStore; +import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.fst.Util; + +record TermsIndex(FST<Long> fst) { + + TypeAndOrd getTerm(BytesRef term) throws IOException { + long encoded = Util.get(fst, term); + return decodeLong(encoded); + } + + record TypeAndOrd(TermType termType, long ord) {} + + void serialize(DataOutput metaOut, DataOutput dataOut) throws IOException { + if (fst != null) { + fst.save(metaOut, dataOut); + } + } + + static TypeAndOrd decodeLong(long encoded) { + TermType termType = TermType.fromId((int) ((encoded & 0b1110L) >>> 1)); + long ord = encoded >>> 4; + return new TypeAndOrd(termType, ord); + } + + static TermsIndex deserialize(DataInput metaIn, DataInput dataIn, boolean loadOffHeap) + throws IOException { + FST<Long> fst; + if (loadOffHeap) { + var fstStore = new OffHeapFSTStore(); + fst = + new FST<>( + FST.readMetadata(metaIn, PositiveIntOutputs.getSingleton()), + dataIn.clone(), + fstStore); + dataIn.skipBytes(fstStore.size()); + } else { + fst = new FST<>(FST.readMetadata(metaIn, PositiveIntOutputs.getSingleton()), dataIn); + } + return new TermsIndex(fst); + } +}
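A worked example of this long layout (values chosen for illustration only): a term with type id 3 and within-type ordinal 5 is stored by TermsIndexBuilder.encode below as (5 << 4) | (3 << 1) | 1 == 87, and decodeLong recovers both fields:

    long encoded = (5L << 4) | (3L << 1) | 1L; // 0b101_0111 == 87; the low bit marks a present output
    TermsIndex.TypeAndOrd decoded = TermsIndex.decodeLong(encoded);
    // decoded.termType().getId() == 3  since (87 & 0b1110) >>> 1 == 3
    // decoded.ord() == 5               since 87 >>> 4 == 5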
diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java new file mode 100644 index 000000000000..68bf66a3cbec --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRefBuilder; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTCompiler; +import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.fst.Util; + +/** + * Builds a term index for a given field. Logically this is a map: term -> (type, ord) where the + * ordinals are scoped to type (not global). + */ +final class TermsIndexBuilder { + private static final long MAX_ORD = (1L << 60) - 1; + + private final long[] countPerType = new long[TermType.NUM_TOTAL_TYPES]; + private final FSTCompiler<Long> fstCompiler; + + TermsIndexBuilder() throws IOException { + fstCompiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()).build(); + Arrays.fill(countPerType, -1); + } + + public void addTerm(BytesRef term, TermType termType) throws IOException { + IntsRefBuilder scratchInts = new IntsRefBuilder(); + long ord = ++countPerType[termType.getId()]; + fstCompiler.add(Util.toIntsRef(term, scratchInts), encode(ord, termType)); + } + + public TermsIndex build() throws IOException { + return new TermsIndex(fstCompiler.compile()); + } + + static long encode(long ord, TermType termType) { + // Use a single long to encode `ord` and `termType`, while also avoiding the special value + // `PositiveIntOutputs.NO_OUTPUT == 0`. The layout is |... ord ...|termType|hasOutput|, + // where termType takes 3 bits and hasOutput takes the lowest bit; the rest is taken by ord. + if (ord < 0) { + throw new IllegalArgumentException("can't encode negative ord: " + ord); + } + if (ord > MAX_ORD) { + throw new IllegalArgumentException( + "Input ord " + + ord + + " is too large for TermType: " + + termType.getId() + + ", max ord allowed is 2^60 - 1"); + } + return (ord << 4) | ((long) termType.getId() << 1) | 1L; + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexPrimitive.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexPrimitive.java new file mode 100644 index 000000000000..95e307d786d1 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexPrimitive.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import static org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermsIndex.decodeLong; + +import java.io.IOException; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.fst.OffHeapFSTStore; +import org.apache.lucene.util.fst.PrimitiveLongFST; + +record TermsIndexPrimitive(PrimitiveLongFST primitiveLongFST) { + + TermsIndex.TypeAndOrd getTerm(BytesRef term) throws IOException { + long encoded = PrimitiveLongFST.get(primitiveLongFST, term); + return decodeLong(encoded); + } + + static TermsIndexPrimitive deserialize(DataInput metaIn, DataInput dataIn, boolean loadOffHeap) + throws IOException { + PrimitiveLongFST fst; + if (loadOffHeap) { + var fstStore = new OffHeapFSTStore(); + fst = + new PrimitiveLongFST( + PrimitiveLongFST.readMetadata( + metaIn, PrimitiveLongFST.PrimitiveLongFSTOutputs.getSingleton()), + dataIn.clone(), + fstStore); + dataIn.skipBytes(fstStore.size()); + } else { + fst = + new PrimitiveLongFST( + PrimitiveLongFST.readMetadata( + metaIn, PrimitiveLongFST.PrimitiveLongFSTOutputs.getSingleton()), + dataIn); + } + return new TermsIndexPrimitive(fst); + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java new file mode 100644 index 000000000000..b1881475f74e --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.BytesRef; + +/** Data class that holds the term stats for a field */ +record TermsStats( + int fieldNumber, + long size, + long sumTotalTermFreq, + long sumDocFreq, + int docCount, + BytesRef minTerm, + BytesRef maxTerm) { + + void serialize(DataOutput output) throws IOException { + output.writeVInt(fieldNumber); + output.writeVLong(size); + output.writeVLong(sumTotalTermFreq); + output.writeVLong(sumDocFreq); + output.writeVInt(docCount); + if (minTerm != null) { + writeBytesRef(output, minTerm); + } + if (maxTerm != null) { + writeBytesRef(output, maxTerm); + } + } + + static TermsStats deserialize(DataInput input) throws IOException { + int fieldNumber = input.readVInt(); + long size = input.readVLong(); + long sumTotalTermFreq = input.readVLong(); + long sumDocFreq = input.readVLong(); + int docCount = input.readVInt(); + BytesRef minTerm = null, maxTerm = null; + if (size > 0) { + minTerm = readBytesRef(input); + maxTerm = readBytesRef(input); + } + return new TermsStats( + fieldNumber, size, sumTotalTermFreq, sumDocFreq, docCount, minTerm, maxTerm); + } + + static void writeBytesRef(DataOutput output, BytesRef bytes) throws IOException { + output.writeVInt(bytes.length); + output.writeBytes(bytes.bytes, bytes.offset, bytes.length); + } + + static BytesRef readBytesRef(DataInput input) throws IOException { + int numBytes = input.readVInt(); + if (numBytes < 0) { + throw new CorruptIndexException("invalid bytes length: " + numBytes, input); + } + + byte[] bytes = new byte[numBytes]; + input.readBytes(bytes, 0, numBytes); + + return new BytesRef(bytes); + } +}
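A quick serialization round-trip sketch for this record (ByteBuffersDataOutput is from lucene-core and is an assumption of the sketch, not something this change touches):

    static TermsStats roundTrip() throws IOException {
      TermsStats stats = new TermsStats(0, 2, 10, 8, 5, new BytesRef("bar"), new BytesRef("foo"));
      ByteBuffersDataOutput out = new ByteBuffersDataOutput();
      stats.serialize(out);
      // minTerm/maxTerm are only written (and read back) when the field has terms, i.e. size > 0
      return TermsStats.deserialize(out.toDataInput()); // equal to stats, component-wise
    }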
diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java new file mode 100644 index 000000000000..1ad8b0fb36e8 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; + +import java.io.IOException; + +/** Interface for bit-packing */ +public interface BitPacker { + + /** Pack the low `numBits` bits of `value` */ + void add(long value, int numBits) throws IOException; + + /** Flush any pending byte */ + void flush() throws IOException; +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java new file mode 100644 index 000000000000..dc405c717072 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; + +import java.io.IOException; + +/** + * Implementation of {@link BitPacker}. The behavior that is abstracted out here is how to write a + * byte. This is useful as we can wire the byte-writing to a byte[], a stream, an IndexOutput, etc. + */ +abstract class BitPackerImplBase implements BitPacker { + private byte buffer; + private int bufferNumBitsUsed; + + abstract void writeByte(byte b) throws IOException; + + /** {@inheritDoc} The value may be larger than 2^numBits - 1; the higher bits are ignored. */ + @Override + public void add(long value, int numBits) throws IOException { + assert numBits < 64; + // clear bits higher than `numBits` + value &= (1L << numBits) - 1; + + while (numBits > 0) { + int bufferNumBitsRemaining = 8 - bufferNumBitsUsed; + if (numBits < bufferNumBitsRemaining) { + buffer |= (byte) (value << bufferNumBitsUsed); + bufferNumBitsUsed += numBits; + break; + } else { + long mask = (1L << bufferNumBitsRemaining) - 1; + buffer |= (byte) ((value & mask) << bufferNumBitsUsed); + numBits -= bufferNumBitsRemaining; + value >>>= bufferNumBitsRemaining; + writeByte(buffer); + buffer = 0; + bufferNumBitsUsed = 0; + } + } + } + + @Override + public void flush() throws IOException { + if (bufferNumBitsUsed > 0) { + writeByte(buffer); + bufferNumBitsUsed = 0; + } + } +}
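A pack/unpack round-trip under this contract (a sketch; FixedSizeByteArrayBitPacker and BitUnpackerImpl are the concrete classes introduced later in this change):

    static void packUnpackExample() throws IOException {
      FixedSizeByteArrayBitPacker packer = new FixedSizeByteArrayBitPacker(2);
      packer.add(13, 4); // 0b1101 fills the low nibble of byte 0
      packer.add(9, 5);  // 0b01001: four bits complete byte 0, one bit spills into byte 1
      packer.flush();    // writes the pending partial byte
      BytesRef packed = new BytesRef(packer.getBytes());
      long x = BitUnpackerImpl.INSTANCE.unpack(packed, 0, 4); // == 13
      long y = BitUnpackerImpl.INSTANCE.unpack(packed, 4, 5); // == 9
    }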
diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpacker.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpacker.java new file mode 100644 index 000000000000..b5af7b40e385 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpacker.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; + +import org.apache.lucene.util.BytesRef; + +/** Interface for bit-unpacking */ +public interface BitUnpacker { + + /** Unpack a long from a range of bits in the given bytesRef. */ + long unpack(BytesRef bytesRef, int startBitIndex, int bitWidth); +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java new file mode 100644 index 000000000000..d3a5ab210776 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; + +import org.apache.lucene.util.BytesRef; + +/** Implementation of {@link BitUnpacker} that works with compactly packed bits */ +public class BitUnpackerImpl implements BitUnpacker { + public static BitUnpackerImpl INSTANCE = new BitUnpackerImpl(); + + private BitUnpackerImpl() {} + + @Override + public long unpack(BytesRef bytesRef, int startBitIndex, int bitWidth) { + assert (startBitIndex + bitWidth) <= bytesRef.length * 8; + assert bitWidth < 64; + + if (bitWidth == 0) { + return 0; + } + + int firstByteIndex = startBitIndex / 8; + int numBitsToExcludeInFirstByte = startBitIndex % 8; + int lastByteIndex = (startBitIndex + bitWidth) / 8; + int numBitsToKeepInLastByte = (startBitIndex + bitWidth) % 8; + + /* + * idea: there are two cases + * (1) when the requested bits are within the same byte; e.g. startBitIndex = 1, bitWidth = 5 + * (2) when the requested bits span across many bytes; e.g. startBitIndex = 1, bitWidth = 15 + * For (1) it is trivial, + * for (2) we can + * (2.1) read the first partial byte + * (2.2) read full bytes for those whose index is in (first, last), exclusive.
+ * (2.3) read the last partial byte (can be empty) + */ + + // case (1) + if (firstByteIndex == lastByteIndex) { + long res = Byte.toUnsignedLong(bytesRef.bytes[bytesRef.offset + firstByteIndex]); + res &= (1L << numBitsToKeepInLastByte) - 1; + res >>>= numBitsToExcludeInFirstByte; + return res; + } + + // case (2) + long res = 0; + int totalNumBitsRead = 0; + // (2.1) read the first partial byte + res |= + Byte.toUnsignedLong(bytesRef.bytes[bytesRef.offset + firstByteIndex]) + >>> numBitsToExcludeInFirstByte; + totalNumBitsRead += 8 - numBitsToExcludeInFirstByte; + // (2.2) read full bytes for those whose index is in (first, last), exclusive. + for (int byteIndex = firstByteIndex + 1; byteIndex < lastByteIndex; byteIndex++) { + res |= Byte.toUnsignedLong(bytesRef.bytes[bytesRef.offset + byteIndex]) << totalNumBitsRead; + totalNumBitsRead += 8; + } + // (2.3) read the last partial byte (can be empty) + if (numBitsToKeepInLastByte > 0) { + long partial = + Byte.toUnsignedLong(bytesRef.bytes[bytesRef.offset + lastByteIndex]) + & ((1L << numBitsToKeepInLastByte) - 1); + res |= partial << totalNumBitsRead; + } + + return res; + } +}
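A worked unpack that exercises all three steps above (values for illustration): bytes {0x80, 0xFF, 0x01} with startBitIndex=7 and bitWidth=10 yield, in little-endian bit order, bit 7 of byte 0, all of byte 1, and bit 16 from byte 2:

    BytesRef bytes = new BytesRef(new byte[] {(byte) 0x80, (byte) 0xFF, (byte) 0x01});
    long v = BitUnpackerImpl.INSTANCE.unpack(bytes, 7, 10);
    // (2.1) byte 0 >>> 7       -> 0b1          (1 bit)
    // (2.2) byte 1 << 1        -> 0b111111110  (8 bits)
    // (2.3) (byte 2 & 1) << 9  -> 0b1000000000 (1 bit)
    // v == 0b11_1111_1111 == 1023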
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; + +/** + * A {@link BitPacker} implementation that writes to a {@link org.apache.lucene.store.DataOutput} + */ +public final class DataOutputBitPacker extends BitPackerImplBase { + private final DataOutput dataOut; + + private long numBytesWritten; + + public DataOutputBitPacker(DataOutput dataOut) { + this.dataOut = dataOut; + } + + @Override + void writeByte(byte b) throws IOException { + dataOut.writeByte(b); + numBytesWritten++; + } + + public long getNumBytesWritten() { + return numBytesWritten; + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java new file mode 100644 index 000000000000..b40075a1ee8d --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; + +/** + * A {@link BitPacker} implementation that requires the user to know the size of the resulting byte + * array upfront, in order to avoid allocation and copying for dynamically growing the array. + */ +public final class FixedSizeByteArrayBitPacker extends BitPackerImplBase { + private final byte[] bytes; + private int numBytesUsed; + + public FixedSizeByteArrayBitPacker(int capacity) { + this.bytes = new byte[capacity]; + } + + @Override + void writeByte(byte b) { + assert numBytesUsed < bytes.length; + bytes[numBytesUsed++] = b; + } + + public byte[] getBytes() { + return bytes; + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/package-info.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/package-info.java new file mode 100644 index 000000000000..8a9078ffa33c --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/package-info.java @@ -0,0 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Code for packing and unpacking sequences of non-negative integers with a smaller bit width. */ +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/package-info.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/package-info.java new file mode 100644 index 000000000000..a85027e3b5e1 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A PostingsFormat that is based on {@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat} + * but provides a random-access term dictionary. + */ +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; diff --git a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index fcd5ded3605c..bf0e25322963 100644 --- a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -14,3 +14,4 @@ # limitations under the License.
org.apache.lucene.codecs.simpletext.SimpleTextCodec +org.apache.lucene.sandbox.codecs.lucene99.randomaccess.Lucene99RandomAccessTermDictCodec diff --git a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat index 09f2491c8012..e060907b8032 100644 --- a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat +++ b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat @@ -19,3 +19,4 @@ org.apache.lucene.codecs.memory.DirectPostingsFormat org.apache.lucene.codecs.memory.FSTPostingsFormat org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat +org.apache.lucene.sandbox.codecs.lucene99.randomaccess.Lucene99RandomAccessDictionaryPostingsFormat diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java new file mode 100644 index 000000000000..226a4700813c --- /dev/null +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.RandomAccessTermsDict.TermDataInput; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.RandomAccessTermsDict.TermDataInputProvider; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.RandomAccessTermsDictWriter.TermDataOutput; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.RandomAccessTermsDictWriter.TermDataOutputProvider; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BitUtil; +import org.apache.lucene.util.BytesRef; + +public class TestRandomAccessTermsDictWriter extends LuceneTestCase { + int nextFieldNumber; + + public void testBuildIndexAndReadMultipleFields() throws IOException { + try (Directory testDir = newDirectory()) { + IndexOutput metaOut = testDir.createOutput("segment_meta", IOContext.DEFAULT); + IndexOutput termIndexOut = testDir.createOutput("term_index", IOContext.DEFAULT); + HashMap<TermType, TermDataOutput> termDataOutputsMap = new HashMap<>(); + TermDataOutputProvider outputProvider = + termType -> + termDataOutputsMap.computeIfAbsent( + termType, + t -> { + try { + return new TermDataOutput( + testDir.createOutput("term_meta_" + t.getId(), IOContext.DEFAULT), + testDir.createOutput("term_data_" + t.getId(), IOContext.DEFAULT)); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + + ExpectedResults[] manyExpectedResults = new ExpectedResults[random().nextInt(1, 20)]; + for (int i = 0; i < manyExpectedResults.length; i++) { + manyExpectedResults[i] = indexOneField(metaOut, termIndexOut, outputProvider); + } + + metaOut.close(); + termIndexOut.close(); + for (var e : termDataOutputsMap.values()) { + e.dataOutput().close(); + e.metadataOutput().close(); + } + + IndexInput metaInput = testDir.openInput("segment_meta", IOContext.READ); + IndexInput termIndexInput = testDir.openInput("term_index", IOContext.LOAD); + HashMap<TermType, TermDataInput> termDataInputsMap = new HashMap<>(); + TermDataInputProvider termDataInputProvider = + termType -> + termDataInputsMap.computeIfAbsent( + termType, + t -> { + try { + return new TermDataInput( + testDir.openInput("term_meta_" + t.getId(), IOContext.LOAD), + testDir.openInput("term_data_" + t.getId(), IOContext.LOAD)); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + + for (var expectedResult : manyExpectedResults) { + assertDeserializedMatchingExpected( + expectedResult, metaInput, termIndexInput, termDataInputProvider); + } + + metaInput.close(); + termIndexInput.close(); + for (var e : termDataInputsMap.values()) { + e.metadataInput().close(); + e.dataInput().close(); + } + } + } + + private static void assertDeserializedMatchingExpected( + ExpectedResults result, + IndexInput metaInput, + IndexInput termIndexInput, + TermDataInputProvider termDataInputProvider) + throws IOException { + RandomAccessTermsDict deserialized = + RandomAccessTermsDict.deserialize( + new RandomAccessTermsDict.IndexOptionsProvider() { + @Override + public IndexOptions getIndexOptions(int fieldNumber) { + return result.indexOptions(); + } + +
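+ // Note that both callbacks of this stub ignore fieldNumber: each field in this test is read + // back with exactly the IndexOptions and hasPayloads it was written with, so a constant + // answer per ExpectedResults suffices.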
@Override + public boolean hasPayloads(int fieldNumber) { + return result.hasPayloads(); + } + }, + metaInput, + termIndexInput, + termDataInputProvider); + + assertEquals(result.fieldNumber(), deserialized.termsStats().fieldNumber()); + assertEquals(result.expectedDocCount(), deserialized.termsStats().docCount()); + assertEquals(result.expectedTermAndState().length, deserialized.termsStats().size()); + assertEquals( + Arrays.stream(result.expectedTermAndState()).mapToLong(x -> x.state().docFreq).sum(), + deserialized.termsStats().sumDocFreq()); + assertEquals( + Arrays.stream(result.expectedTermAndState()).mapToLong(x -> x.state().totalTermFreq).sum(), + deserialized.termsStats().sumTotalTermFreq()); + assertEquals(result.expectedTermAndState().length, deserialized.termsStats().size()); + assertEquals(result.expectedTermAndState()[0].term(), deserialized.termsStats().minTerm()); + assertEquals( + result.expectedTermAndState()[result.expectedTermAndState().length - 1].term(), + deserialized.termsStats().maxTerm()); + + for (var x : result.expectedTermAndState()) { + IntBlockTermState expectedState = x.state(); + IntBlockTermState actualState = deserialized.getTermState(x.term()); + if (expectedState.singletonDocID != -1) { + assertEquals(expectedState.singletonDocID, actualState.singletonDocID); + } else { + assertEquals(expectedState.docStartFP, actualState.docStartFP); + } + assertEquals(expectedState.docFreq, actualState.docFreq); + if (result.indexOptions().ordinal() >= IndexOptions.DOCS_AND_FREQS.ordinal()) { + assertEquals(expectedState.totalTermFreq, actualState.totalTermFreq); + } + assertEquals(expectedState.skipOffset, actualState.skipOffset); + if (result.indexOptions().ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + assertEquals(expectedState.posStartFP, actualState.posStartFP); + assertEquals(expectedState.lastPosBlockOffset, actualState.lastPosBlockOffset); + } + if (result.indexOptions().ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + assertEquals(expectedState.payStartFP, actualState.payStartFP); + } + } + } + + private ExpectedResults indexOneField( + IndexOutput metaOut, IndexOutput termIndexOut, TermDataOutputProvider outputProvider) + throws IOException { + int fieldNumber = nextFieldNumber++; + IndexOptions indexOptions = + IndexOptions.values()[random().nextInt(1, IndexOptions.values().length)]; + boolean hasPayloads = random().nextBoolean(); + if (indexOptions.ordinal() < IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + hasPayloads = false; + } + RandomAccessTermsDictWriter randomAccessTermsDictWriter = + new RandomAccessTermsDictWriter( + fieldNumber, indexOptions, hasPayloads, metaOut, termIndexOut, outputProvider); + + TermAndState[] expectedTermAndState = getRandoms(1000, 2000); + int expectedDocCount = random().nextInt(1, 2000); + + for (var x : expectedTermAndState) { + randomAccessTermsDictWriter.add(x.term(), x.state()); + } + randomAccessTermsDictWriter.finish(expectedDocCount); + return new ExpectedResults( + fieldNumber, indexOptions, hasPayloads, expectedTermAndState, expectedDocCount); + } + + private record ExpectedResults( + int fieldNumber, + IndexOptions indexOptions, + boolean hasPayloads, + TermAndState[] expectedTermAndState, + int expectedDocCount) {} + + static TermAndState[] getRandoms(int size, int maxDoc) { + IntBlockTermState lastTermState = null; + + ArrayList<TermAndState> result = new ArrayList<>(size); + for (int i = 0; i < size; i++) { + byte[] termBytes = new byte[4]; + BitUtil.VH_BE_INT.set(termBytes, 0,
i); + + IntBlockTermState termState = new IntBlockTermState(); + termState.docFreq = random().nextInt(1, 100); + if (termState.docFreq == 1) { + termState.singletonDocID = random().nextInt(0, maxDoc); + } else { + termState.singletonDocID = -1; + } + if (lastTermState == null) { + termState.docStartFP = 0; + termState.posStartFP = 0; + termState.payStartFP = 0; + } else { + termState.docStartFP = lastTermState.docStartFP; + termState.posStartFP = lastTermState.posStartFP; + termState.payStartFP = lastTermState.payStartFP; + termState.docStartFP += termState.docFreq == 1 ? 0 : random().nextLong(1, 256); + termState.posStartFP += random().nextLong(1, 256); + termState.payStartFP += random().nextLong(1, 256); + } + termState.totalTermFreq = random().nextLong(termState.docFreq, 1000); + if (termState.docFreq > 1 && random().nextBoolean()) { + termState.skipOffset = random().nextLong(1, 256); + } else { + termState.skipOffset = -1; + } + if (random().nextBoolean()) { + termState.lastPosBlockOffset = random().nextLong(1, 256); + } else { + termState.lastPosBlockOffset = -1; + } + lastTermState = termState; + result.add(new TermAndState(new BytesRef(termBytes), termState)); + } + + return result.toArray(TermAndState[]::new); + } + + record TermAndState(BytesRef term, IntBlockTermState state) {} +} diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java new file mode 100644 index 000000000000..fc1b7b0f269b --- /dev/null +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TestTermStateCodecImpl.TermStateTestFixture; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPerBytePacker; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.ArrayUtil; + +public class TestTermDataWriter extends LuceneTestCase { + + public void testWriterAndDeserialize() throws IOException { + TermStateTestFixture testFixture = TestTermStateCodecImpl.getTermStateTestFixture(777); + + try (Directory testDir = newDirectory()) { + IndexOutput metaOut = testDir.createOutput("segment_meta", IOContext.DEFAULT); + IndexOutput metadataOut = testDir.createOutput("term_meta_1", IOContext.DEFAULT); + IndexOutput dataOut = testDir.createOutput("term_data_11", IOContext.DEFAULT); + TermDataWriter writer = new TermDataWriter(testFixture.codec(), metadataOut, dataOut); + for (var termState : testFixture.termStatesArray()) { + writer.addTermState(termState); + } + writer.finish(); + metaOut.writeVLong(writer.getTotalMetaDataBytesWritten()); + metaOut.writeVLong(writer.getTotalDataBytesWritten()); + metaOut.close(); + metadataOut.close(); + dataOut.close(); + + BitPerBytePacker referenceBitPacker = new BitPerBytePacker(); + // total size 777; there will be 4 blocks total. + // The extra 8 bytes per block hold the long offset for where the block starts within the data bytes.
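+ // (Illustrative arithmetic, assuming NUM_TERMS_PER_BLOCK is 256 here: ceil(777 / 256) = 4 + // blocks, and each block contributes Long.BYTES of start offset plus getMetadataBytesLength() + // of codec metadata, which is exactly what the expectedMetadata sizing below encodes.)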
+ byte[] expectedMetadata = new byte[(testFixture.codec().getMetadataBytesLength() + 8) * 4]; + ByteArrayDataOutput expectedMetadataOut = new ByteArrayDataOutput(expectedMetadata); + for (int start = 0; + start < testFixture.termStatesArray().length; + start += TermDataWriter.NUM_TERMS_PER_BLOCK) { + expectedMetadataOut.writeLong(referenceBitPacker.getCompactBytes().length); + byte[] metadata = + testFixture + .codec() + .encodeBlock( + ArrayUtil.copyOfSubArray( + testFixture.termStatesArray(), + start, + Math.min( + start + TermDataWriter.NUM_TERMS_PER_BLOCK, + testFixture.termStatesArray().length)), + referenceBitPacker); + expectedMetadataOut.writeBytes(metadata, 0, metadata.length); + } + ByteSlice expectedDataSlice = new ByteArrayByteSlice(referenceBitPacker.getCompactBytes()); + ByteSlice expectedMetadataSlice = new ByteArrayByteSlice(expectedMetadata); + TermData expected = new TermData(expectedMetadataSlice, expectedDataSlice); + + IndexInput metaIn = testDir.openInput("segment_meta", IOContext.DEFAULT); + IndexInput metadataIn = testDir.openInput("term_meta_1", IOContext.DEFAULT); + IndexInput dataIn = testDir.openInput("term_data_11", IOContext.DEFAULT); + + TermDataProvider actualProvider = + TermDataProvider.deserializeOnHeap(metaIn.clone(), metadataIn.clone(), dataIn.clone()); + assertByteSlice(expected.metadata(), actualProvider.metadataProvider().newByteSlice()); + assertByteSlice(expected.data(), actualProvider.dataProvider().newByteSlice()); + testDecodeTermState(testFixture, actualProvider); + + actualProvider = + TermDataProvider.deserializeOnHeap(metaIn.clone(), metadataIn.clone(), dataIn.clone()); + assertByteSlice(expected.metadata(), actualProvider.metadataProvider().newByteSlice()); + assertByteSlice(expected.data(), actualProvider.dataProvider().newByteSlice()); + testDecodeTermState(testFixture, actualProvider); + + metaIn.close(); + metadataIn.close(); + dataIn.close(); + } + } + + private static void testDecodeTermState( + TermStateTestFixture testFixture, TermDataProvider actualProvider) throws IOException { + TermData actual = + new TermData( + actualProvider.metadataProvider().newByteSlice(), + actualProvider.dataProvider().newByteSlice()); + for (int i = 0; i < testFixture.termStatesArray().length; i++) { + IntBlockTermState expectedTermState = testFixture.termStatesArray()[i]; + IntBlockTermState decoded = actual.getTermState(testFixture.codec(), i); + assertEquals(expectedTermState.docFreq, decoded.docFreq); + assertEquals(expectedTermState.docStartFP, decoded.docStartFP); + } + } + + private static void assertByteSlice(ByteSlice expected, ByteSlice actual) throws IOException { + assertEquals(expected.size(), actual.size()); + byte[] bytesExpected = new byte[(int) expected.size()]; + ByteArrayDataOutput out = new ByteArrayDataOutput(bytesExpected); + expected.writeAll(out); + + byte[] bytesActual = new byte[(int) actual.size()]; + ByteArrayDataOutput out1 = new ByteArrayDataOutput(bytesActual); + actual.writeAll(out1); + assertArrayEquals(bytesExpected, bytesActual); + } +} diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java new file mode 100644 index 000000000000..330017025cd6 --- /dev/null +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.util.stream.LongStream; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestTermStateCodecComponent extends LuceneTestCase { + + public void testGetBitWidth() { + int expectedMaxBits = random().nextInt(31) + 1; + int bitMask = 0xFFFFFFFF >>> (32 - expectedMaxBits); + int highestBit = (bitMask >>> 1) + 1; + + IntBlockTermState[] termStates = + random() + .ints(256) + .mapToObj( + docFreq -> { + var x = new IntBlockTermState(); + x.docFreq = (docFreq & bitMask) | highestBit; + return x; + }) + .toArray(IntBlockTermState[]::new); + + byte bitWidth = + TermStateCodecComponent.getBitWidth( + termStates, termStates.length, TermStateCodecComponent.DocFreq.INSTANCE); + assertEquals(expectedMaxBits, bitWidth); + } + + public void testGetBitWidthWithIncreasingValues() { + long baseValue = random().nextLong(Long.MAX_VALUE >> 1); + int expectedMaxBits = random().nextInt(63) + 1; + long bitMask = 0xFFFFFFFF_FFFFFFFFL >>> (64 - expectedMaxBits); + long highestBit = (bitMask >>> 1) + 1; + + var randomLongs = + random() + .longs(256, 0, Long.MAX_VALUE - baseValue) + .map(x -> baseValue + ((x & bitMask) | highestBit)) + .sorted(); + + IntBlockTermState[] termStates = + LongStream.concat(LongStream.of(baseValue), randomLongs) + .mapToObj( + docStartFP -> { + var x = new IntBlockTermState(); + x.docStartFP = docStartFP; + return x; + }) + .toArray(IntBlockTermState[]::new); + + byte bitWidth = + TermStateCodecComponent.getBitWidth( + termStates, termStates.length, TermStateCodecComponent.DocStartFP.INSTANCE); + assertEquals(expectedMaxBits, bitWidth); + } +} diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java new file mode 100644 index 000000000000..f9d1c416cda7 --- /dev/null +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java @@ -0,0 +1,334 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import java.util.ArrayList; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPerBytePacker; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpacker; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpackerImpl; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.FixedSizeByteArrayBitPacker; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; + +public class TestTermStateCodecImpl extends LuceneTestCase { + + public void testEncodeDecode() throws IOException { + TermStateTestFixture result = getTermStateTestFixture(256); + + BitPerBytePacker bitPerBytePacker = new BitPerBytePacker(); + byte[] metadata = result.codec().encodeBlock(result.termStatesArray(), bitPerBytePacker); + + // For the metadata, we expect + // 0: DocFreq.bitWidth, + // 1: DocStartFP.bitWidth, + // [2-10]: DocStartFP.referenceValue; + int expectedDocFreqBitWidth = 64 - Long.numberOfLeadingZeros(result.maxDocFreqSeen()); + int expectedDocStartFPBitWidth = + 64 - Long.numberOfLeadingZeros(result.maxDocStartFPDeltaSeen()); + assertEquals(10, metadata.length); + assertEquals(expectedDocFreqBitWidth, metadata[0]); + assertEquals(expectedDocStartFPBitWidth, metadata[1]); + ByteArrayDataInput byteArrayDataInput = new ByteArrayDataInput(metadata, 2, 8); + assertEquals(result.docStartFPBase(), byteArrayDataInput.readLong()); + + // Assert with real bit-packer we get the same bytes + FixedSizeByteArrayBitPacker fixedSizeByteArrayBitPacker = + new FixedSizeByteArrayBitPacker(bitPerBytePacker.getCompactBytes().length); + result.codec().encodeBlock(result.termStatesArray(), fixedSizeByteArrayBitPacker); + assertArrayEquals(bitPerBytePacker.getCompactBytes(), fixedSizeByteArrayBitPacker.getBytes()); + + // Assert that each term state is the same after the encode-decode roundtrip. + BytesRef metadataBytes = new BytesRef(metadata); + BytesRef dataBytes = new BytesRef(bitPerBytePacker.getBytes()); + assertBlockRoundTrip( + result.termStatesArray(), result.codec(), metadataBytes, dataBytes, bitPerBytePacker); + + // With real compact bits instead of bit-per-byte + dataBytes = new BytesRef(bitPerBytePacker.getCompactBytes()); + assertBlockRoundTrip( + result.termStatesArray(), + result.codec(), + metadataBytes, + dataBytes, + BitUnpackerImpl.INSTANCE); + + // Also test decoding that doesn't begin at the start of the block. + int pos = random().nextInt(result.termStatesArray().length); + int startBitIndex = pos > 0 ? 
random().nextInt(pos) : 0; + int recordSize = expectedDocFreqBitWidth + expectedDocStartFPBitWidth; + // With bit-per-byte bytes + dataBytes = + new BytesRef(bitPerBytePacker.getBytes(), pos * recordSize - startBitIndex, recordSize); + assertDecodeAt( + result.codec(), + metadataBytes, + dataBytes, + bitPerBytePacker, + startBitIndex, + result.termStatesArray()[pos]); + + // With compact bytes + int startByteIndex = pos * recordSize / 8; + int endByteIndex = (pos + 1) * recordSize / 8; + int length = endByteIndex - startByteIndex + ((pos + 1) * recordSize % 8 == 0 ? 0 : 1); + dataBytes = new BytesRef(bitPerBytePacker.getCompactBytes(), startByteIndex, length); + assertDecodeAt( + result.codec(), + metadataBytes, + dataBytes, + BitUnpackerImpl.INSTANCE, + (pos * recordSize) % 8, + result.termStatesArray()[pos]); + } + + public static TermStateTestFixture getTermStateTestFixture(int size) { + TermStateCodecImpl codec = + new TermStateCodecImpl( + new TermStateCodecComponent[] { + TermStateCodecComponent.DocFreq.INSTANCE, TermStateCodecComponent.DocStartFP.INSTANCE, + }); + + ArrayList<IntBlockTermState> termStates = new ArrayList<>(); + long maxDocFreqSeen = -1; + long docStartFPBase = random().nextLong(Long.MAX_VALUE >> 1); + long maxDocStartFPDeltaSeen = -1; + for (int i = 0; i < size; i++) { + var termState = new IntBlockTermState(); + termState.docFreq = random().nextInt(1, 1 << random().nextInt(1, 31)); + if (i == 0) { + termState.docStartFP = docStartFPBase; + } else { + termState.docStartFP = termStates.get(i - 1).docStartFP + random().nextLong(1024); + maxDocStartFPDeltaSeen = + Math.max(maxDocStartFPDeltaSeen, termState.docStartFP - docStartFPBase); + } + maxDocFreqSeen = Math.max(maxDocFreqSeen, termState.docFreq); + termStates.add(termState); + } + + IntBlockTermState[] termStatesArray = termStates.toArray(IntBlockTermState[]::new); + return new TermStateTestFixture( + codec, maxDocFreqSeen, docStartFPBase, maxDocStartFPDeltaSeen, termStatesArray); + } + + public record TermStateTestFixture( + TermStateCodecImpl codec, + long maxDocFreqSeen, + long docStartFPBase, + long maxDocStartFPDeltaSeen, + IntBlockTermState[] termStatesArray) {} + + private static void assertDecodeAt( + TermStateCodecImpl codec, + BytesRef metadataBytes, + BytesRef dataBytes, + BitUnpacker bitUnpacker, + int startBitIndex, + IntBlockTermState termState) { + IntBlockTermState decoded = + codec.decodeAt(metadataBytes, dataBytes, bitUnpacker, startBitIndex); + assertEquals(termState.docFreq, decoded.docFreq); + assertEquals(termState.docStartFP, decoded.docStartFP); + } + + private static void assertBlockRoundTrip( + IntBlockTermState[] termStatesArray, + TermStateCodecImpl codec, + BytesRef metadataBytes, + BytesRef dataBytes, + BitUnpacker bitUnpacker) { + for (int i = 0; i < termStatesArray.length; i++) { + IntBlockTermState decoded = codec.decodeWithinBlock(metadataBytes, dataBytes, bitUnpacker, i); + assertEquals(termStatesArray[i].docFreq, decoded.docFreq); + assertEquals(termStatesArray[i].docStartFP, decoded.docStartFP); + } + } + + public void testGetCodec() { + for (IndexOptions indexOptions : IndexOptions.values()) { + if (indexOptions == IndexOptions.NONE) { + continue; + } + for (int i = 0; i < 8; i++) { + if ((i & 0b011) == 0b011) { + continue; + } + if ((i & 0b100) == 0b100 + && indexOptions.ordinal() < IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + continue; + } + for (int dice = 0; dice < 2; dice++) { + boolean hasPayloads = dice == 0; + if (hasPayloads + && indexOptions.ordinal() <
IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + continue; + } + TermType termType = TermType.fromId(i); + var expected = getExpectedCodec(termType, indexOptions, hasPayloads); + var got = TermStateCodecImpl.getCodec(termType, indexOptions, hasPayloads); + assertEquals(expected, got); + } + } + } + } + + // Enumerate the expected Codec we get for (TermType, IndexOptions) pairs. + static TermStateCodecImpl getExpectedCodec( + TermType termType, IndexOptions indexOptions, boolean hasPayloads) { + ArrayList<TermStateCodecComponent> components = new ArrayList<>(); + // Wish I could code this better in Java... + switch (termType.getId()) { + // Not singleton doc; No skip data; No last position block offset + case 0b000 -> { + assert !termType.hasLastPositionBlockOffset() + && !termType.hasSkipData() + && !termType.hasSingletonDoc(); + components.add(TermStateCodecComponent.DocStartFP.INSTANCE); + components.add(TermStateCodecComponent.DocFreq.INSTANCE); + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS.ordinal()) { + components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); + } + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + if (hasPayloads) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + } + if (indexOptions.ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + } + // Singleton doc; No skip data; No last position block offset + case 0b001 -> { + assert !termType.hasLastPositionBlockOffset() + && !termType.hasSkipData() + && termType.hasSingletonDoc(); + components.add(TermStateCodecComponent.SingletonDocId.INSTANCE); + // If field needs frequency, we need totalTermsFreq. + // Since there is only 1 doc, totalTermsFreq == docFreq. + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS.ordinal()) { + components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); + } + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + if (hasPayloads) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + } + if (indexOptions.ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + } + + // Not Singleton doc; Has skip data; No last position block offset + case 0b010 -> { + assert !termType.hasLastPositionBlockOffset() + && termType.hasSkipData() + && !termType.hasSingletonDoc(); + components.add(TermStateCodecComponent.DocStartFP.INSTANCE); + components.add(TermStateCodecComponent.DocFreq.INSTANCE); + components.add(TermStateCodecComponent.SkipOffset.INSTANCE); + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS.ordinal()) { + components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); + } + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + if (hasPayloads) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + } + if (indexOptions.ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + } + // Singleton doc but has skip data; Invalid state.
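+ // (The case labels in this switch read as bit flags: bit 0 = singleton doc, bit 1 = has skip + // data, bit 2 = has last position block offset; hence 0b011 and 0b111, which combine bits 0 + // and 1, cannot occur.)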
+ case 0b011, 0b111 -> { + assert termType.hasSkipData() && termType.hasSingletonDoc(); + throw new IllegalStateException( + "Unreachable. A term has skip data but also only has one doc!? Must be a bug"); + } + // Not singleton doc; No skip data; Has last position block offset; + case 0b100 -> { + assert termType.hasLastPositionBlockOffset() + && !termType.hasSkipData() + && !termType.hasSingletonDoc(); + assert indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal(); + components.add(TermStateCodecComponent.DocStartFP.INSTANCE); + components.add(TermStateCodecComponent.DocFreq.INSTANCE); + components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); + components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + if (hasPayloads) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + components.add(TermStateCodecComponent.LastPositionBlockOffset.INSTANCE); + if (indexOptions.ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + } + // Singleton doc; No skip data; Has last position block offset; + case 0b101 -> { + assert termType.hasLastPositionBlockOffset() + && !termType.hasSkipData() + && termType.hasSingletonDoc(); + assert indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal(); + components.add(TermStateCodecComponent.SingletonDocId.INSTANCE); + components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); + components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + if (hasPayloads) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + components.add(TermStateCodecComponent.LastPositionBlockOffset.INSTANCE); + if (indexOptions.ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + } + // Not singleton doc; Has skip data; Has last position block offset; + case 0b110 -> { + assert termType.hasLastPositionBlockOffset() + && termType.hasSkipData() + && !termType.hasSingletonDoc(); + assert indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal(); + components.add(TermStateCodecComponent.DocStartFP.INSTANCE); + components.add(TermStateCodecComponent.DocFreq.INSTANCE); + components.add(TermStateCodecComponent.SkipOffset.INSTANCE); + components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); + components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + if (hasPayloads) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + components.add(TermStateCodecComponent.LastPositionBlockOffset.INSTANCE); + if (indexOptions.ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + } + default -> throw new IllegalStateException("Unreachable"); + } + + return new TermStateCodecImpl(components.toArray(TermStateCodecComponent[]::new)); + } +} diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java new file mode 100644 index 000000000000..1dad8688fc41 --- /dev/null +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; + +public class TestTermsIndexBuilder extends LuceneTestCase { + + public void testBasics() throws IOException { + String[] termTerms = { + "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", + }; + + Map<String, Integer> termsToType = new HashMap<>(); + Map<String, Integer> termsToOrd = new HashMap<>(); + Map<Integer, Integer> typeCounters = new HashMap<>(); + + for (String term : termTerms) { + int termType = random().nextInt(TermType.NUM_TOTAL_TYPES); + termsToType.put(term, termType); + int ord = typeCounters.getOrDefault(termType, -1) + 1; + typeCounters.put(termType, ord); + termsToOrd.put(term, ord); + } + + TermsIndexBuilder builder = new TermsIndexBuilder(); + for (String term : termTerms) { + BytesRef termBytes = new BytesRef(term); + builder.addTerm(termBytes, TermType.fromId(termsToType.get(term))); + } + TermsIndex termsIndex = builder.build(); + + byte[] metaBytes = new byte[4096]; + byte[] dataBytes = new byte[4096]; + DataOutput metaOut = new ByteArrayDataOutput(metaBytes); + DataOutput dataOutput = new ByteArrayDataOutput(dataBytes); + + termsIndex.serialize(metaOut, dataOutput); + + TermsIndexPrimitive termsIndexPrimitive = + TermsIndexPrimitive.deserialize( + new ByteArrayDataInput(metaBytes), new ByteArrayDataInput(dataBytes), false); + + for (String term : termTerms) { + BytesRef termBytes = new BytesRef(term); + TermsIndex.TypeAndOrd typeAndOrd = termsIndexPrimitive.getTerm(termBytes); + + assertEquals(termsToType.get(term).intValue(), typeAndOrd.termType().getId()); + assertEquals((long) termsToOrd.get(term), typeAndOrd.ord()); + } + } +} diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java new file mode 100644 index 000000000000..8937c5f9e319 --- /dev/null +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; + +public class TestTermsStats extends LuceneTestCase { + + public void testRoundTrip() throws IOException { + TermsStats expected = makeRandom(); + + try (Directory dir = newDirectory()) { + IndexOutput output = dir.createOutput("terms_stats", IOContext.DEFAULT); + expected.serialize(output); + output.close(); + + IndexInput input = dir.openInput("terms_stats", IOContext.DEFAULT); + TermsStats actual = TermsStats.deserialize(input); + + assertEquals(expected, actual); + input.close(); + } + } + + private TermsStats makeRandom() { + byte[] minBytes = getRandomBytes(); + byte[] maxBytes = getRandomBytes(); + return new TermsStats( + random().nextInt(1, Integer.MAX_VALUE), + random().nextLong(1, Long.MAX_VALUE), + random().nextLong(1, Long.MAX_VALUE), + random().nextLong(1, Long.MAX_VALUE), + random().nextInt(1, Integer.MAX_VALUE), + new BytesRef(minBytes), + new BytesRef(maxBytes)); + } + + private static byte[] getRandomBytes() { + byte[] bytes = new byte[random().nextInt(100)]; + random().nextBytes(bytes); + return bytes; + } +} diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java new file mode 100644 index 000000000000..b1bf4bfa463e --- /dev/null +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; + +import java.util.ArrayList; +import org.apache.lucene.util.BytesRef; + +/** + * A wasteful bit packer that uses a whole byte to keep each bit. Useful for tests. It uses + * little-endian bit order.
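+ * + * <p>For example, add(0b101, 3) appends the bytes {1, 0, 1} (lowest bit first), and + * getCompactBytes() would repack those three bytes into the low three bits of a single byte.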
+ */ +public class BitPerBytePacker implements BitPacker, BitUnpacker { + private final ArrayList<Byte> buffer = new ArrayList<>(); + + private int totalNumBits = 0; + + @Override + public void add(long value, int numBits) { + assert numBits < 64; + totalNumBits += numBits; + while (numBits-- > 0) { + byte b = (byte) (value & 1L); + value = value >>> 1; + buffer.add(b); + } + } + + @Override + public void flush() { + // No-op as this impl writes a byte per bit + } + + public byte[] getBytes() { + byte[] bytes = new byte[totalNumBits]; + int index = 0; + for (var b : buffer) { + bytes[index++] = b; + } + + return bytes; + } + + public byte[] getCompactBytes() { + if (totalNumBits == 0) { + return new byte[0]; + } + + int len = (totalNumBits - 1) / 8 + 1; // round up + byte[] bytes = new byte[len]; + + int remainingBits = totalNumBits; + int pos = 0; + while (remainingBits >= 8) { + byte b = 0; + int base = pos * 8; + for (int i = 0; i < 8; i++) { + b |= (byte) ((buffer.get(base + i) & 1) << i); + } + bytes[pos++] = b; + remainingBits -= 8; + } + + if (remainingBits > 0) { + byte b = 0; + int base = pos * 8; + for (int i = 0; i < remainingBits; i++) { + b |= (byte) ((buffer.get(base + i) & 1) << i); + } + bytes[pos] = b; + } + + return bytes; + } + + @Override + public long unpack(BytesRef bytesRef, int startBitIndex, int bitWidth) { + long res = 0; + for (int i = 0; i < bitWidth; i++) { + res |= ((long) (bytesRef.bytes[bytesRef.offset + startBitIndex + i] & 1)) << i; + } + return res; + } +} diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java new file mode 100644 index 000000000000..9f50777176ac --- /dev/null +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; + +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestBitPackerImpl extends LuceneTestCase { + + public void testBasic() throws IOException { + FixedSizeByteArrayBitPacker fixedSizeByteArrayBitPacker = new FixedSizeByteArrayBitPacker(5); + for (int i = 1; i <= 10; i++) { + fixedSizeByteArrayBitPacker.add(i, 4); + } + fixedSizeByteArrayBitPacker.flush(); + + byte[] expectedBytes = new byte[] {0x21, 0x43, 0x65, (byte) 0x87, (byte) 0xA9}; + assertArrayEquals(expectedBytes, fixedSizeByteArrayBitPacker.getBytes()); + } + + public void testRandom() throws IOException { + ValueAndBitWidth[] randomInputs = ValueAndBitWidth.getRandomArray(random(), 1000); + int totalNumberBits = Arrays.stream(randomInputs).mapToInt(ValueAndBitWidth::bitWidth).sum(); + + BitPerBytePacker referencePacker = new BitPerBytePacker(); + int capacity = totalNumberBits / 8 + (totalNumberBits % 8 == 0 ? 0 : 1); + FixedSizeByteArrayBitPacker fixedSizeByteArrayBitPacker = + new FixedSizeByteArrayBitPacker(capacity); + + for (ValueAndBitWidth x : randomInputs) { + referencePacker.add(x.value(), x.bitWidth()); + fixedSizeByteArrayBitPacker.add(x.value(), x.bitWidth()); + } + referencePacker.flush(); + fixedSizeByteArrayBitPacker.flush(); + assertArrayEquals(referencePacker.getCompactBytes(), fixedSizeByteArrayBitPacker.getBytes()); + } +} diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java new file mode 100644 index 000000000000..2cc106b669e2 --- /dev/null +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; + +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; + +public class TestBitUnpackerImpl extends LuceneTestCase { + + public void testUnpackBasics() { + byte[] bytes = new byte[] {0x21, 0x43, 0x65, (byte) 0x87, (byte) 0xA9}; + BytesRef bytesRef = new BytesRef(bytes); + + for (int i = 1; i <= 10; i++) { + long val = BitUnpackerImpl.INSTANCE.unpack(bytesRef, (i - 1) * 4, 4); + assertEquals((long) i, val); + } + } + + public void testRandom() { + ValueAndBitWidth[] expected = ValueAndBitWidth.getRandomArray(random(), 1000); + + BitPerBytePacker referencePacker = new BitPerBytePacker(); + for (var x : expected) { + referencePacker.add(x.value(), x.bitWidth()); + } + + BytesRef bytes = new BytesRef(referencePacker.getCompactBytes()); + int startBitIndex = 0; + for (var x : expected) { + long unpacked = BitUnpackerImpl.INSTANCE.unpack(bytes, startBitIndex, x.bitWidth()); + startBitIndex += x.bitWidth(); + assertEquals(x.value(), unpacked); + } + } +} diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/ValueAndBitWidth.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/ValueAndBitWidth.java new file mode 100644 index 000000000000..40bee28660fa --- /dev/null +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/ValueAndBitWidth.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; + +import java.util.Random; + +record ValueAndBitWidth(long value, int bitWidth) { + + static ValueAndBitWidth[] getRandomArray(Random random, int size) { + return random + .longs(size, 0, Long.MAX_VALUE) + .mapToObj( + val -> { + int bitWidth = random.nextInt(1, 64); + val &= (1L << bitWidth) - 1; + return new ValueAndBitWidth(val, bitWidth); + }) + .toArray(ValueAndBitWidth[]::new); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/store/IndexInput.java b/lucene/core/src/java/org/apache/lucene/store/IndexInput.java index 3f703bc54b26..4307376cffbf 100644 --- a/lucene/core/src/java/org/apache/lucene/store/IndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/IndexInput.java @@ -141,7 +141,7 @@ protected String getFullSliceDescription(String sliceDescription) { * implements absolute reads as seek+read. 
*/ public RandomAccessInput randomAccessSlice(long offset, long length) throws IOException { - final IndexInput slice = slice("randomaccess", offset, length); + final IndexInput slice = slice(null, offset, length); if (slice instanceof RandomAccessInput) { // slice() already supports random access return (RandomAccessInput) slice; diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java index 6ff52baebbc5..761cf9b77035 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java @@ -228,6 +228,7 @@ public void getTransition(int state, int index, Transition t) { } else { t.max = points[t.transitionUpto + 1] - 1; } + t.dest = dStates[t.source].transitions[t.transitionUpto]; } private class DState { diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/BytesRefPrimitiveLongFSTEnum.java b/lucene/core/src/java/org/apache/lucene/util/fst/BytesRefPrimitiveLongFSTEnum.java new file mode 100644 index 000000000000..1aa5b03e7bb5 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/fst/BytesRefPrimitiveLongFSTEnum.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util.fst; + +import java.io.IOException; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; + +/** + * Enumerates all input (BytesRef) + output pairs in a {@link PrimitiveLongFST}. + * + * @lucene.experimental + */ +public final class BytesRefPrimitiveLongFSTEnum extends PrimitiveLongFSTEnum { + private final BytesRef current = new BytesRef(10); + private final InputOutput result = new InputOutput(); + private BytesRef target; + + /** Holds a single input (BytesRef) + output pair. */ + public static class InputOutput { + public BytesRef input; + public long output; + } + + /** + * doFloor controls the behavior of advance: if doFloor is true, advance positions to the + * biggest term before target. + */ + public BytesRefPrimitiveLongFSTEnum(PrimitiveLongFST fst) { + super(fst); + result.input = current; + current.offset = 1; + } + + public InputOutput current() { + return result; + } + + public InputOutput next() throws IOException { + // System.out.println(" enum.next"); + doNext(); + return setResult(); + } + + /** Seeks to smallest term that's >= target. */ + public InputOutput seekCeil(BytesRef target) throws IOException { + this.target = target; + targetLength = target.length; + super.doSeekCeil(); + return setResult(); + } + + /** Seeks to biggest term that's <= target.
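+ * For example, over the inputs {"aa", "ac"}, seekFloor with target "ab" positions the enum + * on "aa", while seekCeil with the same target would position it on "ac".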
*/ + public InputOutput seekFloor(BytesRef target) throws IOException { + this.target = target; + targetLength = target.length; + super.doSeekFloor(); + return setResult(); + } + + /** + * Seeks to exactly this term, returning null if the term doesn't exist. This is faster than using + * {@link #seekFloor} or {@link #seekCeil} because it short-circuits as soon the match is not + * found. + */ + public InputOutput seekExact(BytesRef target) throws IOException { + this.target = target; + targetLength = target.length; + if (doSeekExact()) { + assert upto == 1 + target.length; + return setResult(); + } else { + return null; + } + } + + @Override + protected int getTargetLabel() { + if (upto - 1 == target.length) { + return FST.END_LABEL; + } else { + return target.bytes[target.offset + upto - 1] & 0xFF; + } + } + + @Override + protected int getCurrentLabel() { + // current.offset fixed at 1 + return current.bytes[upto] & 0xFF; + } + + @Override + protected void setCurrentLabel(int label) { + current.bytes[upto] = (byte) label; + } + + @Override + protected void grow() { + current.bytes = ArrayUtil.grow(current.bytes, upto + 1); + } + + private InputOutput setResult() { + if (upto == 0) { + return null; + } else { + current.length = upto - 1; + result.output = output[upto]; + return result; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFST.java b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFST.java new file mode 100644 index 000000000000..900675090f97 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFST.java @@ -0,0 +1,1330 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util.fst; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.store.ByteBuffersDataOutput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.InputStreamDataInput; +import org.apache.lucene.store.OutputStreamDataOutput; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.Constants; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.fst.FST.BytesReader; +import org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongArc.BitTable; + +/** + * HACK! + * + *
* <p>
A copy of {@link FST} but remove generics to work with primitive types and avoid + * boxing-unboxing. + * + * @lucene.experimental + */ +public final class PrimitiveLongFST implements Accountable { + + final PrimitiveLongFSTMetadata metadata; + + /** Specifies allowed range of each int input label for this FST. */ + public enum INPUT_TYPE { + BYTE1, + BYTE2, + BYTE4 + } + + private static final long BASE_RAM_BYTES_USED = + RamUsageEstimator.shallowSizeOfInstance(PrimitiveLongFST.class); + + static final int BIT_FINAL_ARC = 1 << 0; + static final int BIT_LAST_ARC = 1 << 1; + static final int BIT_TARGET_NEXT = 1 << 2; + + // TODO: we can free up a bit if we can nuke this: + static final int BIT_STOP_NODE = 1 << 3; + + /** This flag is set if the arc has an output. */ + public static final int BIT_ARC_HAS_OUTPUT = 1 << 4; + + static final int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5; + + /** + * Value of the arc flags to declare a node with fixed length (sparse) arcs designed for binary + * search. + */ + // We use this as a marker because this one flag is illegal by itself. + public static final byte ARCS_FOR_BINARY_SEARCH = BIT_ARC_HAS_FINAL_OUTPUT; + + /** + * Value of the arc flags to declare a node with fixed length dense arcs and bit table designed + * for direct addressing. + */ + static final byte ARCS_FOR_DIRECT_ADDRESSING = 1 << 6; + + /** + * Value of the arc flags to declare a node with continuous arcs designed for pos the arc directly + * with labelToPos - firstLabel. like {@link #ARCS_FOR_BINARY_SEARCH} we use flag combinations + * that will not occur at the same time. + */ + static final byte ARCS_FOR_CONTINUOUS = ARCS_FOR_DIRECT_ADDRESSING + ARCS_FOR_BINARY_SEARCH; + + // Increment version to change it + private static final String FILE_FORMAT_NAME = "FST"; + private static final int VERSION_START = 6; + private static final int VERSION_LITTLE_ENDIAN = 8; + private static final int VERSION_CONTINUOUS_ARCS = 9; + static final int VERSION_CURRENT = VERSION_CONTINUOUS_ARCS; + + // Never serialized; just used to represent the virtual + // final node w/ no arcs: + static final long FINAL_END_NODE = -1; + + // Never serialized; just used to represent the virtual + // non-final node w/ no arcs: + static final long NON_FINAL_END_NODE = 0; + + /** If arc has this label then that arc is final/accepted */ + public static final int END_LABEL = -1; + + /** + * A {@link BytesStore}, used during building, or during reading when the FST is very large (more + * than 1 GB). If the FST is less than 1 GB then bytesArray is set instead. + */ + private final FSTReader fstReader; + + public final PrimitiveLongFSTOutputs outputs; + + /** Represents a single arc. */ + public static final class PrimitiveLongArc { + + // *** Arc fields. + + private int label; + + private long output; + + private long target; + + private byte flags; + + private long nextFinalOutput; + + private long nextArc; + + private byte nodeFlags; + + // *** Fields for arcs belonging to a node with fixed length arcs. + // So only valid when bytesPerArc != 0. + // nodeFlags == ARCS_FOR_BINARY_SEARCH || nodeFlags == ARCS_FOR_DIRECT_ADDRESSING. + + private int bytesPerArc; + + private long posArcsStart; + + private int arcIdx; + + private int numArcs; + + // *** Fields for a direct addressing node. nodeFlags == ARCS_FOR_DIRECT_ADDRESSING. 
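+ // (As described by the bit-table docs below: a direct addressing node stores one presence bit
+ // per label in [firstLabel, firstLabel + numArcs); a set bit at index i means the label
+ // firstLabel + i has an outgoing arc.)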
+ + /** + * Start position in the {@link BytesReader} of the presence bits for a direct addressing node, + * aka the bit-table + */ + private long bitTableStart; + + /** First label of a direct addressing node. */ + private int firstLabel; + + /** + * Index of the current label of a direct addressing node. While {@link #arcIdx} is the current + * index in the label range, {@link #presenceIndex} is its corresponding index in the list of + * actually present labels. It is equal to the number of bits set before the bit at {@link + * #arcIdx} in the bit-table. This field is a cache to avoid to count bits set repeatedly when + * iterating the next arcs. + */ + private int presenceIndex; + + /** Returns this */ + public PrimitiveLongArc copyFrom(PrimitiveLongArc other) { + label = other.label(); + target = other.target(); + flags = other.flags(); + output = other.output(); + nextFinalOutput = other.nextFinalOutput(); + nextArc = other.nextArc(); + nodeFlags = other.nodeFlags(); + bytesPerArc = other.bytesPerArc(); + + // Fields for arcs belonging to a node with fixed length arcs. + // We could avoid copying them if bytesPerArc() == 0 (this was the case with previous code, + // and the current code + // still supports that), but it may actually help external uses of FST to have consistent arc + // state, and debugging + // is easier. + posArcsStart = other.posArcsStart(); + arcIdx = other.arcIdx(); + numArcs = other.numArcs(); + bitTableStart = other.bitTableStart; + firstLabel = other.firstLabel(); + presenceIndex = other.presenceIndex; + + return this; + } + + boolean flag(int flag) { + return PrimitiveLongFST.flag(flags, flag); + } + + public boolean isLast() { + return flag(BIT_LAST_ARC); + } + + public boolean isFinal() { + return flag(BIT_FINAL_ARC); + } + + @Override + public String toString() { + StringBuilder b = new StringBuilder(); + b.append(" target=").append(target()); + b.append(" label=0x").append(Integer.toHexString(label())); + if (flag(BIT_FINAL_ARC)) { + b.append(" final"); + } + if (flag(BIT_LAST_ARC)) { + b.append(" last"); + } + if (flag(BIT_TARGET_NEXT)) { + b.append(" targetNext"); + } + if (flag(BIT_STOP_NODE)) { + b.append(" stop"); + } + if (flag(BIT_ARC_HAS_OUTPUT)) { + b.append(" output=").append(output()); + } + if (flag(BIT_ARC_HAS_FINAL_OUTPUT)) { + b.append(" nextFinalOutput=").append(nextFinalOutput()); + } + if (bytesPerArc() != 0) { + b.append(" arcArray(idx=") + .append(arcIdx()) + .append(" of ") + .append(numArcs()) + .append(")") + .append("(") + .append( + nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING + ? "da" + : nodeFlags() == ARCS_FOR_CONTINUOUS ? "cs" : "bs") + .append(")"); + } + return b.toString(); + } + + public int label() { + return label; + } + + public long output() { + return output; + } + + /** Ord/address to target node. */ + public long target() { + return target; + } + + public byte flags() { + return flags; + } + + public long nextFinalOutput() { + return nextFinalOutput; + } + + /** + * Address (into the byte[]) of the next arc - only for list of variable length arc. Or + * ord/address to the next node if label == {@link #END_LABEL}. + */ + long nextArc() { + return nextArc; + } + + /** Where we are in the array; only valid if bytesPerArc != 0. */ + public int arcIdx() { + return arcIdx; + } + + /** + * Node header flags. Only meaningful to check if the value is either {@link + * #ARCS_FOR_BINARY_SEARCH} or {@link #ARCS_FOR_DIRECT_ADDRESSING} or {@link + * #ARCS_FOR_CONTINUOUS} (other value when bytesPerArc == 0). 
+ */ + public byte nodeFlags() { + return nodeFlags; + } + + /** Where the first arc in the array starts; only valid if bytesPerArc != 0 */ + public long posArcsStart() { + return posArcsStart; + } + + /** + * Non-zero if this arc is part of a node with fixed length arcs, which means all arcs for the + * node are encoded with a fixed number of bytes so that we binary search or direct address. We + * do when there are enough arcs leaving one node. It wastes some bytes but gives faster + * lookups. + */ + public int bytesPerArc() { + return bytesPerArc; + } + + /** + * How many arcs; only valid if bytesPerArc != 0 (fixed length arcs). For a node designed for + * binary search this is the array size. For a node designed for direct addressing, this is the + * label range. + */ + public int numArcs() { + return numArcs; + } + + /** + * First label of a direct addressing node. Only valid if nodeFlags == {@link + * #ARCS_FOR_DIRECT_ADDRESSING} or {@link #ARCS_FOR_CONTINUOUS}. + */ + int firstLabel() { + return firstLabel; + } + + /** + * Helper methods to read the bit-table of a direct addressing node. Only valid for {@link + * PrimitiveLongArc} with {@link PrimitiveLongArc#nodeFlags()} == {@code + * ARCS_FOR_DIRECT_ADDRESSING}. + */ + static class BitTable { + + /** See {@link BitTableUtil#isBitSet(int, BytesReader)}. */ + static boolean isBitSet(int bitIndex, PrimitiveLongArc arc, BytesReader in) + throws IOException { + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + in.setPosition(arc.bitTableStart); + return BitTableUtil.isBitSet(bitIndex, in); + } + + /** + * See {@link BitTableUtil#countBits(int, BytesReader)}. The count of bit set is the number of + * arcs of a direct addressing node. + */ + static int countBits(PrimitiveLongArc arc, BytesReader in) throws IOException { + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + in.setPosition(arc.bitTableStart); + return BitTableUtil.countBits(getNumPresenceBytes(arc.numArcs()), in); + } + + /** See {@link BitTableUtil#countBitsUpTo(int, BytesReader)}. */ + static int countBitsUpTo(int bitIndex, PrimitiveLongArc arc, BytesReader in) + throws IOException { + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + in.setPosition(arc.bitTableStart); + return BitTableUtil.countBitsUpTo(bitIndex, in); + } + + /** See {@link BitTableUtil#nextBitSet(int, int, BytesReader)}. */ + static int nextBitSet(int bitIndex, PrimitiveLongArc arc, BytesReader in) throws IOException { + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + in.setPosition(arc.bitTableStart); + return BitTableUtil.nextBitSet(bitIndex, getNumPresenceBytes(arc.numArcs()), in); + } + + /** See {@link BitTableUtil#previousBitSet(int, BytesReader)}. */ + static int previousBitSet(int bitIndex, PrimitiveLongArc arc, BytesReader in) + throws IOException { + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + in.setPosition(arc.bitTableStart); + return BitTableUtil.previousBitSet(bitIndex, in); + } + + /** Asserts the bit-table of the provided {@link PrimitiveLongArc} is valid. */ + static boolean assertIsValid(PrimitiveLongArc arc, BytesReader in) throws IOException { + assert arc.bytesPerArc() > 0; + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + // First bit must be set. + assert isBitSet(0, arc, in); + // Last bit must be set. + assert isBitSet(arc.numArcs() - 1, arc, in); + // No bit set after the last arc. 
+ assert nextBitSet(arc.numArcs() - 1, arc, in) == -1;
+ return true;
+ }
+ }
+ }
+
+ private static boolean flag(int flags, int bit) {
+ return (flags & bit) != 0;
+ }
+
+ private static final int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 30 : 28;
+
+ /**
+ * Load a previously saved FST with a DataInput for metadata using an {@link OnHeapFSTStore} with
+ * maxBlockBits set to {@link #DEFAULT_MAX_BLOCK_BITS}.
+ */
+ public PrimitiveLongFST(PrimitiveLongFSTMetadata metadata, DataInput in) throws IOException {
+ this(metadata, in, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS));
+ }
+
+ /**
+ * Load a previously saved FST with a metadata object and an FSTStore. If using {@link
+ * OnHeapFSTStore}, setting maxBlockBits allows you to control the size of the byte[] pages used
+ * to hold the FST bytes.
+ */
+ public PrimitiveLongFST(PrimitiveLongFSTMetadata metadata, DataInput in, FSTStore fstStore)
+ throws IOException {
+ this(metadata, fstStore.init(in, metadata.numBytes));
+ }
+
+ /** Create the FST with a metadata object and a FSTReader. */
+ PrimitiveLongFST(PrimitiveLongFSTMetadata metadata, FSTReader fstReader) {
+ this.metadata = metadata;
+ this.outputs = metadata.outputs;
+ this.fstReader = fstReader;
+ }
+
+ /**
+ * Read the FST metadata from DataInput
+ *
+ * @param metaIn the DataInput of the metadata
+ * @param outputs the FST outputs
+ * @return the FST metadata
+ * @throws IOException if an exception occurred during parsing
+ */
+ public static PrimitiveLongFSTMetadata readMetadata(
+ DataInput metaIn, PrimitiveLongFSTOutputs outputs) throws IOException {
+ // NOTE: only reads formats VERSION_START up to VERSION_CURRENT; we don't have
+ // back-compat promise for FSTs (they are experimental), but we are sometimes able to offer it
+ int version = CodecUtil.checkHeader(metaIn, FILE_FORMAT_NAME, VERSION_START, VERSION_CURRENT);
+ Long emptyOutput;
+ if (metaIn.readByte() == 1) {
+ // accepts empty string
+ // 1 KB blocks:
+ BytesStore emptyBytes = new BytesStore(10);
+ int numBytes = metaIn.readVInt();
+ emptyBytes.copyBytes(metaIn, numBytes);
+
+ // De-serialize empty-string output:
+ BytesReader reader = emptyBytes.getReverseBytesReader();
+ // NoOutputs uses 0 bytes when writing its output,
+ // so we have to check here else BytesStore gets
+ // angry:
+ if (numBytes > 0) {
+ reader.setPosition(numBytes - 1);
+ }
+ emptyOutput = outputs.readFinalOutput(reader);
+ } else {
+ emptyOutput = null;
+ }
+ INPUT_TYPE inputType;
+ final byte t = metaIn.readByte();
+ switch (t) {
+ case 0:
+ inputType = INPUT_TYPE.BYTE1;
+ break;
+ case 1:
+ inputType = INPUT_TYPE.BYTE2;
+ break;
+ case 2:
+ inputType = INPUT_TYPE.BYTE4;
+ break;
+ default:
+ throw new CorruptIndexException("invalid input type " + t, metaIn);
+ }
+ long startNode = metaIn.readVLong();
+ long numBytes = metaIn.readVLong();
+ return new PrimitiveLongFSTMetadata(
+ inputType, outputs, emptyOutput, startNode, version, numBytes);
+ }
+
+ @Override
+ public long ramBytesUsed() {
+ return BASE_RAM_BYTES_USED + fstReader.ramBytesUsed();
+ }
+
+ @Override
+ public String toString() {
+ return getClass().getSimpleName() + "(input=" + metadata.inputType + ",output=" + outputs + ")";
+ }
+
+ public long numBytes() {
+ return metadata.numBytes;
+ }
+
+ public long getEmptyOutput() {
+ return metadata.emptyOutput.longValue();
+ }
+
+ public PrimitiveLongFSTMetadata getMetadata() {
+ return metadata;
+ }
+
+ public void save(DataOutput metaOut, DataOutput out) throws IOException {
+ saveMetadata(metaOut);
+ fstReader.writeTo(out);
+ }
+
+ /**
+ * Save the metadata to a DataOutput + * + * @param metaOut the DataOutput to save + */ + public void saveMetadata(DataOutput metaOut) throws IOException { + CodecUtil.writeHeader(metaOut, FILE_FORMAT_NAME, VERSION_CURRENT); + if (metadata.emptyOutput != null) { + // Accepts empty string + metaOut.writeByte((byte) 1); + + // Serialize empty-string output: + ByteBuffersDataOutput ros = new ByteBuffersDataOutput(); + outputs.writeFinalOutput(metadata.emptyOutput.longValue(), ros); + byte[] emptyOutputBytes = ros.toArrayCopy(); + int emptyLen = emptyOutputBytes.length; + + // reverse + final int stopAt = emptyLen / 2; + int upto = 0; + while (upto < stopAt) { + final byte b = emptyOutputBytes[upto]; + emptyOutputBytes[upto] = emptyOutputBytes[emptyLen - upto - 1]; + emptyOutputBytes[emptyLen - upto - 1] = b; + upto++; + } + metaOut.writeVInt(emptyLen); + metaOut.writeBytes(emptyOutputBytes, 0, emptyLen); + } else { + metaOut.writeByte((byte) 0); + } + + final byte t; + if (metadata.inputType == INPUT_TYPE.BYTE1) { + t = 0; + } else if (metadata.inputType == INPUT_TYPE.BYTE2) { + t = 1; + } else { + t = 2; + } + metaOut.writeByte(t); + metaOut.writeVLong(metadata.startNode); + metaOut.writeVLong(numBytes()); + } + + /** Writes an automaton to a file. */ + public void save(final Path path) throws IOException { + try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(path))) { + DataOutput out = new OutputStreamDataOutput(os); + save(out, out); + } + } + + /** Reads an automaton from a file. */ + public static PrimitiveLongFST read(Path path, PrimitiveLongFSTOutputs outputs) + throws IOException { + try (InputStream is = Files.newInputStream(path)) { + DataInput in = new InputStreamDataInput(new BufferedInputStream(is)); + return new PrimitiveLongFST(readMetadata(in, outputs), in); + } + } + + /** Reads one BYTE1/2/4 label from the provided {@link DataInput}. */ + public int readLabel(DataInput in) throws IOException { + final int v; + if (metadata.inputType == INPUT_TYPE.BYTE1) { + // Unsigned byte: + v = in.readByte() & 0xFF; + } else if (metadata.inputType == INPUT_TYPE.BYTE2) { + // Unsigned short: + if (metadata.version < VERSION_LITTLE_ENDIAN) { + v = Short.reverseBytes(in.readShort()) & 0xFFFF; + } else { + v = in.readShort() & 0xFFFF; + } + } else { + v = in.readVInt(); + } + return v; + } + + /** returns true if the node at this address has any outgoing arcs */ + public static boolean targetHasArcs(PrimitiveLongArc arc) { + return arc.target() > 0; + } + + /** + * Gets the number of bytes required to flag the presence of each arc in the given label range, + * one bit per arc. + */ + static int getNumPresenceBytes(int labelRange) { + assert labelRange >= 0; + return (labelRange + 7) >> 3; + } + + /** + * Reads the presence bits of a direct-addressing node. Actually we don't read them here, we just + * keep the pointer to the bit-table start and we skip them. 
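+ *
+ * <p>As a worked example of {@link #getNumPresenceBytes}: a node whose labels span a range of
+ * 20 needs (20 + 7) >> 3 = 3 presence bytes, i.e. one bit per label in the range, rounded up to
+ * whole bytes.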
+ */ + private void readPresenceBytes(PrimitiveLongArc arc, BytesReader in) throws IOException { + assert arc.bytesPerArc() > 0; + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + arc.bitTableStart = in.getPosition(); + in.skipBytes(getNumPresenceBytes(arc.numArcs())); + } + + /** Fills virtual 'start' arc, ie, an empty incoming arc to the FST's start node */ + public PrimitiveLongArc getFirstArc(PrimitiveLongArc arc) { + long NO_OUTPUT = outputs.getNoOutput(); + + if (metadata.emptyOutput != null) { + arc.flags = BIT_FINAL_ARC | BIT_LAST_ARC; + arc.nextFinalOutput = metadata.emptyOutput.longValue(); + if (metadata.emptyOutput.longValue() != NO_OUTPUT) { + arc.flags = (byte) (arc.flags() | BIT_ARC_HAS_FINAL_OUTPUT); + } + } else { + arc.flags = BIT_LAST_ARC; + arc.nextFinalOutput = NO_OUTPUT; + } + arc.output = NO_OUTPUT; + + // If there are no nodes, ie, the FST only accepts the + // empty string, then startNode is 0 + arc.target = metadata.startNode; + return arc; + } + + /** + * Follows the follow arc and reads the last arc of its target; this changes the + * provided arc (2nd arg) in-place and returns it. + * + * @return Returns the second argument (arc). + */ + PrimitiveLongArc readLastTargetArc(PrimitiveLongArc follow, PrimitiveLongArc arc, BytesReader in) + throws IOException { + // System.out.println("readLast"); + if (!targetHasArcs(follow)) { + // System.out.println(" end node"); + assert follow.isFinal(); + arc.label = END_LABEL; + arc.target = FINAL_END_NODE; + arc.output = follow.nextFinalOutput(); + arc.flags = BIT_LAST_ARC; + arc.nodeFlags = arc.flags; + return arc; + } else { + in.setPosition(follow.target()); + byte flags = arc.nodeFlags = in.readByte(); + if (flags == ARCS_FOR_BINARY_SEARCH + || flags == ARCS_FOR_DIRECT_ADDRESSING + || flags == ARCS_FOR_CONTINUOUS) { + // Special arc which is actually a node header for fixed length arcs. + // Jump straight to end to find the last arc. + arc.numArcs = in.readVInt(); + arc.bytesPerArc = in.readVInt(); + // System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc); + if (flags == ARCS_FOR_DIRECT_ADDRESSING) { + readPresenceBytes(arc, in); + arc.firstLabel = readLabel(in); + arc.posArcsStart = in.getPosition(); + readLastArcByDirectAddressing(arc, in); + } else if (flags == ARCS_FOR_BINARY_SEARCH) { + arc.arcIdx = arc.numArcs() - 2; + arc.posArcsStart = in.getPosition(); + readNextRealArc(arc, in); + } else { + arc.firstLabel = readLabel(in); + arc.posArcsStart = in.getPosition(); + readLastArcByContinuous(arc, in); + } + } else { + arc.flags = flags; + // non-array: linear scan + arc.bytesPerArc = 0; + // System.out.println(" scan"); + while (!arc.isLast()) { + // skip this arc: + readLabel(in); + if (arc.flag(BIT_ARC_HAS_OUTPUT)) { + outputs.skipOutput(in); + } + if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { + outputs.skipFinalOutput(in); + } + if (arc.flag(BIT_STOP_NODE)) { + } else if (arc.flag(BIT_TARGET_NEXT)) { + } else { + readUnpackedNodeTarget(in); + } + arc.flags = in.readByte(); + } + // Undo the byte flags we read: + in.skipBytes(-1); + arc.nextArc = in.getPosition(); + readNextRealArc(arc, in); + } + assert arc.isLast(); + return arc; + } + } + + private long readUnpackedNodeTarget(BytesReader in) throws IOException { + return in.readVLong(); + } + + /** + * Follow the follow arc and read the first arc of its target; this changes the + * provided arc (2nd arg) in-place and returns it. + * + * @return Returns the second argument (arc). 
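+ * <p>A minimal iteration sketch (hypothetical caller) combining this method with {@link
+ * #readNextArc}:
+ *
+ * <pre>{@code
+ * PrimitiveLongArc scratch = fst.readFirstTargetArc(follow, new PrimitiveLongArc(), in);
+ * while (true) {
+ *   // consume scratch.label() / scratch.output() here
+ *   if (scratch.isLast()) break;
+ *   fst.readNextArc(scratch, in);
+ * }
+ * }</pre>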
+ */ + public PrimitiveLongArc readFirstTargetArc( + PrimitiveLongArc follow, PrimitiveLongArc arc, BytesReader in) throws IOException { + // int pos = address; + // System.out.println(" readFirstTarget follow.target=" + follow.target + " isFinal=" + + // follow.isFinal()); + if (follow.isFinal()) { + // Insert "fake" final first arc: + arc.label = END_LABEL; + arc.output = follow.nextFinalOutput(); + arc.flags = BIT_FINAL_ARC; + if (follow.target() <= 0) { + arc.flags |= BIT_LAST_ARC; + } else { + // NOTE: nextArc is a node (not an address!) in this case: + arc.nextArc = follow.target(); + } + arc.target = FINAL_END_NODE; + arc.nodeFlags = arc.flags; + // System.out.println(" insert isFinal; nextArc=" + follow.target + " isLast=" + + // arc.isLast() + " output=" + outputs.outputToString(arc.output)); + return arc; + } else { + return readFirstRealTargetArc(follow.target(), arc, in); + } + } + + private void readFirstArcInfo(long nodeAddress, PrimitiveLongArc arc, final BytesReader in) + throws IOException { + in.setPosition(nodeAddress); + + byte flags = arc.nodeFlags = in.readByte(); + if (flags == ARCS_FOR_BINARY_SEARCH + || flags == ARCS_FOR_DIRECT_ADDRESSING + || flags == ARCS_FOR_CONTINUOUS) { + // Special arc which is actually a node header for fixed length arcs. + arc.numArcs = in.readVInt(); + arc.bytesPerArc = in.readVInt(); + arc.arcIdx = -1; + if (flags == ARCS_FOR_DIRECT_ADDRESSING) { + readPresenceBytes(arc, in); + arc.firstLabel = readLabel(in); + arc.presenceIndex = -1; + } else if (flags == ARCS_FOR_CONTINUOUS) { + arc.firstLabel = readLabel(in); + } + arc.posArcsStart = in.getPosition(); + } else { + arc.nextArc = nodeAddress; + arc.bytesPerArc = 0; + } + } + + public PrimitiveLongArc readFirstRealTargetArc( + long nodeAddress, PrimitiveLongArc arc, final BytesReader in) throws IOException { + readFirstArcInfo(nodeAddress, arc, in); + return readNextRealArc(arc, in); + } + + /** + * Returns whether arc's target points to a node in expanded format (fixed length + * arcs). + */ + boolean isExpandedTarget(PrimitiveLongArc follow, BytesReader in) throws IOException { + if (!targetHasArcs(follow)) { + return false; + } else { + in.setPosition(follow.target()); + byte flags = in.readByte(); + return flags == ARCS_FOR_BINARY_SEARCH + || flags == ARCS_FOR_DIRECT_ADDRESSING + || flags == ARCS_FOR_CONTINUOUS; + } + } + + /** In-place read; returns the arc. */ + public PrimitiveLongArc readNextArc(PrimitiveLongArc arc, BytesReader in) throws IOException { + if (arc.label() == END_LABEL) { + // This was a fake inserted "final" arc + if (arc.nextArc() <= 0) { + throw new IllegalArgumentException("cannot readNextArc when arc.isLast()=true"); + } + return readFirstRealTargetArc(arc.nextArc(), arc, in); + } else { + return readNextRealArc(arc, in); + } + } + + /** Peeks at next arc's label; does not alter arc. Do not call this if arc.isLast()! */ + int readNextArcLabel(PrimitiveLongArc arc, BytesReader in) throws IOException { + assert !arc.isLast(); + + if (arc.label() == END_LABEL) { + // System.out.println(" nextArc fake " + arc.nextArc); + // Next arc is the first arc of a node. + // Position to read the first arc label. + + in.setPosition(arc.nextArc()); + byte flags = in.readByte(); + if (flags == ARCS_FOR_BINARY_SEARCH + || flags == ARCS_FOR_DIRECT_ADDRESSING + || flags == ARCS_FOR_CONTINUOUS) { + // System.out.println(" nextArc fixed length arc"); + // Special arc which is actually a node header for fixed length arcs. 
+ int numArcs = in.readVInt(); + in.readVInt(); // Skip bytesPerArc. + if (flags == ARCS_FOR_BINARY_SEARCH) { + in.readByte(); // Skip arc flags. + } else if (flags == ARCS_FOR_DIRECT_ADDRESSING) { + in.skipBytes(getNumPresenceBytes(numArcs)); + } // Nothing to do for ARCS_FOR_CONTINUOUS + } + } else { + switch (arc.nodeFlags()) { + case ARCS_FOR_BINARY_SEARCH: + // Point to next arc, -1 to skip arc flags. + in.setPosition(arc.posArcsStart() - (1 + arc.arcIdx()) * (long) arc.bytesPerArc() - 1); + break; + case ARCS_FOR_DIRECT_ADDRESSING: + // Direct addressing node. The label is not stored but rather inferred + // based on first label and arc index in the range. + assert BitTable.assertIsValid(arc, in); + assert BitTable.isBitSet(arc.arcIdx(), arc, in); + int nextIndex = BitTable.nextBitSet(arc.arcIdx(), arc, in); + assert nextIndex != -1; + return arc.firstLabel() + nextIndex; + case ARCS_FOR_CONTINUOUS: + return arc.firstLabel() + arc.arcIdx() + 1; + default: + // Variable length arcs - linear search. + assert arc.bytesPerArc() == 0; + // Arcs have variable length. + // System.out.println(" nextArc real list"); + // Position to next arc, -1 to skip flags. + in.setPosition(arc.nextArc() - 1); + break; + } + } + return readLabel(in); + } + + public PrimitiveLongArc readArcByIndex(PrimitiveLongArc arc, final BytesReader in, int idx) + throws IOException { + assert arc.bytesPerArc() > 0; + assert arc.nodeFlags() == ARCS_FOR_BINARY_SEARCH; + assert idx >= 0 && idx < arc.numArcs(); + in.setPosition(arc.posArcsStart() - idx * (long) arc.bytesPerArc()); + arc.arcIdx = idx; + arc.flags = in.readByte(); + return readArc(arc, in); + } + + /** + * Reads a Continuous node arc, with the provided index in the label range. + * + * @param rangeIndex The index of the arc in the label range. It must be within the label range. + */ + public PrimitiveLongArc readArcByContinuous( + PrimitiveLongArc arc, final BytesReader in, int rangeIndex) throws IOException { + assert rangeIndex >= 0 && rangeIndex < arc.numArcs(); + in.setPosition(arc.posArcsStart() - rangeIndex * (long) arc.bytesPerArc()); + arc.arcIdx = rangeIndex; + arc.flags = in.readByte(); + return readArc(arc, in); + } + + /** + * Reads a present direct addressing node arc, with the provided index in the label range. + * + * @param rangeIndex The index of the arc in the label range. It must be present. The real arc + * offset is computed based on the presence bits of the direct addressing node. + */ + public PrimitiveLongArc readArcByDirectAddressing( + PrimitiveLongArc arc, final BytesReader in, int rangeIndex) throws IOException { + assert BitTable.assertIsValid(arc, in); + assert rangeIndex >= 0 && rangeIndex < arc.numArcs(); + assert BitTable.isBitSet(rangeIndex, arc, in); + int presenceIndex = BitTable.countBitsUpTo(rangeIndex, arc, in); + return readArcByDirectAddressing(arc, in, rangeIndex, presenceIndex); + } + + /** + * Reads a present direct addressing node arc, with the provided index in the label range and its + * corresponding presence index (which is the count of presence bits before it). + */ + private PrimitiveLongArc readArcByDirectAddressing( + PrimitiveLongArc arc, final BytesReader in, int rangeIndex, int presenceIndex) + throws IOException { + in.setPosition(arc.posArcsStart() - presenceIndex * (long) arc.bytesPerArc()); + arc.arcIdx = rangeIndex; + arc.presenceIndex = presenceIndex; + arc.flags = in.readByte(); + return readArc(arc, in); + } + + /** + * Reads the last arc of a direct addressing node. 
This method is equivalent to calling {@link
+ * #readArcByDirectAddressing(PrimitiveLongArc, BytesReader, int)} with {@code rangeIndex} equal
+ * to {@code arc.numArcs() - 1}, but it is faster.
+ */
+ public PrimitiveLongArc readLastArcByDirectAddressing(PrimitiveLongArc arc, final BytesReader in)
+ throws IOException {
+ assert BitTable.assertIsValid(arc, in);
+ int presenceIndex = BitTable.countBits(arc, in) - 1;
+ return readArcByDirectAddressing(arc, in, arc.numArcs() - 1, presenceIndex);
+ }
+
+ /** Reads the last arc of a continuous node. */
+ public PrimitiveLongArc readLastArcByContinuous(PrimitiveLongArc arc, final BytesReader in)
+ throws IOException {
+ return readArcByContinuous(arc, in, arc.numArcs() - 1);
+ }
+
+ /** Never returns null, but you should never call this if arc.isLast() is true. */
+ public PrimitiveLongArc readNextRealArc(PrimitiveLongArc arc, final BytesReader in)
+ throws IOException {
+
+ // TODO: can't assert this because we call from readFirstArc
+ // assert !flag(arc.flags, BIT_LAST_ARC);
+
+ switch (arc.nodeFlags()) {
+ case ARCS_FOR_BINARY_SEARCH:
+ case ARCS_FOR_CONTINUOUS:
+ assert arc.bytesPerArc() > 0;
+ arc.arcIdx++;
+ assert arc.arcIdx() >= 0 && arc.arcIdx() < arc.numArcs();
+ in.setPosition(arc.posArcsStart() - arc.arcIdx() * (long) arc.bytesPerArc());
+ arc.flags = in.readByte();
+ break;
+
+ case ARCS_FOR_DIRECT_ADDRESSING:
+ assert BitTable.assertIsValid(arc, in);
+ assert arc.arcIdx() == -1 || BitTable.isBitSet(arc.arcIdx(), arc, in);
+ int nextIndex = BitTable.nextBitSet(arc.arcIdx(), arc, in);
+ return readArcByDirectAddressing(arc, in, nextIndex, arc.presenceIndex + 1);
+
+ default:
+ // Variable length arcs - linear search.
+ assert arc.bytesPerArc() == 0;
+ in.setPosition(arc.nextArc());
+ arc.flags = in.readByte();
+ }
+ return readArc(arc, in);
+ }
+
+ /**
+ * Reads an arc.
+ * Precondition: The arc flags byte has already been read and set; the given BytesReader is + * positioned just after the arc flags byte. + */ + private PrimitiveLongArc readArc(PrimitiveLongArc arc, BytesReader in) throws IOException { + if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING || arc.nodeFlags() == ARCS_FOR_CONTINUOUS) { + arc.label = arc.firstLabel() + arc.arcIdx(); + } else { + arc.label = readLabel(in); + } + + if (arc.flag(BIT_ARC_HAS_OUTPUT)) { + arc.output = outputs.read(in); + } else { + arc.output = outputs.getNoOutput(); + } + + if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { + arc.nextFinalOutput = outputs.readFinalOutput(in); + } else { + arc.nextFinalOutput = outputs.getNoOutput(); + } + + if (arc.flag(BIT_STOP_NODE)) { + if (arc.flag(BIT_FINAL_ARC)) { + arc.target = FINAL_END_NODE; + } else { + arc.target = NON_FINAL_END_NODE; + } + arc.nextArc = in.getPosition(); // Only useful for list. + } else if (arc.flag(BIT_TARGET_NEXT)) { + arc.nextArc = in.getPosition(); // Only useful for list. + // TODO: would be nice to make this lazy -- maybe + // caller doesn't need the target and is scanning arcs... + if (!arc.flag(BIT_LAST_ARC)) { + if (arc.bytesPerArc() == 0) { + // must scan + seekToNextNode(in); + } else { + int numArcs = + arc.nodeFlags == ARCS_FOR_DIRECT_ADDRESSING + ? BitTable.countBits(arc, in) + : arc.numArcs(); + in.setPosition(arc.posArcsStart() - arc.bytesPerArc() * (long) numArcs); + } + } + arc.target = in.getPosition(); + } else { + arc.target = readUnpackedNodeTarget(in); + arc.nextArc = in.getPosition(); // Only useful for list. + } + return arc; + } + + static PrimitiveLongArc readEndArc(PrimitiveLongArc follow, PrimitiveLongArc arc) { + if (follow.isFinal()) { + if (follow.target() <= 0) { + arc.flags = PrimitiveLongFST.BIT_LAST_ARC; + } else { + arc.flags = 0; + // NOTE: nextArc is a node (not an address!) in this case: + arc.nextArc = follow.target(); + } + arc.output = follow.nextFinalOutput(); + arc.label = PrimitiveLongFST.END_LABEL; + return arc; + } else { + return null; + } + } + + // TODO: could we somehow [partially] tableize arc lookups + // like automaton? + + /** + * Finds an arc leaving the incoming arc, replacing the arc in place. This returns null if the arc + * was not found, else the incoming arc. + */ + public PrimitiveLongArc findTargetArc( + int labelToMatch, PrimitiveLongArc follow, PrimitiveLongArc arc, BytesReader in) + throws IOException { + + if (labelToMatch == END_LABEL) { + if (follow.isFinal()) { + if (follow.target() <= 0) { + arc.flags = BIT_LAST_ARC; + } else { + arc.flags = 0; + // NOTE: nextArc is a node (not an address!) in this case: + arc.nextArc = follow.target(); + } + arc.output = follow.nextFinalOutput(); + arc.label = END_LABEL; + arc.nodeFlags = arc.flags; + return arc; + } else { + return null; + } + } + + if (!targetHasArcs(follow)) { + return null; + } + + in.setPosition(follow.target()); + + // System.out.println("fta label=" + (char) labelToMatch); + + byte flags = arc.nodeFlags = in.readByte(); + if (flags == ARCS_FOR_DIRECT_ADDRESSING) { + arc.numArcs = in.readVInt(); // This is in fact the label range. + arc.bytesPerArc = in.readVInt(); + readPresenceBytes(arc, in); + arc.firstLabel = readLabel(in); + arc.posArcsStart = in.getPosition(); + + int arcIndex = labelToMatch - arc.firstLabel(); + if (arcIndex < 0 || arcIndex >= arc.numArcs()) { + return null; // Before or after label range. + } else if (!BitTable.isBitSet(arcIndex, arc, in)) { + return null; // Arc missing in the range. 
+ } + return readArcByDirectAddressing(arc, in, arcIndex); + } else if (flags == ARCS_FOR_BINARY_SEARCH) { + arc.numArcs = in.readVInt(); + arc.bytesPerArc = in.readVInt(); + arc.posArcsStart = in.getPosition(); + + // Array is sparse; do binary search: + int low = 0; + int high = arc.numArcs() - 1; + while (low <= high) { + // System.out.println(" cycle"); + int mid = (low + high) >>> 1; + // +1 to skip over flags + in.setPosition(arc.posArcsStart() - (arc.bytesPerArc() * mid + 1)); + int midLabel = readLabel(in); + final int cmp = midLabel - labelToMatch; + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + arc.arcIdx = mid - 1; + // System.out.println(" found!"); + return readNextRealArc(arc, in); + } + } + return null; + } else if (flags == ARCS_FOR_CONTINUOUS) { + arc.numArcs = in.readVInt(); + arc.bytesPerArc = in.readVInt(); + arc.firstLabel = readLabel(in); + arc.posArcsStart = in.getPosition(); + int arcIndex = labelToMatch - arc.firstLabel(); + if (arcIndex < 0 || arcIndex >= arc.numArcs()) { + return null; // Before or after label range. + } + arc.arcIdx = arcIndex - 1; + return readNextRealArc(arc, in); + } + + // Linear scan + readFirstArcInfo(follow.target(), arc, in); + in.setPosition(arc.nextArc()); + while (true) { + assert arc.bytesPerArc() == 0; + flags = arc.flags = in.readByte(); + long pos = in.getPosition(); + int label = readLabel(in); + if (label == labelToMatch) { + in.setPosition(pos); + return readArc(arc, in); + } else if (label > labelToMatch) { + return null; + } else if (arc.isLast()) { + return null; + } else { + if (flag(flags, BIT_ARC_HAS_OUTPUT)) { + outputs.skipOutput(in); + } + if (flag(flags, BIT_ARC_HAS_FINAL_OUTPUT)) { + outputs.skipFinalOutput(in); + } + if (flag(flags, BIT_STOP_NODE) == false && flag(flags, BIT_TARGET_NEXT) == false) { + readUnpackedNodeTarget(in); + } + } + } + } + + private void seekToNextNode(BytesReader in) throws IOException { + + while (true) { + + final int flags = in.readByte(); + readLabel(in); + + if (flag(flags, BIT_ARC_HAS_OUTPUT)) { + outputs.skipOutput(in); + } + + if (flag(flags, BIT_ARC_HAS_FINAL_OUTPUT)) { + outputs.skipFinalOutput(in); + } + + if (flag(flags, BIT_STOP_NODE) == false && flag(flags, BIT_TARGET_NEXT) == false) { + readUnpackedNodeTarget(in); + } + + if (flag(flags, BIT_LAST_ARC)) { + return; + } + } + } + + /** Returns a {@link BytesReader} for this FST, positioned at position 0. 
*/ + public BytesReader getBytesReader() { + return fstReader.getReverseBytesReader(); + } + + /** Represent the FST metadata */ + public static final class PrimitiveLongFSTMetadata { + final INPUT_TYPE inputType; + final PrimitiveLongFSTOutputs outputs; + final int version; + // if non-null, this FST accepts the empty string and + // produces this output + Long emptyOutput; + long startNode; + long numBytes; + + public PrimitiveLongFSTMetadata( + INPUT_TYPE inputType, + PrimitiveLongFSTOutputs outputs, + Long emptyOutput, + long startNode, + int version, + long numBytes) { + this.inputType = inputType; + this.outputs = outputs; + this.emptyOutput = emptyOutput; + this.startNode = startNode; + this.version = version; + this.numBytes = numBytes; + } + } + + public static class PrimitiveLongFSTOutputs { + + private static final long NO_OUTPUT = 0L; + + private static final PrimitiveLongFSTOutputs singleton = new PrimitiveLongFSTOutputs(); + + private PrimitiveLongFSTOutputs() {} + + public static PrimitiveLongFSTOutputs getSingleton() { + return singleton; + } + + public long common(long output1, long output2) { + assert valid(output1); + assert valid(output2); + if (output1 == NO_OUTPUT || output2 == NO_OUTPUT) { + return NO_OUTPUT; + } else { + assert output1 > 0; + assert output2 > 0; + return Math.min(output1, output2); + } + } + + public long subtract(long output, long inc) { + assert valid(output); + assert valid(inc); + assert output >= inc; + + if (inc == NO_OUTPUT) { + return output; + } else if (output == inc) { + return NO_OUTPUT; + } else { + return output - inc; + } + } + + public long add(long prefix, long output) { + assert valid(prefix); + assert valid(output); + if (prefix == NO_OUTPUT) { + return output; + } else if (output == NO_OUTPUT) { + return prefix; + } else { + return prefix + output; + } + } + + public void write(long output, DataOutput out) throws IOException { + assert valid(output); + out.writeVLong(output); + } + + public long read(DataInput in) throws IOException { + long v = in.readVLong(); + if (v == 0) { + return NO_OUTPUT; + } else { + return v; + } + } + + private boolean valid(long o) { + assert o == NO_OUTPUT || o > 0 : "o=" + o; + return true; + } + + public long getNoOutput() { + return NO_OUTPUT; + } + + public String outputToString(long output) { + return Long.toString(output); + } + + public String toString() { + return "PrimitiveLongFSTOutputs"; + } + + public long ramBytesUsed(Long output) { + return RamUsageEstimator.sizeOf(output); + } + + public void skipOutput(BytesReader in) throws IOException { + read(in); + } + + public void skipFinalOutput(BytesReader in) throws IOException { + read(in); + } + + public long readFinalOutput(BytesReader in) throws IOException { + return read(in); + } + + public void writeFinalOutput(long output, DataOutput out) throws IOException { + write(output, out); + } + } + + public static long get(PrimitiveLongFST primitiveLongFST, BytesRef input) throws IOException { + assert primitiveLongFST.metadata.inputType == PrimitiveLongFST.INPUT_TYPE.BYTE1; + + final BytesReader fstReader = primitiveLongFST.getBytesReader(); + + // TODO: would be nice not to alloc this on every lookup + final PrimitiveLongArc arc = primitiveLongFST.getFirstArc(new PrimitiveLongArc()); + + // Accumulate output as we go + long output = primitiveLongFST.outputs.getNoOutput(); + for (int i = 0; i < input.length; i++) { + if (primitiveLongFST.findTargetArc(input.bytes[i + input.offset] & 0xFF, arc, arc, fstReader) + == null) { + return -1; + } + 
output = primitiveLongFST.outputs.add(output, arc.output()); + } + + if (arc.isFinal()) { + return primitiveLongFST.outputs.add(output, arc.nextFinalOutput()); + } else { + return -1; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTEnum.java b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTEnum.java new file mode 100644 index 000000000000..b2fa07b23617 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTEnum.java @@ -0,0 +1,756 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util.fst; + +import static org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongArc.BitTable; + +import java.io.IOException; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongArc; + +/** + * Can next() and advance() through the terms in an {@link PrimitiveLongFST} + * + * @lucene.experimental + */ +abstract class PrimitiveLongFSTEnum { + protected final PrimitiveLongFST fst; + + protected PrimitiveLongArc[] arcs = new PrimitiveLongArc[10]; + + protected long[] output = new long[10]; + + protected final long NO_OUTPUT; + protected final FST.BytesReader fstReader; + + protected int upto; + int targetLength; + + /** + * doFloor controls the behavior of advance: if it's true doFloor is true, advance positions to + * the biggest term before target. 
+ */ + PrimitiveLongFSTEnum(PrimitiveLongFST fst) { + this.fst = fst; + fstReader = fst.getBytesReader(); + NO_OUTPUT = fst.outputs.getNoOutput(); + fst.getFirstArc(getArc(0)); + output[0] = NO_OUTPUT; + } + + protected abstract int getTargetLabel(); + + protected abstract int getCurrentLabel(); + + protected abstract void setCurrentLabel(int label); + + protected abstract void grow(); + + /** Rewinds enum state to match the shared prefix between current term and target term */ + private void rewindPrefix() throws IOException { + if (upto == 0) { + // System.out.println(" init"); + upto = 1; + fst.readFirstTargetArc(getArc(0), getArc(1), fstReader); + return; + } + // System.out.println(" rewind upto=" + upto + " vs targetLength=" + targetLength); + + final int currentLimit = upto; + upto = 1; + while (upto < currentLimit && upto <= targetLength + 1) { + final int cmp = getCurrentLabel() - getTargetLabel(); + if (cmp < 0) { + // seek forward + // System.out.println(" seek fwd"); + break; + } else if (cmp > 0) { + // seek backwards -- reset this arc to the first arc + final PrimitiveLongArc arc = getArc(upto); + fst.readFirstTargetArc(getArc(upto - 1), arc, fstReader); + // System.out.println(" seek first arc"); + break; + } + upto++; + } + // System.out.println(" fall through upto=" + upto); + } + + protected void doNext() throws IOException { + // System.out.println("FE: next upto=" + upto); + if (upto == 0) { + // System.out.println(" init"); + upto = 1; + fst.readFirstTargetArc(getArc(0), getArc(1), fstReader); + } else { + // pop + // System.out.println(" check pop curArc target=" + arcs[upto].target + " label=" + + // arcs[upto].label + " isLast?=" + arcs[upto].isLast()); + while (arcs[upto].isLast()) { + upto--; + if (upto == 0) { + // System.out.println(" eof"); + return; + } + } + fst.readNextArc(arcs[upto], fstReader); + } + + pushFirst(); + } + + // TODO: should we return a status here (SEEK_FOUND / SEEK_NOT_FOUND / + // SEEK_END)? saves the eq check above? + + /** Seeks to smallest term that's >= target. */ + protected void doSeekCeil() throws IOException { + + // System.out.println(" advance len=" + target.length + " curlen=" + current.length); + + // TODO: possibly caller could/should provide common + // prefix length? 
ie this work may be redundant if + // caller is in fact intersecting against its own + // automaton + + // System.out.println("FE.seekCeil upto=" + upto); + + // Save time by starting at the end of the shared prefix + // b/w our current term & the target: + rewindPrefix(); + // System.out.println(" after rewind upto=" + upto); + + PrimitiveLongArc arc = getArc(upto); + // System.out.println(" init targetLabel=" + targetLabel); + + // Now scan forward, matching the new suffix of the target + while (arc != null) { + int targetLabel = getTargetLabel(); + // System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) + // arc.label + ") vs targetLabel=" + targetLabel); + if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) { + // Arcs are in an array + final FST.BytesReader in = fst.getBytesReader(); + if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) { + arc = doSeekCeilArrayDirectAddressing(arc, targetLabel, in); + } else if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) { + arc = doSeekCeilArrayPacked(arc, targetLabel, in); + } else { + assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS; + arc = doSeekCeilArrayContinuous(arc, targetLabel, in); + } + } else { + arc = doSeekCeilList(arc, targetLabel); + } + } + } + + private PrimitiveLongArc doSeekCeilArrayContinuous( + final PrimitiveLongArc arc, final int targetLabel, final FST.BytesReader in) + throws IOException { + int targetIndex = targetLabel - arc.firstLabel(); + if (targetIndex >= arc.numArcs()) { + rollbackToLastForkThenPush(); + return null; + } else { + if (targetIndex < 0) { + fst.readArcByContinuous(arc, in, 0); + assert arc.label() > targetLabel; + pushFirst(); + return null; + } else { + fst.readArcByContinuous(arc, in, targetIndex); + assert arc.label() == targetLabel; + // found -- copy pasta from below + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } + } + } + + private PrimitiveLongArc doSeekCeilArrayDirectAddressing( + final PrimitiveLongArc arc, final int targetLabel, final FST.BytesReader in) + throws IOException { + // The array is addressed directly by label, with presence bits to compute the actual arc + // offset. + + int targetIndex = targetLabel - arc.firstLabel(); + if (targetIndex >= arc.numArcs()) { + rollbackToLastForkThenPush(); + return null; + } else { + if (targetIndex < 0) { + targetIndex = -1; + } else if (BitTable.isBitSet(targetIndex, arc, in)) { + fst.readArcByDirectAddressing(arc, in, targetIndex); + assert arc.label() == targetLabel; + // found -- copy pasta from below + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } + // Not found, return the next arc (ceil). + int ceilIndex = BitTable.nextBitSet(targetIndex, arc, in); + assert ceilIndex != -1; + fst.readArcByDirectAddressing(arc, in, ceilIndex); + assert arc.label() > targetLabel; + pushFirst(); + return null; + } + } + + private PrimitiveLongArc doSeekCeilArrayPacked( + final PrimitiveLongArc arc, final int targetLabel, final FST.BytesReader in) + throws IOException { + // The array is packed -- use binary search to find the target. 
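+ // Note (assumed convention, consistent with the decoding below): Util.binarySearch returns
+ // the matching arc index when found, otherwise (-1 - insertionPoint), which the not-found
+ // branch below unwraps via idx = -1 - idx.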
+ int idx = Util.binarySearch(fst, arc, targetLabel); + if (idx >= 0) { + // Match + fst.readArcByIndex(arc, in, idx); + assert arc.arcIdx() == idx; + assert arc.label() == targetLabel + : "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel + " mid=" + idx; + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } + idx = -1 - idx; + if (idx == arc.numArcs()) { + // Dead end + fst.readArcByIndex(arc, in, idx - 1); + assert arc.isLast(); + // Dead end (target is after the last arc); + // rollback to last fork then push + upto--; + while (true) { + if (upto == 0) { + return null; + } + final PrimitiveLongArc prevArc = getArc(upto); + // System.out.println(" rollback upto=" + upto + " arc.label=" + prevArc.label + " + // isLast?=" + prevArc.isLast()); + if (!prevArc.isLast()) { + fst.readNextArc(prevArc, fstReader); + pushFirst(); + return null; + } + upto--; + } + } else { + // Ceiling - arc with least higher label + fst.readArcByIndex(arc, in, idx); + assert arc.label() > targetLabel; + pushFirst(); + return null; + } + } + + private PrimitiveLongArc doSeekCeilList(final PrimitiveLongArc arc, final int targetLabel) + throws IOException { + // Arcs are not array'd -- must do linear scan: + if (arc.label() == targetLabel) { + // recurse + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } else if (arc.label() > targetLabel) { + pushFirst(); + return null; + } else if (arc.isLast()) { + // Dead end (target is after the last arc); + // rollback to last fork then push + upto--; + while (true) { + if (upto == 0) { + return null; + } + final PrimitiveLongArc prevArc = getArc(upto); + // System.out.println(" rollback upto=" + upto + " arc.label=" + prevArc.label + " + // isLast?=" + prevArc.isLast()); + if (!prevArc.isLast()) { + fst.readNextArc(prevArc, fstReader); + pushFirst(); + return null; + } + upto--; + } + } else { + // keep scanning + // System.out.println(" next scan"); + fst.readNextArc(arc, fstReader); + } + return arc; + } + + // Todo: should we return a status here (SEEK_FOUND / SEEK_NOT_FOUND / + // SEEK_END)? saves the eq check above? + /** Seeks to largest term that's <= target. */ + void doSeekFloor() throws IOException { + + // TODO: possibly caller could/should provide common + // prefix length? 
ie this work may be redundant if + // caller is in fact intersecting against its own + // automaton + // System.out.println("FE: seek floor upto=" + upto); + + // Save CPU by starting at the end of the shared prefix + // b/w our current term & the target: + rewindPrefix(); + + // System.out.println("FE: after rewind upto=" + upto); + + PrimitiveLongArc arc = getArc(upto); + + // System.out.println("FE: init targetLabel=" + targetLabel); + + // Now scan forward, matching the new suffix of the target + while (arc != null) { + // System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) + // arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast() + " bba=" + + // arc.bytesPerArc); + int targetLabel = getTargetLabel(); + + if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) { + // Arcs are in an array + final FST.BytesReader in = fst.getBytesReader(); + if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) { + arc = doSeekFloorArrayDirectAddressing(arc, targetLabel, in); + } else if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) { + arc = doSeekFloorArrayPacked(arc, targetLabel, in); + } else { + assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS; + arc = doSeekFloorContinuous(arc, targetLabel, in); + } + } else { + arc = doSeekFloorList(arc, targetLabel); + } + } + } + + private PrimitiveLongArc doSeekFloorContinuous( + PrimitiveLongArc arc, int targetLabel, FST.BytesReader in) throws IOException { + int targetIndex = targetLabel - arc.firstLabel(); + if (targetIndex < 0) { + // Before first arc. + return backtrackToFloorArc(arc, targetLabel, in); + } else if (targetIndex >= arc.numArcs()) { + // After last arc. + fst.readLastArcByContinuous(arc, in); + assert arc.label() < targetLabel; + assert arc.isLast(); + pushLast(); + return null; + } else { + // Within label range. + fst.readArcByContinuous(arc, in, targetIndex); + assert arc.label() == targetLabel; + // found -- copy pasta from below + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } + } + + private PrimitiveLongArc doSeekFloorArrayDirectAddressing( + PrimitiveLongArc arc, int targetLabel, FST.BytesReader in) throws IOException { + // The array is addressed directly by label, with presence bits to compute the actual arc + // offset. + + int targetIndex = targetLabel - arc.firstLabel(); + if (targetIndex < 0) { + // Before first arc. + return backtrackToFloorArc(arc, targetLabel, in); + } else if (targetIndex >= arc.numArcs()) { + // After last arc. + fst.readLastArcByDirectAddressing(arc, in); + assert arc.label() < targetLabel; + assert arc.isLast(); + pushLast(); + return null; + } else { + // Within label range. + if (BitTable.isBitSet(targetIndex, arc, in)) { + fst.readArcByDirectAddressing(arc, in, targetIndex); + assert arc.label() == targetLabel; + // found -- copy pasta from below + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } + // Scan backwards to find a floor arc. 
+ int floorIndex = BitTable.previousBitSet(targetIndex, arc, in);
+ assert floorIndex != -1;
+ fst.readArcByDirectAddressing(arc, in, floorIndex);
+ assert arc.label() < targetLabel;
+ assert arc.isLast() || fst.readNextArcLabel(arc, in) > targetLabel;
+ pushLast();
+ return null;
+ }
+ }
+
+ /**
+ * Target is beyond the last arc, out of label range. Dead end (target is after the last arc);
+ * rollback to last fork then push.
+ */
+ private void rollbackToLastForkThenPush() throws IOException {
+ upto--;
+ while (true) {
+ if (upto == 0) {
+ return;
+ }
+ final PrimitiveLongArc prevArc = getArc(upto);
+ // System.out.println(" rollback upto=" + upto + " arc.label=" + prevArc.label + "
+ // isLast?=" + prevArc.isLast());
+ if (!prevArc.isLast()) {
+ fst.readNextArc(prevArc, fstReader);
+ pushFirst();
+ return;
+ }
+ upto--;
+ }
+ }
+
+ /**
+ * Backtracks until it finds a node whose first arc is before our target label. Then on the node,
+ * finds the arc just before the targetLabel.
+ *
+ * @return null to continue the seek floor recursion loop.
+ */
+ private PrimitiveLongArc backtrackToFloorArc(
+ PrimitiveLongArc arc, int targetLabel, final FST.BytesReader in) throws IOException {
+ while (true) {
+ // First, walk backwards until we find a node whose first arc is before our target label.
+ fst.readFirstTargetArc(getArc(upto - 1), arc, fstReader);
+ if (arc.label() < targetLabel) {
+ // Then on this node, find the arc just before the targetLabel.
+ if (!arc.isLast()) {
+ if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) {
+ if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
+ findNextFloorArcBinarySearch(arc, targetLabel, in);
+ } else if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
+ findNextFloorArcDirectAddressing(arc, targetLabel, in);
+ } else {
+ assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS;
+ findNextFloorArcContinuous(arc, targetLabel, in);
+ }
+ } else {
+ while (!arc.isLast() && fst.readNextArcLabel(arc, in) < targetLabel) {
+ fst.readNextArc(arc, fstReader);
+ }
+ }
+ }
+ assert arc.label() < targetLabel;
+ assert arc.isLast() || fst.readNextArcLabel(arc, in) >= targetLabel;
+ pushLast();
+ return null;
+ }
+ upto--;
+ if (upto == 0) {
+ return null;
+ }
+ targetLabel = getTargetLabel();
+ arc = getArc(upto);
+ }
+ }
+
+ /**
+ * Finds and reads an arc on the current node whose label is strictly less than the given label.
+ * Skips the first arc, finds next floor arc; or none if the floor arc is the first arc itself (in
+ * this case it has already been read).
+ *

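
The direct-addressing branches above depend on per-label presence bits: the node
covers a dense label range, only some labels actually have arcs, and
BitTable.previousBitSet finds the nearest present label at or below the target.
A sketch of the same floor lookup, using java.util.BitSet as a stand-in for the
FST's on-disk BitTable (labels hypothetical):

    import java.util.BitSet;

    public class DirectAddressingFloorSketch {
      public static void main(String[] args) {
        // Hypothetical node covering labels 'a'..'h', with arcs only at a, c, h.
        BitSet presenceBits = new BitSet(8);
        presenceBits.set('a' - 'a');
        presenceBits.set('c' - 'a');
        presenceBits.set('h' - 'a');

        int targetIndex = 'e' - 'a'; // seeking 'e', which has no arc
        if (presenceBits.get(targetIndex)) {
          System.out.println("exact match at index " + targetIndex);
        } else {
          // Counterpart of BitTable.previousBitSet: nearest set bit at or before target.
          int floorIndex = presenceBits.previousSetBit(targetIndex);
          System.out.println(
              "floor arc index " + floorIndex + ", label '" + (char) ('a' + floorIndex) + "'");
        }
      }
    }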
+
+  /**
+   * Finds and reads an arc on the current node whose label is strictly less than the given label.
+   * Skips the first arc and finds the next floor arc; or none if the floor arc is the first arc
+   * itself (in this case it has already been read).
+   *
+   * <p>Precondition: the given arc is the first arc of the node.
+   */
+  private void findNextFloorArcDirectAddressing(
+      PrimitiveLongArc arc, int targetLabel, final FST.BytesReader in) throws IOException {
+    assert arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING;
+    assert arc.label() != FST.END_LABEL;
+    assert arc.label() == arc.firstLabel();
+    if (arc.numArcs() > 1) {
+      int targetIndex = targetLabel - arc.firstLabel();
+      assert targetIndex >= 0;
+      if (targetIndex >= arc.numArcs()) {
+        // Beyond last arc. Take last arc.
+        fst.readLastArcByDirectAddressing(arc, in);
+      } else {
+        // Take the preceding arc, even if the target is present.
+        int floorIndex = BitTable.previousBitSet(targetIndex, arc, in);
+        if (floorIndex > 0) {
+          fst.readArcByDirectAddressing(arc, in, floorIndex);
+        }
+      }
+    }
+  }
+
+  /** Same as {@link #findNextFloorArcDirectAddressing} for a continuous node. */
+  private void findNextFloorArcContinuous(
+      PrimitiveLongArc arc, int targetLabel, final FST.BytesReader in) throws IOException {
+    assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS;
+    assert arc.label() != FST.END_LABEL;
+    assert arc.label() == arc.firstLabel();
+    if (arc.numArcs() > 1) {
+      int targetIndex = targetLabel - arc.firstLabel();
+      assert targetIndex >= 0;
+      if (targetIndex >= arc.numArcs()) {
+        // Beyond last arc. Take last arc.
+        fst.readLastArcByContinuous(arc, in);
+      } else {
+        fst.readArcByContinuous(arc, in, targetIndex - 1);
+      }
+    }
+  }
+
+  /** Same as {@link #findNextFloorArcDirectAddressing} for a binary search node. */
+  private void findNextFloorArcBinarySearch(
+      PrimitiveLongArc arc, int targetLabel, FST.BytesReader in) throws IOException {
+    assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH;
+    assert arc.label() != FST.END_LABEL;
+    assert arc.arcIdx() == 0;
+    if (arc.numArcs() > 1) {
+      int idx = Util.binarySearch(fst, arc, targetLabel);
+      assert idx != -1;
+      if (idx > 1) {
+        fst.readArcByIndex(arc, in, idx - 1);
+      } else if (idx < -2) {
+        fst.readArcByIndex(arc, in, -2 - idx);
+      }
+    }
+  }
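
findNextFloorArcBinarySearch above and doSeekFloorArrayPacked below both decode
Util.binarySearch's miss encoding, -1 - insertionPoint, which is the same
contract as java.util.Arrays.binarySearch; -2 - idx therefore recovers the index
just below the insertion point. A worked sketch with plain arrays:

    import java.util.Arrays;

    public class FloorDecodingSketch {
      public static void main(String[] args) {
        int[] labels = {'a', 'c', 'f', 'k'}; // sorted arc labels (hypothetical)
        int idx = Arrays.binarySearch(labels, 'd'); // miss: -1 - insertionPoint = -3
        int floorIdx = -2 - idx; // insertionPoint - 1 == 1, i.e. the arc 'c'
        System.out.println("floor label: " + (char) labels[floorIdx]); // prints c
      }
    }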
+
+  private PrimitiveLongArc doSeekFloorArrayPacked(
+      PrimitiveLongArc arc, int targetLabel, final FST.BytesReader in) throws IOException {
+    // Arcs are fixed array -- use binary search to find the target.
+    int idx = Util.binarySearch(fst, arc, targetLabel);
+
+    if (idx >= 0) {
+      // Match -- recurse
+      // System.out.println("  match! arcIdx=" + idx);
+      fst.readArcByIndex(arc, in, idx);
+      assert arc.arcIdx() == idx;
+      assert arc.label() == targetLabel
+          : "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel + " mid=" + idx;
+      output[upto] = fst.outputs.add(output[upto - 1], arc.output());
+      if (targetLabel == FST.END_LABEL) {
+        return null;
+      }
+      setCurrentLabel(arc.label());
+      incr();
+      return fst.readFirstTargetArc(arc, getArc(upto), fstReader);
+    } else if (idx == -1) {
+      // Before first arc.
+      return backtrackToFloorArc(arc, targetLabel, in);
+    } else {
+      // There is a floor arc; idx will be (-1 - (floor + 1)).
+      fst.readArcByIndex(arc, in, -2 - idx);
+      assert arc.isLast() || fst.readNextArcLabel(arc, in) > targetLabel;
+      assert arc.label() < targetLabel
+          : "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel;
+      pushLast();
+      return null;
+    }
+  }
+
+  private PrimitiveLongArc doSeekFloorList(PrimitiveLongArc arc, int targetLabel)
+      throws IOException {
+    if (arc.label() == targetLabel) {
+      // Match -- recurse
+      output[upto] = fst.outputs.add(output[upto - 1], arc.output());
+      if (targetLabel == FST.END_LABEL) {
+        return null;
+      }
+      setCurrentLabel(arc.label());
+      incr();
+      return fst.readFirstTargetArc(arc, getArc(upto), fstReader);
+    } else if (arc.label() > targetLabel) {
+      // TODO: if each arc could somehow read the arc just
+      // before, we can save this re-scan. The ceil case
+      // doesn't need this because it reads the next arc
+      // instead:
+      while (true) {
+        // First, walk backwards until we find a first arc
+        // that's before our target label:
+        fst.readFirstTargetArc(getArc(upto - 1), arc, fstReader);
+        if (arc.label() < targetLabel) {
+          // Then, scan forwards to the arc just before
+          // the targetLabel:
+          while (!arc.isLast() && fst.readNextArcLabel(arc, fstReader) < targetLabel) {
+            fst.readNextArc(arc, fstReader);
+          }
+          pushLast();
+          return null;
+        }
+        upto--;
+        if (upto == 0) {
+          return null;
+        }
+        targetLabel = getTargetLabel();
+        arc = getArc(upto);
+      }
+    } else if (!arc.isLast()) {
+      // System.out.println("  check next label=" + fst.readNextArcLabel(arc) + " (" + (char)
+      // fst.readNextArcLabel(arc) + ")");
+      if (fst.readNextArcLabel(arc, fstReader) > targetLabel) {
+        pushLast();
+        return null;
+      } else {
+        // keep scanning
+        return fst.readNextArc(arc, fstReader);
+      }
+    } else {
+      pushLast();
+      return null;
+    }
+  }
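
Every doSeekFloor* variant preserves the same contract: if the target term is
absent, the enum ends up on the greatest term less than or equal to it. The
existing boxed BytesRefFSTEnum exposes the same contract, so it can serve as a
reference for the behavior this primitive port must match (a minimal sketch;
the builder usage mirrors the test added later in this patch):

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.IntsRefBuilder;
    import org.apache.lucene.util.fst.BytesRefFSTEnum;
    import org.apache.lucene.util.fst.FST;
    import org.apache.lucene.util.fst.FSTCompiler;
    import org.apache.lucene.util.fst.PositiveIntOutputs;
    import org.apache.lucene.util.fst.Util;

    public class SeekFloorSketch {
      public static void main(String[] args) throws Exception {
        FSTCompiler<Long> c =
            new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton())
                .build();
        IntsRefBuilder scratch = new IntsRefBuilder();
        c.add(Util.toIntsRef(new BytesRef("apple"), scratch), 1L); // terms must be added
        c.add(Util.toIntsRef(new BytesRef("cherry"), scratch), 2L); // in sorted order
        BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(c.compile());
        // "banana" is absent; seekFloor lands on the greatest term <= "banana".
        BytesRefFSTEnum.InputOutput<Long> io = fstEnum.seekFloor(new BytesRef("banana"));
        System.out.println(io.input.utf8ToString() + " -> " + io.output); // apple -> 1
      }
    }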
"-1" : (char) targetLabel)); + final PrimitiveLongArc nextArc = fst.findTargetArc(targetLabel, arc, getArc(upto), fstReader); + if (nextArc == null) { + // short circuit + // upto--; + // upto = 0; + fst.readFirstTargetArc(arc, getArc(upto), fstReader); + // System.out.println(" no match upto=" + upto); + return false; + } + // Match -- recurse: + output[upto] = fst.outputs.add(output[upto - 1], nextArc.output()); + if (targetLabel == FST.END_LABEL) { + // System.out.println(" return found; upto=" + upto + " output=" + output[upto] + " + // nextArc=" + nextArc.isLast()); + return true; + } + setCurrentLabel(targetLabel); + incr(); + targetLabel = getTargetLabel(); + arc = nextArc; + } + } + + private void incr() { + upto++; + grow(); + if (arcs.length <= upto) { + final PrimitiveLongArc[] newArcs = + new PrimitiveLongArc + [ArrayUtil.oversize(1 + upto, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(arcs, 0, newArcs, 0, arcs.length); + arcs = newArcs; + } + if (output.length <= upto) { + final long[] newOutput = + new long[ArrayUtil.oversize(1 + upto, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(output, 0, newOutput, 0, output.length); + output = newOutput; + } + } + + // Appends current arc, and then recurses from its target, + // appending first arc all the way to the final node + private void pushFirst() throws IOException { + + PrimitiveLongArc arc = arcs[upto]; + assert arc != null; + + while (true) { + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (arc.label() == FST.END_LABEL) { + // Final node + break; + } + // System.out.println(" pushFirst label=" + (char) arc.label + " upto=" + upto + " output=" + + // fst.outputs.outputToString(output[upto])); + setCurrentLabel(arc.label()); + incr(); + + final PrimitiveLongArc nextArc = getArc(upto); + fst.readFirstTargetArc(arc, nextArc, fstReader); + arc = nextArc; + } + } + + // Recurses from current arc, appending last arc all the + // way to the first final node + private void pushLast() throws IOException { + + PrimitiveLongArc arc = arcs[upto]; + assert arc != null; + + while (true) { + setCurrentLabel(arc.label()); + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (arc.label() == FST.END_LABEL) { + // Final node + break; + } + incr(); + + arc = fst.readLastTargetArc(arc, getArc(upto), fstReader); + } + } + + private PrimitiveLongArc getArc(int idx) { + if (arcs[idx] == null) { + arcs[idx] = new PrimitiveLongArc(); + } + return arcs[idx]; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTIntersectEnum.java b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTIntersectEnum.java new file mode 100644 index 000000000000..d7ca07581446 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTIntersectEnum.java @@ -0,0 +1,383 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTIntersectEnum.java b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTIntersectEnum.java
new file mode 100644
index 000000000000..d7ca07581446
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTIntersectEnum.java
@@ -0,0 +1,383 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.fst;
+
+import java.io.IOException;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.automaton.ByteRunnable;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
+import org.apache.lucene.util.automaton.Transition;
+import org.apache.lucene.util.automaton.TransitionAccessor;
+import org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongArc;
+
+/**
+ * Can next() through the terms defined by the intersection of a {@link PrimitiveLongFST} and a
+ * {@link org.apache.lucene.util.automaton.CompiledAutomaton}.
+ *
+ * <p>Note: this can only seek forward.
+ *
+ * @lucene.experimental
+ */
+public final class PrimitiveLongFSTIntersectEnum {
+
+  private final PrimitiveLongFST fst;
+
+  private final FST.BytesReader fstBytesReader;
+
+  private final ByteRunnable byteRunnable;
+
+  private final TransitionAccessor transitionAccessor;
+
+  /** DFS traversal state: the current depth in the stack of frames. */
+  private int currentLevel;
+
+  private Frame[] stack;
+
+  private BytesRefBuilder term = new BytesRefBuilder();
+
+  private long fstOutput;
+
+  boolean pending;
+
+  boolean isEmptyValidOutput;
+
+  public PrimitiveLongFSTIntersectEnum(
+      PrimitiveLongFST fst, CompiledAutomaton automaton, BytesRef startTerm) throws IOException {
+    this.fst = fst;
+    this.fstBytesReader = fst.getBytesReader();
+    this.byteRunnable = automaton.getByteRunnable();
+    this.transitionAccessor = automaton.getTransitionAccessor();
+    this.stack = new Frame[16];
+
+    var firstFrame = new Frame();
+    firstFrame.fstNode = new PrimitiveLongArc();
+    fst.getFirstArc(firstFrame.fstNode);
+    firstFrame.fsaState = 0;
+    stack[0] = firstFrame;
+
+    if (startTerm != null) {
+      seekToStartTerm(startTerm);
+    } else {
+      isEmptyValidOutput = isAccept(firstFrame.fstNode, firstFrame.fsaState);
+    }
+  }
+
+  public boolean next() throws IOException {
+    if (isEmptyValidOutput) {
+      fstOutput = fst.getEmptyOutput();
+      isEmptyValidOutput = false;
+      return true;
+    }
+    while (currentLevel >= 0) {
+      Frame currentFrame = stack[currentLevel];
+
+      if (!currentFrame.isFresh || hasDescendants(currentFrame.fstNode, currentFrame.fsaState)) {
+        // current frame has candidates
+        if (findNextIntersection(currentFrame)) {
+          term.grow(currentLevel + 1);
+          term.setByteAt(currentLevel, (byte) currentFrame.fstCandidateNode.label());
+          term.setLength(currentLevel + 1);
+          // early prune - only push a new frame when the candidate has descendants
+          if (hasDescendants(currentFrame.fstCandidateNode, currentFrame.fsaTransition.dest)) {
+            fillNextFrame(currentFrame);
+          }
+          // setup output
+          if (isAccept(currentFrame.fstCandidateNode, currentFrame.fsaTransition.dest)) {
+            fstOutput =
+                currentFrame.output // output before this node
+                    + currentFrame.fstNode.output() // output of this node
+                    // then output of the candidate
+                    + currentFrame.fstCandidateNode.output()
+                    + currentFrame.fstCandidateNode.nextFinalOutput();
+            return true;
+          }
+        } else {
+          // no more intersections at this frame, pop it
+          popFrame();
+        }
+      } else {
+        // pop the frame as it has no candidates
+        popFrame();
+      }
+    }
+    return false;
+  }
+
+  private void ensureStackCapacity() {
+    stack = ArrayUtil.grow(stack, currentLevel + 2);
+  }
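
next() above is an iterative DFS over an explicit stack of frames, so the
traversal can stop after emitting a term and pick up where it left off on the
following call. A heavily reduced sketch of that traversal shape over a
hard-coded toy trie (no FST, no automaton; purely illustrative):

    import java.util.ArrayDeque;
    import java.util.Map;

    public class ExplicitStackDfsSketch {
      public static void main(String[] args) {
        // Hypothetical trie: "" has children a, b; "a" has child x.
        Map<String, String> children = Map.of("", "ab", "a", "x", "ax", "", "b", "");
        ArrayDeque<String> stack = new ArrayDeque<>();
        stack.push("");
        while (!stack.isEmpty()) {
          String node = stack.pop();
          if (!node.isEmpty()) {
            System.out.println("term: " + node); // the "accept" hook
          }
          String kids = children.get(node);
          // Push children in reverse so they pop in sorted order.
          for (int i = kids.length() - 1; i >= 0; i--) {
            stack.push(node + kids.charAt(i));
          }
        }
      }
    }

The real enum additionally remembers, per frame, where the scan stopped on each
side (fstCandidateNode, transitionUpto), which is what makes the suspension
cheap.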
+
+  private void seekToStartTerm(BytesRef startTerm) throws IOException {
+    int length = startTerm.length;
+
+    while (currentLevel < length) {
+      Frame currentFrame = stack[currentLevel];
+      int target = startTerm.bytes[startTerm.offset + currentLevel] & 0xff;
+
+      if (currentFrame.numTransitions > 0
+          || hasDescendants(currentFrame.fstNode, currentFrame.fsaState)) {
+        initArcAndTransition(currentFrame, false);
+        currentFrame.isFresh = false;
+        fstAdvanceCeil(target, currentFrame.fstCandidateNode);
+        fsaAdvanceCeil(currentFrame, target);
+
+        if (currentFrame.fstCandidateNode.label() == target
+            && (currentFrame.fsaTransition.min <= target
+                && target <= currentFrame.fsaTransition.max)) {
+          term.append((byte) target);
+          fillNextFrame(currentFrame);
+          continue;
+        }
+
+        if (currentFrame.fstCandidateNode.label() > target
+            || currentFrame.fsaTransition.min > target) {
+          pending = true;
+        }
+        break;
+      } else {
+        // the whole prefix up to this level matches, but the term to seek is longer
+        break;
+      }
+    }
+  }
+
+  private void fillNextFrame(Frame currentFrame) {
+    ensureStackCapacity();
+    Frame nextFrame;
+    // reuse previous allocations
+    if (stack[currentLevel + 1] == null) {
+      nextFrame = new Frame();
+    } else {
+      nextFrame = stack[currentLevel + 1];
+      nextFrame.numTransitions = 0;
+      nextFrame.isFresh = true;
+    }
+    nextFrame.fstNode = currentFrame.fstCandidateNode;
+    nextFrame.fsaState = currentFrame.fsaTransition.dest;
+    nextFrame.output = currentFrame.output + currentFrame.fstNode.output();
+    stack[++currentLevel] = nextFrame;
+  }
+
+  private void popFrame() {
+    currentLevel--;
+    term.setLength(currentLevel);
+  }
+
+  private boolean isAccept(PrimitiveLongArc fstNode, int fsaState) {
+    return byteRunnable.isAccept(fsaState) && fstNode.isFinal();
+  }
+
+  private boolean hasDescendants(PrimitiveLongArc fstNode, int fsaState) {
+    return transitionAccessor.getNumTransitions(fsaState) > 0
+        && PrimitiveLongFST.targetHasArcs(fstNode);
+  }
+
+  private void initArcAndTransition(Frame frame, boolean advanceToFirstTransition)
+      throws IOException {
+    fst.readFirstRealTargetArc(frame.fstNode.target(), frame.fstCandidateNode, fstBytesReader);
+    frame.numTransitions = transitionAccessor.initTransition(frame.fsaState, frame.fsaTransition);
+    frame.transitionUpto = 0;
+    if (advanceToFirstTransition) {
+      transitionAccessor.getNextTransition(frame.fsaTransition);
+      frame.transitionUpto++;
+    }
+  }
+
+  private boolean findNextIntersection(Frame frame) throws IOException {
+    if (frame.isFresh) {
+      // when called the first time, init the first FST arc and the first FSA transition
+      initArcAndTransition(frame, true);
+      frame.isFresh = false;
+    } else if (pending) {
+      pending = false;
+    } else {
+      // subsequent call, which implies we previously found an intersection;
+      // we need to advance the FST to avoid returning the same state.
+      // Advance the FST, not the FSA, because an FST arc has a single label,
+      // whereas an FSA transition may accept a range of labels.
+      if (frame.fstCandidateNode.isLast()) {
+        return false;
+      }
+      frame.fstCandidateNode = fst.readNextRealArc(frame.fstCandidateNode, fstBytesReader);
+    }
+
+    while (true) {
+      if (frame.fstCandidateNode.label() < frame.fsaTransition.min) {
+        // advance FST
+        if (frame.fstCandidateNode.isLast()) {
+          // no more eligible FST arcs at this level
+          return false;
+        }
+        // TODO: advance to first arc that has label >= fsaTransition.min
+        // frame.fstCandidateNode =
+        //     fst.readNextRealArc(frame.fstCandidateNode, fstBytesReader);
+        if (fstAdvanceCeil(frame.fsaTransition.min, frame.fstCandidateNode) == false) {
+          return false;
+        }
+      } else if (frame.fstCandidateNode.label() > frame.fsaTransition.max) {
+        // advance FSA
+        if (frame.transitionUpto == frame.numTransitions) {
+          // no more eligible FSA transitions at this level
+          return false;
+        }
+        // TODO: advance FSA with binary search to fstNode.label()
+        // transitionAccessor.getNextTransition(frame.fsaTransition);
+        // frame.transitionUpto++;
+        fsaAdvanceCeil(frame, frame.fstCandidateNode.label());
+      } else {
+        // can go deeper
+        return true;
+      }
+    }
+  }
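
The loop at the heart of findNextIntersection is a two-pointer merge: FST arcs
are sorted single labels, FSA transitions are sorted, disjoint [min, max]
ranges, and whichever side is behind advances. The same rule in isolation, over
plain arrays (hypothetical values):

    public class TwoPointerIntersectSketch {
      public static void main(String[] args) {
        int[] arcLabels = {'a', 'd', 'm', 'z'}; // hypothetical FST arcs
        int[][] ranges = {{'b', 'e'}, {'x', 'y'}}; // hypothetical FSA transitions
        int ai = 0, ri = 0;
        while (ai < arcLabels.length && ri < ranges.length) {
          if (arcLabels[ai] < ranges[ri][0]) {
            ai++; // arc label below the range: advance the FST side
          } else if (arcLabels[ai] > ranges[ri][1]) {
            ri++; // arc label above the range: advance the FSA side
          } else {
            System.out.println("intersection at label " + (char) arcLabels[ai]);
            ai++; // an arc has one label, so only the FST side moves
          }
        }
      }
    }

Here only 'd' intersects: 'a' falls before [b, e], while 'm' and 'z' fall in
the gaps between and after the ranges.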
+
+  public BytesRef getTerm() {
+    return term.get();
+  }
+
+  public long getFSTOutput() {
+    return fstOutput;
+  }
+
+  /**
+   * Advances to the arc whose label is greater than or equal to the provided target.
+   *
+   * @return true, if found.
+   */
+  private boolean fstAdvanceCeil(int target, PrimitiveLongArc /* mutates */ arc)
+      throws IOException {
+    if (arc.bytesPerArc() != 0 && arc.label() != PrimitiveLongFST.END_LABEL) {
+      if (arc.nodeFlags() == PrimitiveLongFST.ARCS_FOR_CONTINUOUS) {
+        int targetIndex = target - arc.label() + arc.arcIdx();
+        if (targetIndex < 0) {
+          return false;
+        } else if (targetIndex >= arc.numArcs()) {
+          fst.readArcByContinuous(arc, fstBytesReader, arc.numArcs() - 1);
+          return false;
+        } else {
+          fst.readArcByContinuous(arc, fstBytesReader, targetIndex);
+          return true;
+        }
+      } else if (arc.nodeFlags() == PrimitiveLongFST.ARCS_FOR_DIRECT_ADDRESSING) {
+        // Fixed length arcs in a direct addressing node.
+        int targetIndex = target - arc.label() + arc.arcIdx();
+        if (targetIndex < 0) {
+          return false;
+        } else if (targetIndex >= arc.numArcs()) {
+          // Beyond the last arc: position on the last arc, like the continuous case above.
+          fst.readArcByDirectAddressing(arc, fstBytesReader, arc.numArcs() - 1);
+          return false;
+        } else {
+          if (PrimitiveLongArc.BitTable.isBitSet(targetIndex, arc, fstBytesReader)) {
+            fst.readArcByDirectAddressing(arc, fstBytesReader, targetIndex);
+          } else {
+            int ceilIndex = PrimitiveLongArc.BitTable.nextBitSet(targetIndex, arc, fstBytesReader);
+            if (ceilIndex == -1) {
+              return false;
+            }
+            fst.readArcByDirectAddressing(arc, fstBytesReader, ceilIndex);
+          }
+          return true;
+        }
+      }
+      // Fixed length arcs in a binary search node.
+      int idx = Util.binarySearch(fst, arc, target);
+      if (idx >= 0) {
+        fst.readArcByIndex(arc, fstBytesReader, idx);
+        return true;
+      }
+      idx = -1 - idx;
+      if (idx == arc.numArcs()) {
+        fst.readArcByIndex(arc, fstBytesReader, arc.numArcs() - 1);
+        // DEAD END!
+        return false;
+      }
+      fst.readArcByIndex(arc, fstBytesReader, idx);
+      return true;
+    }
+
+    // Variable length arcs in a linear scan list,
+    // or special arc with label == FST.END_LABEL.
+    while (true) {
+      if (arc.label() >= target) {
+        return true;
+      } else if (arc.isLast()) {
+        return false;
+      } else {
+        fst.readNextRealArc(arc, fstBytesReader);
+      }
+    }
+  }
+
+  private void fsaAdvanceCeil(Frame frame, int target) {
+    int low = frame.transitionUpto;
+    int high = frame.numTransitions;
+    Transition t = frame.fsaTransition;
+
+    // binary search over [low, high) on the transitions' min labels
+    int mid = 0;
+    while (high - low > 1) {
+      mid = (high + low) >>> 1;
+      transitionAccessor.getTransition(frame.fsaState, mid, t);
+      if (t.min > target) {
+        high = mid;
+      } else if (t.min < target) {
+        low = mid;
+      } else {
+        frame.transitionUpto = mid + 1;
+        return;
+      }
+    }
+    transitionAccessor.getTransition(frame.fsaState, low, t);
+    frame.transitionUpto = low + 1;
+  }
+
+  /** Linear-scan equivalent of {@link #fsaAdvanceCeil}. */
+  private boolean fsaAdvanceCeilSlow(Frame frame, int target) {
+    while (frame.transitionUpto < frame.numTransitions) {
+      transitionAccessor.getNextTransition(frame.fsaTransition);
+      frame.transitionUpto++;
+      if (target <= frame.fsaTransition.max) {
+        return frame.fsaTransition.min <= target;
+      }
+    }
+    return false;
+  }
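
fsaAdvanceCeil narrows only on the transitions' min labels; whether the target
actually falls inside the chosen [min, max] range is left to the caller's label
comparisons. The same search in isolation (hypothetical ranges):

    public class RangeCeilSearchSketch {
      public static void main(String[] args) {
        int[][] transitions = {{'a', 'c'}, {'f', 'h'}, {'m', 'p'}}; // sorted by min
        int target = 'g';
        int low = 0, high = transitions.length; // search window [low, high)
        while (high - low > 1) {
          int mid = (low + high) >>> 1;
          if (transitions[mid][0] > target) {
            high = mid;
          } else {
            low = mid;
          }
        }
        // low is now the last transition whose min <= target.
        boolean inside = transitions[low][0] <= target && target <= transitions[low][1];
        System.out.println("candidate range index " + low + ", contains target: " + inside);
      }
    }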
+
+  /**
+   * We maintain the state of a conventional recursive DFS traversal as an explicit stack of
+   * frames. This class captures the state at one level.
+   */
+  static final class Frame {
+    PrimitiveLongArc fstNode;
+
+    PrimitiveLongArc fstCandidateNode = new PrimitiveLongArc();
+
+    int fsaState;
+
+    long output;
+
+    Transition fsaTransition = new Transition();
+
+    int transitionUpto;
+
+    int numTransitions;
+
+    boolean isFresh = true;
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
index 9fdc460d0583..31c267234e69 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
@@ -32,6 +32,7 @@
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.fst.FST.Arc;
 import org.apache.lucene.util.fst.FST.BytesReader;
+import org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongArc;
 
 /**
  * Static helper methods.
@@ -896,6 +897,88 @@ public static Arc readCeilArc(
     }
   }
 
+  /**
+   * TODO: can we work around this???
+   *
+   * <p>Same as {@link Util#readCeilArc(int, FST, Arc, Arc, BytesReader)} but for {@link
+   * PrimitiveLongFST}
+   */
+  public static PrimitiveLongArc readCeilArc(
+      int label,
+      PrimitiveLongFST fst,
+      PrimitiveLongArc follow,
+      PrimitiveLongArc arc,
+      BytesReader in)
+      throws IOException {
+    if (label == PrimitiveLongFST.END_LABEL) {
+      return PrimitiveLongFST.readEndArc(follow, arc);
+    }
+    if (!PrimitiveLongFST.targetHasArcs(follow)) {
+      return null;
+    }
+    fst.readFirstTargetArc(follow, arc, in);
+    if (arc.bytesPerArc() != 0 && arc.label() != PrimitiveLongFST.END_LABEL) {
+      if (arc.nodeFlags() == PrimitiveLongFST.ARCS_FOR_DIRECT_ADDRESSING) {
+        // Fixed length arcs in a direct addressing node.
+        int targetIndex = label - arc.label();
+        if (targetIndex >= arc.numArcs()) {
+          return null;
+        } else if (targetIndex < 0) {
+          return arc;
+        } else {
+          if (PrimitiveLongArc.BitTable.isBitSet(targetIndex, arc, in)) {
+            fst.readArcByDirectAddressing(arc, in, targetIndex);
+            assert arc.label() == label;
+          } else {
+            int ceilIndex = PrimitiveLongArc.BitTable.nextBitSet(targetIndex, arc, in);
+            assert ceilIndex != -1;
+            fst.readArcByDirectAddressing(arc, in, ceilIndex);
+            assert arc.label() > label;
+          }
+          return arc;
+        }
+      } else if (arc.nodeFlags() == PrimitiveLongFST.ARCS_FOR_CONTINUOUS) {
+        int targetIndex = label - arc.label();
+        if (targetIndex >= arc.numArcs()) {
+          return null;
+        } else if (targetIndex < 0) {
+          return arc;
+        } else {
+          fst.readArcByContinuous(arc, in, targetIndex);
+          assert arc.label() == label;
+          return arc;
+        }
+      }
+      // Fixed length arcs in a binary search node.
+      int idx = binarySearch(fst, arc, label);
+      if (idx >= 0) {
+        return fst.readArcByIndex(arc, in, idx);
+      }
+      idx = -1 - idx;
+      if (idx == arc.numArcs()) {
+        // DEAD END!
+        return null;
+      }
+      return fst.readArcByIndex(arc, in, idx);
+    }
+
+    // Variable length arcs in a linear scan list,
+    // or special arc with label == FST.END_LABEL.
+    fst.readFirstRealTargetArc(follow.target(), arc, in);
+
+    while (true) {
+      // System.out.println("  non-bs cycle");
+      if (arc.label() >= label) {
+        // System.out.println("    found!");
+        return arc;
+      } else if (arc.isLast()) {
+        return null;
+      } else {
+        fst.readNextRealArc(arc, in);
+      }
+    }
+  }
+
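
This overload mirrors the long-standing boxed Util.readCeilArc, so the boxed
variant can demonstrate the intended behavior: starting from a node, return the
first arc whose label is greater than or equal to the requested label, or null
at a dead end. A small usage sketch (term set hypothetical):

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.IntsRefBuilder;
    import org.apache.lucene.util.fst.FST;
    import org.apache.lucene.util.fst.FSTCompiler;
    import org.apache.lucene.util.fst.PositiveIntOutputs;
    import org.apache.lucene.util.fst.Util;

    public class ReadCeilArcSketch {
      public static void main(String[] args) throws Exception {
        FSTCompiler<Long> c =
            new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton())
                .build();
        IntsRefBuilder scratch = new IntsRefBuilder();
        c.add(Util.toIntsRef(new BytesRef("apple"), scratch), 1L);
        c.add(Util.toIntsRef(new BytesRef("cherry"), scratch), 2L);
        FST<Long> fst = c.compile();
        FST.Arc<Long> follow = fst.getFirstArc(new FST.Arc<>());
        FST.Arc<Long> arc = new FST.Arc<>();
        // No root arc is labeled 'b', so the ceiling is 'c' (start of "cherry").
        FST.Arc<Long> ceil = Util.readCeilArc('b', fst, follow, arc, fst.getBytesReader());
        System.out.println((char) ceil.label()); // c
      }
    }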
   /**
    * Perform a binary search of Arcs encoded as a packed array
    *
@@ -934,4 +1017,32 @@ static int binarySearch(FST fst, FST.Arc arc, int targetLabel) throws
     }
     return -1 - low;
   }
+
+  /** Same as {@link Util#binarySearch(FST, Arc, int)} but for {@link PrimitiveLongFST} */
+  static int binarySearch(PrimitiveLongFST fst, PrimitiveLongArc arc, int targetLabel)
+      throws IOException {
+    assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH
+        : "Arc is not encoded as packed array for binary search (nodeFlags="
+            + arc.nodeFlags()
+            + ")";
+    BytesReader in = fst.getBytesReader();
+    int low = arc.arcIdx();
+    int mid;
+    int high = arc.numArcs() - 1;
+    while (low <= high) {
+      mid = (low + high) >>> 1;
+      in.setPosition(arc.posArcsStart());
+      in.skipBytes((long) arc.bytesPerArc() * mid + 1);
+      final int midLabel = fst.readLabel(in);
+      final int cmp = midLabel - targetLabel;
+      if (cmp < 0) {
+        low = mid + 1;
+      } else if (cmp > 0) {
+        high = mid - 1;
+      } else {
+        return mid;
+      }
+    }
+    return -1 - low;
+  }
 }
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java b/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java
index 7c72b3d2e76a..97454969be90 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java
@@ -80,7 +80,7 @@ public void test() throws Exception {
     // System.out.println("query clone count=" + queryCloneCount);
     assertTrue(
         "too many calls to IndexInput.clone during TermRangeQuery: " + queryCloneCount,
-        queryCloneCount < 50);
+        queryCloneCount < 100);
     r.close();
     dir.close();
   }
+ */ + +package org.apache.lucene.util.fst; + +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRefBuilder; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.ByteRunnable; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.RegExp; +import org.apache.lucene.util.automaton.Transition; +import org.apache.lucene.util.automaton.TransitionAccessor; + +public class TestPrimitiveLongFSTIntersectEnum extends LuceneTestCase { + + public void testBasics() throws IOException { + String[] testTerms = { + "!", "*", "+", "++", "+++b", "++c", "a", "b", "bb", "dd", + }; + + HashMap termOutputs = new HashMap<>(); + + IntsRefBuilder scratchInts = new IntsRefBuilder(); + FSTCompiler fstCompiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()).build(); + + for (var term : testTerms) { + long output = random().nextLong(1, 1024); + termOutputs.put(term, output); + fstCompiler.add(Util.toIntsRef(new BytesRef(term), scratchInts), output); + // System.out.println(term + ": " + output); + } + + var boxedFst = fstCompiler.compile(); + + byte[] metaBytes = new byte[4096]; + byte[] dataBytes = new byte[4096]; + DataOutput metaOut = new ByteArrayDataOutput(metaBytes); + DataOutput dataOutput = new ByteArrayDataOutput(dataBytes); + + boxedFst.save(metaOut, dataOutput); + + PrimitiveLongFST primitiveLongFst = + new PrimitiveLongFST( + PrimitiveLongFST.readMetadata( + new ByteArrayDataInput(metaBytes), + PrimitiveLongFST.PrimitiveLongFSTOutputs.getSingleton()), + new ByteArrayDataInput(dataBytes)); + + // RegExp regExp = new RegExp("a([a-f]|[j-z])c", RegExp.NONE); + RegExp regExp = new RegExp("+*.", RegExp.NONE); + Automaton a = regExp.toAutomaton(); + CompiledAutomaton compiledAutomaton = new CompiledAutomaton(a); + + var byteRunnable = compiledAutomaton.getByteRunnable(); + var transitionAccessor = compiledAutomaton.getTransitionAccessor(); + // dfsAutomaton(byteRunnable, transitionAccessor, 0, ""); + + PrimitiveLongFST.PrimitiveLongArc firstArc = new PrimitiveLongFST.PrimitiveLongArc(); + System.out.println("---- recursive algo ----"); + dfsIntersectFsaFst( + primitiveLongFst, + primitiveLongFst.getBytesReader(), + primitiveLongFst.getFirstArc(firstArc), + "", + 0, + byteRunnable, + transitionAccessor, + 0); + + System.out.println("---- non-recursive algo ----"); + var intersectEnum = + new PrimitiveLongFSTIntersectEnum(primitiveLongFst, compiledAutomaton, null); + while (intersectEnum.next()) { + String term = intersectEnum.getTerm().utf8ToString(); + long actualOutput = intersectEnum.getFSTOutput(); + System.out.println( + term + " expected output:" + termOutputs.get(term) + " actual: " + actualOutput); + } + } + + void dfs( + PrimitiveLongFST fst, + FST.BytesReader in, + PrimitiveLongFST.PrimitiveLongArc currentLevelNode, + String path, + long acc) + throws IOException { + if (currentLevelNode.isFinal()) { + long output = acc + currentLevelNode.output() + currentLevelNode.nextFinalOutput(); + System.out.println(path + (char) currentLevelNode.label() + "raw output: " + output); + } + + if (PrimitiveLongFST.targetHasArcs(currentLevelNode)) { + String pathNext = + 
+
+  void dfs(
+      PrimitiveLongFST fst,
+      FST.BytesReader in,
+      PrimitiveLongFST.PrimitiveLongArc currentLevelNode,
+      String path,
+      long acc)
+      throws IOException {
+    if (currentLevelNode.isFinal()) {
+      long output = acc + currentLevelNode.output() + currentLevelNode.nextFinalOutput();
+      System.out.println(path + (char) currentLevelNode.label() + " raw output: " + output);
+    }
+
+    if (PrimitiveLongFST.targetHasArcs(currentLevelNode)) {
+      String pathNext =
+          currentLevelNode.label() > 0 ? path + (char) currentLevelNode.label() : path;
+      long accNext = currentLevelNode.label() > 0 ? acc + currentLevelNode.output() : acc;
+      var nextLevelNode = new PrimitiveLongFST.PrimitiveLongArc();
+      fst.readFirstRealTargetArc(currentLevelNode.target(), nextLevelNode, in);
+      dfs(fst, in, nextLevelNode, pathNext, accNext);
+    }
+
+    if (currentLevelNode.isLast() == false) {
+      fst.readNextRealArc(currentLevelNode, in);
+      dfs(fst, in, currentLevelNode, path, acc);
+    }
+  }
+
+  public void testAutomaton() {
+    RegExp regExp = new RegExp("+*.", RegExp.NONE);
+    Automaton a = regExp.toAutomaton();
+    CompiledAutomaton compiledAutomaton = new CompiledAutomaton(a);
+    System.out.println("isFinite: " + compiledAutomaton.finite);
+
+    var byteRunnable = compiledAutomaton.getByteRunnable();
+    var transitionAccessor = compiledAutomaton.getTransitionAccessor();
+    // dfsAutomaton(byteRunnable, transitionAccessor, 0, "");
+    // dumpTransitionsViaNext(byteRunnable, transitionAccessor, 0, new HashSet<>());
+    dumpTransitionsViaRA(byteRunnable, transitionAccessor, 0, new HashSet<>());
+  }
+
+  void dfsAutomaton(
+      ByteRunnable a, TransitionAccessor transitionAccessor, int currentLevelState, String path) {
+    if (a.isAccept(currentLevelState)) {
+      if (path.length() > 50) {
+        throw new RuntimeException();
+      }
+      System.out.println("found: " + path);
+    }
+
+    int currentLevelSize = transitionAccessor.getNumTransitions(currentLevelState);
+    for (int i = 0; i < currentLevelSize; i++) {
+      Transition t = new Transition();
+      // read the i-th transition by random access; a bare getNextTransition here
+      // would read from an uninitialized iteration cursor
+      transitionAccessor.getTransition(currentLevelState, i, t);
+      System.out.println(
+          "At: src: "
+              + t.source
+              + " ["
+              + t.min
+              + ", "
+              + t.max
+              + "] "
+              + "dest: "
+              + t.dest
+              + " is dest accept: "
+              + (a.isAccept(t.dest) ? "yes" : "no"));
+      for (int label = t.min; label <= t.max; label++) {
+        dfsAutomaton(a, transitionAccessor, t.dest, path + " " + label);
+      }
+    }
+  }
+
+  void dumpTransitionsViaNext(
+      ByteRunnable a,
+      TransitionAccessor transitionAccessor,
+      int currentState,
+      Set<Integer> seenStates) {
+    if (seenStates.contains(currentState)) {
+      return;
+    }
+
+    seenStates.add(currentState);
+
+    var t = new Transition();
+    var numTransitions = transitionAccessor.initTransition(currentState, t);
+
+    for (int i = 0; i < numTransitions; i++) {
+      transitionAccessor.getNextTransition(t);
+      System.out.println(
+          "At: src: "
+              + t.source
+              + " arcIdx: "
+              + i
+              + " ["
+              + t.min
+              + ", "
+              + t.max
+              + "] "
+              + "dest: "
+              + t.dest
+              + " is dest accept: "
+              + (a.isAccept(t.dest) ? "yes" : "no"));
+      dumpTransitionsViaNext(a, transitionAccessor, t.dest, seenStates);
+    }
+  }
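
dumpTransitionsViaNext above and dumpTransitionsViaRA below exercise the two
iteration styles of TransitionAccessor: a sequential cursor carried inside the
Transition object (initTransition + getNextTransition) versus random access by
index (getTransition). A compact sketch against a hand-built Automaton:

    import org.apache.lucene.util.automaton.Automaton;
    import org.apache.lucene.util.automaton.Transition;

    public class TransitionIterationSketch {
      public static void main(String[] args) {
        Automaton a = new Automaton();
        int s0 = a.createState();
        int s1 = a.createState();
        a.setAccept(s1, true);
        a.addTransition(s0, s1, 'a', 'c');
        a.addTransition(s0, s1, 'x', 'z');
        a.finishState();

        Transition t = new Transition();
        int numTransitions = a.initTransition(s0, t);
        for (int i = 0; i < numTransitions; i++) {
          a.getNextTransition(t); // the cursor lives inside t
          System.out.println("[" + (char) t.min + ", " + (char) t.max + "] -> " + t.dest);
        }
        a.getTransition(s0, 1, t); // random access to the second transition
        System.out.println("second transition starts at '" + (char) t.min + "'");
      }
    }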
"yes" : "no")); + dumpTransitionsViaRA(a, transitionAccessor, t.dest, seenStates); + } + } + + void dfsIntersectFsaFst( + PrimitiveLongFST fst, + FST.BytesReader in, + PrimitiveLongFST.PrimitiveLongArc fstNode, + String path, + long acc, + ByteRunnable a, + TransitionAccessor transitionAccessor, + int fsaState) + throws IOException { + + if (a.isAccept(fsaState) && fstNode.isFinal()) { + // found + System.out.println(path + ": " + (acc + fstNode.output() + fstNode.nextFinalOutput())); + } + + Transition fsaTransition = new Transition(); + int numTransitions = transitionAccessor.initTransition(fsaState, fsaTransition); + + if (numTransitions <= 0 || !PrimitiveLongFST.targetHasArcs(fstNode)) { + return; + } + + int transitionUpto = 0; + var nextLevelFstNode = new PrimitiveLongFST.PrimitiveLongArc(); + fst.readFirstRealTargetArc(fstNode.target(), nextLevelFstNode, in); + transitionAccessor.getNextTransition(fsaTransition); + transitionUpto++; + + while (true) { + if (nextLevelFstNode.label() < fsaTransition.min) { + // advance FST + if (nextLevelFstNode.isLast()) { + // no more eligible FST arc at this level + break; + } + // TODO: advance to first arc that has label >= fsaTransition.min + nextLevelFstNode = fst.readNextRealArc(nextLevelFstNode, in); + } else if (nextLevelFstNode.label() > fsaTransition.max) { + // advance FSA + if (transitionUpto == numTransitions) { + // no more eligible FSA transitions at this level + return; + } + // TODO: advance FSA with binary search to fstNode.label() + transitionAccessor.getNextTransition(fsaTransition); + transitionUpto++; + } else { + // can go deeper + String pathNext = path + (char) nextLevelFstNode.label(); + long accNext = acc + fstNode.output(); + int nextFsaState = fsaTransition.dest; + dfsIntersectFsaFst( + fst, in, nextLevelFstNode, pathNext, accNext, a, transitionAccessor, nextFsaState); + if (nextLevelFstNode.isLast()) { + // no more candidate at this prefix + return; + } else { + // TODO: advance to first arc that has label >= fsaTransition.min + nextLevelFstNode = fst.readNextRealArc(nextLevelFstNode, in); + } + } + } + } +} diff --git a/lucene/misc/src/java/org/apache/lucene/misc/store/RAFDirectory.java b/lucene/misc/src/java/org/apache/lucene/misc/store/RAFDirectory.java index 420d6d40d6de..21ba55fd08ab 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/store/RAFDirectory.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/store/RAFDirectory.java @@ -140,7 +140,8 @@ public IndexInput slice(String sliceDescription, long offset, long length) throw throw new IllegalArgumentException( "slice() " + sliceDescription + " out of bounds: " + this); } - return new RAFIndexInput(sliceDescription, file, off + offset, length, getBufferSize()); + String description = sliceDescription == null ? toString() : sliceDescription; + return new RAFIndexInput(description, file, off + offset, length, getBufferSize()); } @Override