From af115b95744d8b0e4cef3c3c82393aa58ffe15cd Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Tue, 19 Sep 2023 11:41:28 -0700 Subject: [PATCH 01/57] Setup no-op Lucene90RandomAcessDictionaryPostingsFormat --- lucene/core/src/java/module-info.java | 1 + ...90RandomAcessDictionaryPostingsFormat.java | 81 +++++++++++++++++++ .../Lucene90RandomAccessTermsReader.java | 46 +++++++++++ .../Lucene90RandomAccessTermsWriter.java | 30 +++++++ 4 files changed, 158 insertions(+) create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90RandomAcessDictionaryPostingsFormat.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsReader.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsWriter.java diff --git a/lucene/core/src/java/module-info.java b/lucene/core/src/java/module-info.java index f5d8cd275b79..bf67d687cfb9 100644 --- a/lucene/core/src/java/module-info.java +++ b/lucene/core/src/java/module-info.java @@ -35,6 +35,7 @@ exports org.apache.lucene.codecs.lucene95; exports org.apache.lucene.codecs.lucene90.blocktree; exports org.apache.lucene.codecs.lucene90.compressing; + exports org.apache.lucene.codecs.lucene90.radomaccess; exports org.apache.lucene.codecs.perfield; exports org.apache.lucene.document; exports org.apache.lucene.geo; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90RandomAcessDictionaryPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90RandomAcessDictionaryPostingsFormat.java new file mode 100644 index 000000000000..4b70dba02d96 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90RandomAcessDictionaryPostingsFormat.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene90; + +import java.io.IOException; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.PostingsReaderBase; +import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.codecs.lucene90.radomaccess.Lucene90RandomAccessTermsReader; +import org.apache.lucene.codecs.lucene90.radomaccess.Lucene90RandomAccessTermsWriter; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.util.IOUtils; + +/** + * Similar to {@link Lucene90PostingsFormat} but with a different term dictionary implementation. 
+ * + * @lucene.experimental + */ +public final class Lucene90RandomAcessDictionaryPostingsFormat extends PostingsFormat { + + // Increment version to change it + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + /** Creates {@code Lucene90RandomAcessDictionaryPostingsFormat} */ + public Lucene90RandomAcessDictionaryPostingsFormat() { + super("Lucene90RandomAccess"); + } + + @Override + public String toString() { + return getName(); + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + PostingsWriterBase postingsWriter = new Lucene90PostingsWriter(state); + boolean success = false; + try { + FieldsConsumer ret = new Lucene90RandomAccessTermsWriter(); + success = true; + return ret; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(postingsWriter); + } + } + } + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + PostingsReaderBase postingsReader = new Lucene90PostingsReader(state); + boolean success = false; + try { + FieldsProducer ret = new Lucene90RandomAccessTermsReader(); + success = true; + return ret; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(postingsReader); + } + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsReader.java new file mode 100644 index 000000000000..7294f1ea09a9 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsReader.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.codecs.lucene90.radomaccess; + +import java.io.IOException; +import java.util.Iterator; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.index.Terms; + +public class Lucene90RandomAccessTermsReader extends FieldsProducer { + @Override + public void close() throws IOException {} + + @Override + public void checkIntegrity() throws IOException {} + + @Override + public Iterator iterator() { + return null; + } + + @Override + public Terms terms(String field) throws IOException { + return null; + } + + @Override + public int size() { + return 0; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsWriter.java new file mode 100644 index 000000000000..4fe720907b38 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsWriter.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene90.radomaccess; + +import java.io.IOException; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.NormsProducer; +import org.apache.lucene.index.Fields; + +public class Lucene90RandomAccessTermsWriter extends FieldsConsumer { + @Override + public void write(Fields fields, NormsProducer norms) throws IOException {} + + @Override + public void close() throws IOException {} +} From 88afec30606c13616f2377a7201b7dce125af1c8 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 16 Oct 2023 14:10:05 -0700 Subject: [PATCH 02/57] Rename Lucene90RandomAcessDictionaryPostingsFormat to Lucene90RandomAccessDictionaryPostingsFormat --- .../Lucene90RandomAccessDictionaryPostingsFormat.java} | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) rename lucene/core/src/java/org/apache/lucene/codecs/lucene90/{Lucene90RandomAcessDictionaryPostingsFormat.java => radomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java} (91%) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90RandomAcessDictionaryPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java similarity index 91% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90RandomAcessDictionaryPostingsFormat.java rename to 
lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java index 4b70dba02d96..2918fb8d367d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90RandomAcessDictionaryPostingsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene90; +package org.apache.lucene.codecs.lucene90.radomaccess; import java.io.IOException; import org.apache.lucene.codecs.FieldsConsumer; @@ -22,6 +22,9 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90PostingsReader; +import org.apache.lucene.codecs.lucene90.Lucene90PostingsWriter; import org.apache.lucene.codecs.lucene90.radomaccess.Lucene90RandomAccessTermsReader; import org.apache.lucene.codecs.lucene90.radomaccess.Lucene90RandomAccessTermsWriter; import org.apache.lucene.index.SegmentReadState; From d16c5012db5353c42741b4f562c402ed37d2f6c7 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 16 Oct 2023 14:16:11 -0700 Subject: [PATCH 03/57] restrict class visibility --- .../Lucene90RandomAccessDictionaryPostingsFormat.java | 8 +++----- .../radomaccess/Lucene90RandomAccessTermsReader.java | 2 +- .../radomaccess/Lucene90RandomAccessTermsWriter.java | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java index 2918fb8d367d..7d770ceb7f26 100644 --- 
a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java @@ -25,8 +25,6 @@ import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat; import org.apache.lucene.codecs.lucene90.Lucene90PostingsReader; import org.apache.lucene.codecs.lucene90.Lucene90PostingsWriter; -import org.apache.lucene.codecs.lucene90.radomaccess.Lucene90RandomAccessTermsReader; -import org.apache.lucene.codecs.lucene90.radomaccess.Lucene90RandomAccessTermsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.IOUtils; @@ -36,14 +34,14 @@ * * @lucene.experimental */ -public final class Lucene90RandomAcessDictionaryPostingsFormat extends PostingsFormat { +public final class Lucene90RandomAccessDictionaryPostingsFormat extends PostingsFormat { // Increment version to change it static final int VERSION_START = 0; static final int VERSION_CURRENT = VERSION_START; - /** Creates {@code Lucene90RandomAcessDictionaryPostingsFormat} */ - public Lucene90RandomAcessDictionaryPostingsFormat() { + /** Creates {@code Lucene90RandomAccessDictionaryPostingsFormat} */ + public Lucene90RandomAccessDictionaryPostingsFormat() { super("Lucene90RandomAccess"); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsReader.java index 7294f1ea09a9..7fa08663baba 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsReader.java @@ -22,7 +22,7 @@ import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.index.Terms; -public 
class Lucene90RandomAccessTermsReader extends FieldsProducer { +class Lucene90RandomAccessTermsReader extends FieldsProducer { @Override public void close() throws IOException {} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsWriter.java index 4fe720907b38..19bdf35845ee 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsWriter.java @@ -21,7 +21,7 @@ import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.index.Fields; -public class Lucene90RandomAccessTermsWriter extends FieldsConsumer { +class Lucene90RandomAccessTermsWriter extends FieldsConsumer { @Override public void write(Fields fields, NormsProducer norms) throws IOException {} From 3299fe065a50ae876bbb1e4dcdcaceabdbb27ac6 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 16 Oct 2023 15:08:07 -0700 Subject: [PATCH 04/57] Support per-type term index based on FST --- .../codecs/lucene90/radomaccess/TermType.java | 88 +++++++++++++++++++ .../lucene90/radomaccess/TermsIndex.java | 24 +++++ .../radomaccess/TermsIndexBuilder.java | 66 ++++++++++++++ .../radomaccess/TestTermsIndexBuilder.java | 77 ++++++++++++++++ 4 files changed, 255 insertions(+) create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermType.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndex.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndexBuilder.java create mode 100644 lucene/core/src/test/org/apache/lucene/codecs/lucene90/radomaccess/TestTermsIndexBuilder.java diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermType.java 
b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermType.java new file mode 100644 index 000000000000..7dae21a94dd5 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermType.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.codecs.lucene90.radomaccess; + +import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; + +import java.util.Objects; + +class TermType { + private static final byte SINGLETON_DOC_MASK = (byte) 1; + + private static final byte HAS_SKIP_DATA_MASK = (byte) 1 << 1; + + private static final byte HAS_VINT_POSITION_BLOCK_MASK = (byte) 1 << 2; + + public static final int NUM_TOTAL_TYPES = 8; + + private final byte flag; + + private TermType(byte flag) { + this.flag = flag; + } + + int getId() { + assert this.flag >= 0 && this.flag <=8; + return this.flag; + } + + boolean hasSingletonDoc() { + return (this.flag & SINGLETON_DOC_MASK) > 0; + } + + boolean hasSkipData() { + return (this.flag & HAS_SKIP_DATA_MASK) > 0; + } + + boolean hasVintPositionBlock() { + return (this.flag & HAS_VINT_POSITION_BLOCK_MASK) > 0; + } + + + static TermType fromTermState(IntBlockTermState state) { + byte flag = 0; + if (state.singletonDocID != -1) { + flag |= SINGLETON_DOC_MASK; + } + if (state.skipOffset != -1) { + flag |= HAS_SKIP_DATA_MASK; + } + if (state.lastPosBlockOffset != -1) { + flag |= HAS_VINT_POSITION_BLOCK_MASK; + } + return new TermType(flag); + } + + static TermType fromId(int id) { + if (id < 0 || id > 8) { + throw new IllegalArgumentException("id must be within range [0, 8]"); + } + return new TermType((byte) id); + } + + @Override + public int hashCode() { + return Objects.hashCode(this.flag); + } + + @Override + public boolean equals(Object that) { + return that instanceof TermType && + ((TermType) that).flag == this.flag; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndex.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndex.java new file mode 100644 index 000000000000..27de75bf10d6 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndex.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.codecs.lucene90.radomaccess; + +import org.apache.lucene.util.fst.FST; + + +record TermsIndex(FST fst) { +} \ No newline at end of file diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndexBuilder.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndexBuilder.java new file mode 100644 index 000000000000..ce7fe87207ce --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndexBuilder.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.codecs.lucene90.radomaccess; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRefBuilder; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTCompiler; +import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.fst.Util; + +import java.io.IOException; +import java.util.Arrays; + +/** + * Builds a term index for a given field. Logically this is a map: term -> (type, ord) where the ordinals + * are scoped to type (not global). + */ +final class TermsIndexBuilder { + private final long[] countPerType = new long[TermType.NUM_TOTAL_TYPES]; + private final FSTCompiler fstCompiler = + new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()); + + TermsIndexBuilder() { + Arrays.fill(countPerType, -1); + } + + public void addTerm(BytesRef term, TermType termType) throws IOException { + IntsRefBuilder scratchInts = new IntsRefBuilder(); + long ord = ++countPerType[termType.getId()]; + fstCompiler.add(Util.toIntsRef(term, scratchInts), encode(ord, termType)); + } + + public TermsIndex build() throws IOException { + return new TermsIndex(fstCompiler.compile()); + } + + private long encode(long ord, TermType termType) { + // use a single long to encode `ord` and `termType` + // also consider the special value of `PositiveIntOutputs.NO_OUTPUT == 0` + // so it looks like this |... ord ...| termType| ... hasOutput ...| + // where termType takes 3 bit and hasOutput takes the lowest bit. 
The rest is taken by ord + if ( ord < 0) { + throw new IllegalArgumentException("can't encode negative ord"); + } + if ( ord > ((1L << 60) - 1) ) { + throw new IllegalArgumentException("Input ord is too large"); + } + return (ord << 4) | ((long) termType.getId() << 1) | 1L; + } +} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/radomaccess/TestTermsIndexBuilder.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/radomaccess/TestTermsIndexBuilder.java new file mode 100644 index 000000000000..b8fd67ac64cc --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/radomaccess/TestTermsIndexBuilder.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.codecs.lucene90.radomaccess; + +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.Util; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +public class TestTermsIndexBuilder extends LuceneTestCase { + + public void testBasics() throws IOException { + String[] test_terms = { + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + }; + + Map termsToType = new HashMap<>(); + Map termsToOrd = new HashMap<>(); + Map typeCounters = new HashMap<>(); + + for (String term : test_terms) { + int termType = random().nextInt(TermType.NUM_TOTAL_TYPES); + termsToType.put(term, termType); + int ord = typeCounters.getOrDefault(termType, -1) + 1; + typeCounters.put(termType, ord); + termsToOrd.put(term, ord); + } + + TermsIndexBuilder builder = new TermsIndexBuilder(); + for (String term : test_terms) { + BytesRef termBytes = new BytesRef(term); + builder.addTerm(termBytes, TermType.fromId(termsToType.get(term))); + } + TermsIndex termsIndex = builder.build(); + + FST fst = termsIndex.fst(); + + for (String term : test_terms) { + BytesRef termBytes = new BytesRef(term); + long encoded = Util.get(fst, termBytes); + + assertEquals(1L, encoded & 0b1L); + assertEquals((long) termsToType.get(term), (encoded & 0b1110L) >> 1); + assertEquals((long) termsToOrd.get(term), encoded >> 4); + } + + } + +} \ No newline at end of file From 137d5d367df46f33e076c918e3296216ffbc49f7 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Thu, 26 Oct 2023 14:03:45 -0700 Subject: [PATCH 05/57] Move the code to be under sandbox --- lucene/core/src/java/module-info.java | 1 - .../codecs/lucene90/radomaccess/TermType.java | 88 ------------------ .../radomaccess/TermsIndexBuilder.java | 66 -------------- .../radomaccess/TestTermsIndexBuilder.java | 77 ---------------- lucene/sandbox/src/java/module-info.java | 1 + 
...0RandomAccessDictionaryPostingsFormat.java | 2 +- .../Lucene90RandomAccessTermsReader.java | 2 +- .../Lucene90RandomAccessTermsWriter.java | 2 +- .../lucene90/randomaccess/TermType.java | 91 +++++++++++++++++++ .../lucene90/randomaccess}/TermsIndex.java | 6 +- .../randomaccess/TermsIndexBuilder.java | 70 ++++++++++++++ .../lucene90/randomaccess/package-info.java | 22 +++++ .../randomaccess/TestTermsIndexBuilder.java | 65 +++++++++++++ 13 files changed, 254 insertions(+), 239 deletions(-) delete mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermType.java delete mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndexBuilder.java delete mode 100644 lucene/core/src/test/org/apache/lucene/codecs/lucene90/radomaccess/TestTermsIndexBuilder.java rename lucene/{core/src/java/org/apache/lucene/codecs/lucene90/radomaccess => sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess}/Lucene90RandomAccessDictionaryPostingsFormat.java (97%) rename lucene/{core/src/java/org/apache/lucene/codecs/lucene90/radomaccess => sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess}/Lucene90RandomAccessTermsReader.java (95%) rename lucene/{core/src/java/org/apache/lucene/codecs/lucene90/radomaccess => sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess}/Lucene90RandomAccessTermsWriter.java (94%) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java rename lucene/{core/src/java/org/apache/lucene/codecs/lucene90/radomaccess => sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess}/TermsIndex.java (89%) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/package-info.java create mode 100644 
lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermsIndexBuilder.java diff --git a/lucene/core/src/java/module-info.java b/lucene/core/src/java/module-info.java index 3b1f27ff6160..c728be820999 100644 --- a/lucene/core/src/java/module-info.java +++ b/lucene/core/src/java/module-info.java @@ -35,7 +35,6 @@ exports org.apache.lucene.codecs.lucene95; exports org.apache.lucene.codecs.lucene90.blocktree; exports org.apache.lucene.codecs.lucene90.compressing; - exports org.apache.lucene.codecs.lucene90.radomaccess; exports org.apache.lucene.codecs.perfield; exports org.apache.lucene.document; exports org.apache.lucene.geo; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermType.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermType.java deleted file mode 100644 index 7dae21a94dd5..000000000000 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermType.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.lucene.codecs.lucene90.radomaccess; - -import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; - -import java.util.Objects; - -class TermType { - private static final byte SINGLETON_DOC_MASK = (byte) 1; - - private static final byte HAS_SKIP_DATA_MASK = (byte) 1 << 1; - - private static final byte HAS_VINT_POSITION_BLOCK_MASK = (byte) 1 << 2; - - public static final int NUM_TOTAL_TYPES = 8; - - private final byte flag; - - private TermType(byte flag) { - this.flag = flag; - } - - int getId() { - assert this.flag >= 0 && this.flag <=8; - return this.flag; - } - - boolean hasSingletonDoc() { - return (this.flag & SINGLETON_DOC_MASK) > 0; - } - - boolean hasSkipData() { - return (this.flag & HAS_SKIP_DATA_MASK) > 0; - } - - boolean hasVintPositionBlock() { - return (this.flag & HAS_VINT_POSITION_BLOCK_MASK) > 0; - } - - - static TermType fromTermState(IntBlockTermState state) { - byte flag = 0; - if (state.singletonDocID != -1) { - flag |= SINGLETON_DOC_MASK; - } - if (state.skipOffset != -1) { - flag |= HAS_SKIP_DATA_MASK; - } - if (state.lastPosBlockOffset != -1) { - flag |= HAS_VINT_POSITION_BLOCK_MASK; - } - return new TermType(flag); - } - - static TermType fromId(int id) { - if (id < 0 || id > 8) { - throw new IllegalArgumentException("id must be within range [0, 8]"); - } - return new TermType((byte) id); - } - - @Override - public int hashCode() { - return Objects.hashCode(this.flag); - } - - @Override - public boolean equals(Object that) { - return that instanceof TermType && - ((TermType) that).flag == this.flag; - } -} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndexBuilder.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndexBuilder.java deleted file mode 100644 index ce7fe87207ce..000000000000 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndexBuilder.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * 
Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.codecs.lucene90.radomaccess; - -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.IntsRefBuilder; -import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.FSTCompiler; -import org.apache.lucene.util.fst.PositiveIntOutputs; -import org.apache.lucene.util.fst.Util; - -import java.io.IOException; -import java.util.Arrays; - -/** - * Builds a term index for a given field. Logically this is a map: term -> (type, ord) where the ordinals - * are scoped to type (not global). 
- */ -final class TermsIndexBuilder { - private final long[] countPerType = new long[TermType.NUM_TOTAL_TYPES]; - private final FSTCompiler fstCompiler = - new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()); - - TermsIndexBuilder() { - Arrays.fill(countPerType, -1); - } - - public void addTerm(BytesRef term, TermType termType) throws IOException { - IntsRefBuilder scratchInts = new IntsRefBuilder(); - long ord = ++countPerType[termType.getId()]; - fstCompiler.add(Util.toIntsRef(term, scratchInts), encode(ord, termType)); - } - - public TermsIndex build() throws IOException { - return new TermsIndex(fstCompiler.compile()); - } - - private long encode(long ord, TermType termType) { - // use a single long to encode `ord` and `termType` - // also consider the special value of `PositiveIntOutputs.NO_OUTPUT == 0` - // so it looks like this |... ord ...| termType| ... hasOutput ...| - // where termType takes 3 bit and hasOutput takes the lowest bit. The rest is taken by ord - if ( ord < 0) { - throw new IllegalArgumentException("can't encode negative ord"); - } - if ( ord > ((1L << 60) - 1) ) { - throw new IllegalArgumentException("Input ord is too large"); - } - return (ord << 4) | ((long) termType.getId() << 1) | 1L; - } -} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/radomaccess/TestTermsIndexBuilder.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/radomaccess/TestTermsIndexBuilder.java deleted file mode 100644 index b8fd67ac64cc..000000000000 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/radomaccess/TestTermsIndexBuilder.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.codecs.lucene90.radomaccess; - -import org.apache.lucene.tests.util.LuceneTestCase; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.Util; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; - -public class TestTermsIndexBuilder extends LuceneTestCase { - - public void testBasics() throws IOException { - String[] test_terms = { - "a", - "b", - "c", - "d", - "e", - "f", - "g", - "h", - "i", - "j", - }; - - Map termsToType = new HashMap<>(); - Map termsToOrd = new HashMap<>(); - Map typeCounters = new HashMap<>(); - - for (String term : test_terms) { - int termType = random().nextInt(TermType.NUM_TOTAL_TYPES); - termsToType.put(term, termType); - int ord = typeCounters.getOrDefault(termType, -1) + 1; - typeCounters.put(termType, ord); - termsToOrd.put(term, ord); - } - - TermsIndexBuilder builder = new TermsIndexBuilder(); - for (String term : test_terms) { - BytesRef termBytes = new BytesRef(term); - builder.addTerm(termBytes, TermType.fromId(termsToType.get(term))); - } - TermsIndex termsIndex = builder.build(); - - FST fst = termsIndex.fst(); - - for (String term : test_terms) { - BytesRef termBytes = new BytesRef(term); - long encoded = Util.get(fst, termBytes); - - assertEquals(1L, encoded & 0b1L); - assertEquals((long) termsToType.get(term), (encoded & 
0b1110L) >> 1); - assertEquals((long) termsToOrd.get(term), encoded >> 4); - } - - } - -} \ No newline at end of file diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index c51a25691ef2..96522ed7a5f7 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -22,6 +22,7 @@ exports org.apache.lucene.payloads; exports org.apache.lucene.sandbox.codecs.idversion; + exports org.apache.lucene.sandbox.codecs.lucene90.randomaccess; exports org.apache.lucene.sandbox.document; exports org.apache.lucene.sandbox.queries; exports org.apache.lucene.sandbox.search; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java similarity index 97% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java index 7d770ceb7f26..60c292706a30 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.codecs.lucene90.radomaccess; +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; import java.io.IOException; import org.apache.lucene.codecs.FieldsConsumer; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsReader.java similarity index 95% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsReader.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsReader.java index 7fa08663baba..d5214561bf26 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsReader.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.lucene.codecs.lucene90.radomaccess; +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; import java.io.IOException; import java.util.Iterator; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsWriter.java similarity index 94% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsWriter.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsWriter.java index 19bdf35845ee..c18a0cbbd143 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsWriter.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene90.radomaccess; +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; import java.io.IOException; import org.apache.lucene.codecs.FieldsConsumer; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java new file mode 100644 index 000000000000..d52cace8545d --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; + +import java.util.Objects; +import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; + +/** + * TermType holds the classification of a term, based on how its postings are written. + * + *

It captures -- 1) if a term has a singleton docid (i.e. only one doc contains this term). 2) + * if the term has skip data. 3) if the term as an VINT encoded position block. + */ +final class TermType { + private static final byte SINGLETON_DOC_MASK = (byte) 1; + + private static final byte HAS_SKIP_DATA_MASK = (byte) 1 << 1; + + private static final byte HAS_VINT_POSITION_BLOCK_MASK = (byte) 1 << 2; + + public static final int NUM_TOTAL_TYPES = 8; + + private final byte flag; + + private TermType(byte flag) { + this.flag = flag; + } + + int getId() { + assert this.flag >= 0 && this.flag <= 8; + return this.flag; + } + + boolean hasSingletonDoc() { + return (this.flag & SINGLETON_DOC_MASK) > 0; + } + + boolean hasSkipData() { + return (this.flag & HAS_SKIP_DATA_MASK) > 0; + } + + boolean hasVintPositionBlock() { + return (this.flag & HAS_VINT_POSITION_BLOCK_MASK) > 0; + } + + static TermType fromTermState(IntBlockTermState state) { + byte flag = 0; + if (state.singletonDocID != -1) { + flag |= SINGLETON_DOC_MASK; + } + if (state.skipOffset != -1) { + flag |= HAS_SKIP_DATA_MASK; + } + if (state.lastPosBlockOffset != -1) { + flag |= HAS_VINT_POSITION_BLOCK_MASK; + } + return new TermType(flag); + } + + static TermType fromId(int id) { + if (id < 0 || id > 8) { + throw new IllegalArgumentException("id must be within range [0, 8]"); + } + return new TermType((byte) id); + } + + @Override + public int hashCode() { + return Objects.hashCode(this.flag); + } + + @Override + public boolean equals(Object that) { + return that instanceof TermType && ((TermType) that).flag == this.flag; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndex.java similarity index 89% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndex.java rename to 
lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndex.java index 27de75bf10d6..94fce6559bc4 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndex.java @@ -15,10 +15,8 @@ * limitations under the License. */ -package org.apache.lucene.codecs.lucene90.radomaccess; +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; import org.apache.lucene.util.fst.FST; - -record TermsIndex(FST fst) { -} \ No newline at end of file +record TermsIndex(FST fst) {} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java new file mode 100644 index 000000000000..8077de7682ce --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; + +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRefBuilder; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTCompiler; +import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.fst.Util; + +/** + * Builds a term index for a given field. Logically this is a map: term -> (type, ord) where the + * ordinals are scoped to type (not global). + */ +final class TermsIndexBuilder { + private static long MAX_ORD = (1L << 60) - 1; + + private final long[] countPerType = new long[TermType.NUM_TOTAL_TYPES]; + private final FSTCompiler fstCompiler = + new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()); + + TermsIndexBuilder() { + Arrays.fill(countPerType, -1); + } + + public void addTerm(BytesRef term, TermType termType) throws IOException { + IntsRefBuilder scratchInts = new IntsRefBuilder(); + long ord = ++countPerType[termType.getId()]; + fstCompiler.add(Util.toIntsRef(term, scratchInts), encode(ord, termType)); + } + + public TermsIndex build() throws IOException { + return new TermsIndex(fstCompiler.compile()); + } + + private long encode(long ord, TermType termType) { + // use a single long to encode `ord` and `termType` + // also consider the special value of `PositiveIntOutputs.NO_OUTPUT == 0` + // so it looks like this |... ord ...| termType| ... hasOutput ...| + // where termType takes 3 bit and hasOutput takes the lowest bit. 
The rest is taken by ord + if (ord < 0) { + throw new IllegalArgumentException("can't encode negative ord"); + } + if (ord > MAX_ORD) { + throw new IllegalArgumentException( + "Input ord is too large for TermType: " + + termType.getId() + + ", max ord allowed is 2^60 - 1"); + } + return (ord << 4) | ((long) termType.getId() << 1) | 1L; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/package-info.java new file mode 100644 index 000000000000..d5cf9583f91c --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A PostingFormat that is based on {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat} + * but provides random access term dictionary. 
+ */ +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermsIndexBuilder.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermsIndexBuilder.java new file mode 100644 index 000000000000..43f4010b1ae6 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermsIndexBuilder.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.Util; + +public class TestTermsIndexBuilder extends LuceneTestCase { + + public void testBasics() throws IOException { + String[] test_terms = { + "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", + }; + + Map termsToType = new HashMap<>(); + Map termsToOrd = new HashMap<>(); + Map typeCounters = new HashMap<>(); + + for (String term : test_terms) { + int termType = random().nextInt(TermType.NUM_TOTAL_TYPES); + termsToType.put(term, termType); + int ord = typeCounters.getOrDefault(termType, -1) + 1; + typeCounters.put(termType, ord); + termsToOrd.put(term, ord); + } + + TermsIndexBuilder builder = new TermsIndexBuilder(); + for (String term : test_terms) { + BytesRef termBytes = new BytesRef(term); + builder.addTerm(termBytes, TermType.fromId(termsToType.get(term))); + } + TermsIndex termsIndex = builder.build(); + + FST fst = termsIndex.fst(); + + for (String term : test_terms) { + BytesRef termBytes = new BytesRef(term); + long encoded = Util.get(fst, termBytes); + + assertEquals(1L, encoded & 0b1L); + assertEquals((long) termsToType.get(term), (encoded & 0b1110L) >> 1); + assertEquals((long) termsToOrd.get(term), encoded >> 4); + } + } +} From b758ec57a1ffdd7d3519553a672e1d8ebe270956 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Thu, 26 Oct 2023 16:12:10 -0700 Subject: [PATCH 06/57] Add interfaces for encoding/decoding TermStates motivation: We will need to deal with encoding `IntBlockTermState` for different type of terms. Instead of having dedicated class for each term type, which would be 8 types in total, we can spell out the individual components of `IntBlockTermState`. 
Then implement a codec which works with the composition of the components. This way we can have a single implementation of the codec and construct the composition (really just array of components) per term type. --- .../lucene90/randomaccess/TermStateCodec.java | 39 ++++ .../randomaccess/TermStateCodecComponent.java | 182 ++++++++++++++++++ .../randomaccess/TermStateCodecImpl.java | 40 ++++ .../randomaccess/bitpacking/BitPacker.java | 23 +++ .../randomaccess/bitpacking/BitUnpacker.java | 23 +++ .../randomaccess/bitpacking/package-info.java | 19 ++ .../TestTermStateCodecComponent.java | 43 +++++ 7 files changed, 369 insertions(+) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPacker.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/package-info.java create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java new file mode 100644 index 000000000000..38a024c8e2b6 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; + +import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; +import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitPacker; +import org.apache.lucene.util.BytesRef; + +interface TermStateCodec { + + /** + * Encode the sequence of {@link IntBlockTermState}s with the given bitPacker + * + * @return the metadata associated with the encoded bytes + */ + byte[] encode(IntBlockTermState[] inputs, BitPacker bitPacker); + + /** + * Decode out a {@link IntBlockTermState} with provided metadata bye slice and data byte slice + * + * @return the decoded term state + */ + IntBlockTermState decode(BytesRef metadataBytes, BytesRef dataBytes); +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java new file mode 100644 index 000000000000..97ef6c9ecaeb --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java @@ -0,0 +1,182 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; + +import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; + +interface TermStateCodecComponent { + + static byte getBitWidth(IntBlockTermState[] termStates, TermStateCodecComponent component) { + assert termStates.length > 0; + + long maxValSeen = -1; + for (var termState : termStates) { + maxValSeen = Math.max(maxValSeen, component.getTargetValue(termState)); + } + return (byte) (64 - Long.numberOfLeadingZeros(maxValSeen)); + } + + boolean isMonotonicallyIncreasing(); + + long getTargetValue(IntBlockTermState termState); + + void setTargetValue(IntBlockTermState termState, long value); + + final class SingletonDocId implements TermStateCodecComponent { + @Override + public boolean isMonotonicallyIncreasing() { + return false; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.singletonDocID; + } + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + assert value <= Integer.MAX_VALUE; + // A correct codec implementation does not change the value, + // after the encode/decode round-trip it should still be a valid int + termState.singletonDocID = (int) value; + } + } + + /** 
Below are the relevant IntBlockTermState components * */ + final class DocFreq implements TermStateCodecComponent { + @Override + public boolean isMonotonicallyIncreasing() { + return false; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.docFreq; + } + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + assert value <= Integer.MAX_VALUE; + // A correct codec implementation does not change the value, + // after the encode/decode round-trip it should still be a valid int + termState.docFreq = (int) value; + } + } + + final class TotalTermFreq implements TermStateCodecComponent { + @Override + public boolean isMonotonicallyIncreasing() { + return false; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.totalTermFreq; + } + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + termState.totalTermFreq = value; + } + } + + final class DocStartFP implements TermStateCodecComponent { + @Override + public boolean isMonotonicallyIncreasing() { + return true; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.docStartFP; + } + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + termState.docStartFP = value; + } + } + + final class PositionStartFP implements TermStateCodecComponent { + @Override + public boolean isMonotonicallyIncreasing() { + return true; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.posStartFP; + } + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + termState.posStartFP = value; + } + } + + final class PayloadStartFP implements TermStateCodecComponent { + @Override + public boolean isMonotonicallyIncreasing() { + return true; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.payStartFP; + 
} + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + termState.payStartFP = value; + } + } + + final class SkipOffset implements TermStateCodecComponent { + @Override + public boolean isMonotonicallyIncreasing() { + return false; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.skipOffset; + } + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + termState.skipOffset = value; + } + } + + final class LastPositionBlockOffset implements TermStateCodecComponent { + @Override + public boolean isMonotonicallyIncreasing() { + return false; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.lastPosBlockOffset; + } + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + termState.lastPosBlockOffset = value; + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java new file mode 100644 index 000000000000..4481cb31e613 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; + +import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; +import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitPacker; +import org.apache.lucene.util.BytesRef; + +final class TermStateCodecImpl implements TermStateCodec { + private final TermStateCodecComponent[] components; + + public TermStateCodecImpl(TermStateCodecComponent[] components) { + this.components = components; + } + + @Override + public byte[] encode(IntBlockTermState[] inputs, BitPacker bitPacker) { + return new byte[0]; + } + + @Override + public IntBlockTermState decode(BytesRef metadataBytes, BytesRef dataBytes) { + return null; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPacker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPacker.java new file mode 100644 index 000000000000..3841278b7840 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPacker.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; + +public interface BitPacker { + + void add(long value, int numBits); +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java new file mode 100644 index 000000000000..8a0bd580dd91 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; + +public interface BitUnpacker { + + long unpack(byte[] data, int startBitIndex, int bitWidth); +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/package-info.java new file mode 100644 index 000000000000..866d071788ac --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/package-info.java @@ -0,0 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Code for packing and unpacking sequence of non-negative integers with smaller bit width. 
*/ +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java new file mode 100644 index 000000000000..2f3457bdfdff --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; + +import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestTermStateCodecComponent extends LuceneTestCase { + + public void testGetBitWidth() { + int expectedMaxBits = random().nextInt(31) + 1; + int bitMask = 0xFFFFFFFF >>> (32 - expectedMaxBits); + IntBlockTermState[] termStates = + random() + .ints() + .limit(100) + .mapToObj( + docFreq -> { + var x = new IntBlockTermState(); + x.docFreq = docFreq & bitMask; + return x; + }) + .toArray(IntBlockTermState[]::new); + byte bitWidth = + TermStateCodecComponent.getBitWidth(termStates, new TermStateCodecComponent.DocFreq()); + assertTrue(bitWidth <= expectedMaxBits); + } +} From 7d35ed239c2f27223b562390dcce2097efc5116d Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Fri, 27 Oct 2023 12:10:58 -0700 Subject: [PATCH 07/57] Make the concrete TermStateCodecComponents singletons --- .../randomaccess/TermStateCodecComponent.java | 58 ++++++++++++++----- .../TestTermStateCodecComponent.java | 2 +- 2 files changed, 46 insertions(+), 14 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java index 97ef6c9ecaeb..da1ee77b7ece 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java @@ -19,7 +19,7 @@ import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; -interface TermStateCodecComponent { +abstract class TermStateCodecComponent { static byte getBitWidth(IntBlockTermState[] termStates, TermStateCodecComponent component) { assert termStates.length > 0; @@ -31,13 +31,18 @@ static 
byte getBitWidth(IntBlockTermState[] termStates, TermStateCodecComponent return (byte) (64 - Long.numberOfLeadingZeros(maxValSeen)); } - boolean isMonotonicallyIncreasing(); + abstract boolean isMonotonicallyIncreasing(); - long getTargetValue(IntBlockTermState termState); + abstract long getTargetValue(IntBlockTermState termState); - void setTargetValue(IntBlockTermState termState, long value); + abstract void setTargetValue(IntBlockTermState termState, long value); + + /** Below are the relevant IntBlockTermState components * */ + static final class SingletonDocId extends TermStateCodecComponent { + public static SingletonDocId INSTANCE = new SingletonDocId(); + + private SingletonDocId() {} - final class SingletonDocId implements TermStateCodecComponent { @Override public boolean isMonotonicallyIncreasing() { return false; @@ -57,8 +62,11 @@ public void setTargetValue(IntBlockTermState termState, long value) { } } - /** Below are the relevant IntBlockTermState components * */ - final class DocFreq implements TermStateCodecComponent { + static final class DocFreq extends TermStateCodecComponent { + public static DocFreq INSTANCE = new DocFreq(); + + private DocFreq() {} + @Override public boolean isMonotonicallyIncreasing() { return false; @@ -78,7 +86,11 @@ public void setTargetValue(IntBlockTermState termState, long value) { } } - final class TotalTermFreq implements TermStateCodecComponent { + static final class TotalTermFreq extends TermStateCodecComponent { + public static TotalTermFreq INSTANCE = new TotalTermFreq(); + + private TotalTermFreq() {} + @Override public boolean isMonotonicallyIncreasing() { return false; @@ -95,7 +107,11 @@ public void setTargetValue(IntBlockTermState termState, long value) { } } - final class DocStartFP implements TermStateCodecComponent { + static final class DocStartFP extends TermStateCodecComponent { + public static DocStartFP INSTANCE = new DocStartFP(); + + private DocStartFP() {} + @Override public boolean 
isMonotonicallyIncreasing() { return true; @@ -112,7 +128,11 @@ public void setTargetValue(IntBlockTermState termState, long value) { } } - final class PositionStartFP implements TermStateCodecComponent { + static final class PositionStartFP extends TermStateCodecComponent { + public static PositionStartFP INSTANCE = new PositionStartFP(); + + private PositionStartFP() {} + @Override public boolean isMonotonicallyIncreasing() { return true; @@ -129,7 +149,11 @@ public void setTargetValue(IntBlockTermState termState, long value) { } } - final class PayloadStartFP implements TermStateCodecComponent { + static final class PayloadStartFP extends TermStateCodecComponent { + public static PayloadStartFP INSTANCE = new PayloadStartFP(); + + private PayloadStartFP() {} + @Override public boolean isMonotonicallyIncreasing() { return true; @@ -146,7 +170,11 @@ public void setTargetValue(IntBlockTermState termState, long value) { } } - final class SkipOffset implements TermStateCodecComponent { + static final class SkipOffset extends TermStateCodecComponent { + public static SkipOffset INSTANCE = new SkipOffset(); + + private SkipOffset() {} + @Override public boolean isMonotonicallyIncreasing() { return false; @@ -163,7 +191,11 @@ public void setTargetValue(IntBlockTermState termState, long value) { } } - final class LastPositionBlockOffset implements TermStateCodecComponent { + static final class LastPositionBlockOffset extends TermStateCodecComponent { + public static LastPositionBlockOffset INSTANCE = new LastPositionBlockOffset(); + + private LastPositionBlockOffset() {} + @Override public boolean isMonotonicallyIncreasing() { return false; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java index 2f3457bdfdff..ab2bebf252a2 100644 --- 
a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java @@ -37,7 +37,7 @@ public void testGetBitWidth() { }) .toArray(IntBlockTermState[]::new); byte bitWidth = - TermStateCodecComponent.getBitWidth(termStates, new TermStateCodecComponent.DocFreq()); + TermStateCodecComponent.getBitWidth(termStates, TermStateCodecComponent.DocFreq.INSTANCE); assertTrue(bitWidth <= expectedMaxBits); } } From 6a1506b80668188ec08728ac5981bb64ffd2e267 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 30 Oct 2023 14:45:34 -0700 Subject: [PATCH 08/57] Fix the expected export module check --- lucene/sandbox/src/java/module-info.java | 1 + 1 file changed, 1 insertion(+) diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index 96522ed7a5f7..59331969cce1 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -23,6 +23,7 @@ exports org.apache.lucene.payloads; exports org.apache.lucene.sandbox.codecs.idversion; exports org.apache.lucene.sandbox.codecs.lucene90.randomaccess; + exports org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; exports org.apache.lucene.sandbox.document; exports org.apache.lucene.sandbox.queries; exports org.apache.lucene.sandbox.search; From e06f30362e8be8e8d750ceb4eccd0b9a82242161 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 30 Oct 2023 14:46:14 -0700 Subject: [PATCH 09/57] Implment TermStateCodecComponent.getBitWidth for monotonically increasing values --- .../randomaccess/TermStateCodecComponent.java | 5 ++- .../TestTermStateCodecComponent.java | 39 +++++++++++++++++-- 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java index da1ee77b7ece..9d93f40dc4b0 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java @@ -25,8 +25,11 @@ static byte getBitWidth(IntBlockTermState[] termStates, TermStateCodecComponent assert termStates.length > 0; long maxValSeen = -1; + long referenceValue = + component.isMonotonicallyIncreasing() ? component.getTargetValue(termStates[0]) : 0; + for (var termState : termStates) { - maxValSeen = Math.max(maxValSeen, component.getTargetValue(termState)); + maxValSeen = Math.max(maxValSeen, component.getTargetValue(termState) - referenceValue); } return (byte) (64 - Long.numberOfLeadingZeros(maxValSeen)); } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java index ab2bebf252a2..862996fb6c30 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java @@ -17,6 +17,7 @@ package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +import java.util.stream.LongStream; import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; import org.apache.lucene.tests.util.LuceneTestCase; @@ -25,19 +26,49 @@ public class TestTermStateCodecComponent extends LuceneTestCase { public void testGetBitWidth() { int expectedMaxBits = random().nextInt(31) + 1; int bitMask = 0xFFFFFFFF >>> (32 - expectedMaxBits); + int highestBit = (bitMask >>> 1) + 1; + IntBlockTermState[] termStates = random() - .ints() - 
.limit(100) + .ints(256) .mapToObj( docFreq -> { var x = new IntBlockTermState(); - x.docFreq = docFreq & bitMask; + x.docFreq = (docFreq & bitMask) | highestBit; return x; }) .toArray(IntBlockTermState[]::new); + byte bitWidth = TermStateCodecComponent.getBitWidth(termStates, TermStateCodecComponent.DocFreq.INSTANCE); - assertTrue(bitWidth <= expectedMaxBits); + assertEquals(expectedMaxBits, bitWidth); + } + + public void testGetBitWidthWithIncreasingValues() { + long baseValue = random().nextLong(Long.MAX_VALUE >> 1); + int expectedMaxBits = random().nextInt(63) + 1; + long bitMask = 0xFFFFFFFF_FFFFFFFFL >>> (64 - expectedMaxBits); + long highestBit = (bitMask >>> 1) + 1; + + var randomLongs = + random() + .longs(256, 0, Long.MAX_VALUE - baseValue) + .map(x -> baseValue + ((x & bitMask) | highestBit)) + .sorted(); + + IntBlockTermState[] termStates = + LongStream.concat(LongStream.of(baseValue), randomLongs) + .mapToObj( + docStartFP -> { + var x = new IntBlockTermState(); + x.docStartFP = docStartFP; + return x; + }) + .toArray(IntBlockTermState[]::new); + + byte bitWidth = + TermStateCodecComponent.getBitWidth( + termStates, TermStateCodecComponent.DocStartFP.INSTANCE); + assertEquals(expectedMaxBits, bitWidth); } } From ea2c76f6ea1a7fa88ca83e0749584f4504974747 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Wed, 1 Nov 2023 21:13:53 -0700 Subject: [PATCH 10/57] Implement a codec (not Lucene Codec) for IntBlockTermState TermStateCodecImpl implements TermStateCodec which supports encoding a block of IntBlockTermState and decoding within that block at a given index. 
--- .../lucene90/randomaccess/TermStateCodec.java | 12 +- .../randomaccess/TermStateCodecImpl.java | 116 ++++++++++++++++- .../randomaccess/bitpacking/BitUnpacker.java | 4 +- .../randomaccess/TestTermStateCodecImpl.java | 120 ++++++++++++++++++ 4 files changed, 245 insertions(+), 7 deletions(-) create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java index 38a024c8e2b6..7d1cb0dd6ae6 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java @@ -19,21 +19,27 @@ import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitPacker; +import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitUnpacker; import org.apache.lucene.util.BytesRef; interface TermStateCodec { /** - * Encode the sequence of {@link IntBlockTermState}s with the given bitPacker + * Encode the sequence of {@link IntBlockTermState}s with the given bitPacker into a block of + * bytes. * * @return the metadata associated with the encoded bytes */ byte[] encode(IntBlockTermState[] inputs, BitPacker bitPacker); /** - * Decode out a {@link IntBlockTermState} with provided metadata bye slice and data byte slice + * Decode out a {@link IntBlockTermState} with the provided bit-unpacker, metadata byte slice and + * data byte slice, at the given index within an encoded block. + * + *

Note: This method expects the dataBytes contains the bytes for the whole block. * * @return the decoded term state */ - IntBlockTermState decode(BytesRef metadataBytes, BytesRef dataBytes); + IntBlockTermState decodeWithinBlock( + BytesRef metadataBytes, BytesRef dataBytes, BitUnpacker bitUnpacker, int index); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java index 4481cb31e613..5bd3730ed3fa 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java @@ -19,22 +19,132 @@ import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitPacker; +import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitUnpacker; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.util.BytesRef; final class TermStateCodecImpl implements TermStateCodec { private final TermStateCodecComponent[] components; + private final int metadataBytesLength; + + private static int getMetadataLength(TermStateCodecComponent component) { + // 1 byte for bitWidth; optionally 8 byte more for the reference value + return 1 + (component.isMonotonicallyIncreasing() ? 
8 : 0); + } public TermStateCodecImpl(TermStateCodecComponent[] components) { + assert components.length > 0; + this.components = components; + int metadataBytesLength = 0; + for (var component : components) { + metadataBytesLength += getMetadataLength(component); + } + this.metadataBytesLength = metadataBytesLength; } @Override public byte[] encode(IntBlockTermState[] inputs, BitPacker bitPacker) { - return new byte[0]; + Metadata[] metadataPerComponent = getMetadataPerComponent(inputs); + byte[] metadataBytes = serializeMetadata(metadataPerComponent); + + // Encode inputs via the bitpacker + for (var termState : inputs) { + encodeOne(bitPacker, termState, metadataPerComponent); + } + + return metadataBytes; + } + + private Metadata[] getMetadataPerComponent(IntBlockTermState[] inputs) { + Metadata[] metadataPerComponent = new Metadata[components.length]; + for (int i = 0; i < components.length; i++) { + var component = components[i]; + byte bitWidth = TermStateCodecComponent.getBitWidth(inputs, component); + long referenceValue = + component.isMonotonicallyIncreasing() ? 
component.getTargetValue(inputs[0]) : 0L; + metadataPerComponent[i] = new Metadata(bitWidth, referenceValue); + } + return metadataPerComponent; + } + + private byte[] serializeMetadata(Metadata[] metadataPerComponent) { + byte[] metadataBytes = new byte[this.metadataBytesLength]; + ByteArrayDataOutput dataOut = new ByteArrayDataOutput(metadataBytes); + + for (int i = 0; i < components.length; i++) { + var metadata = metadataPerComponent[i]; + dataOut.writeByte(metadata.bitWidth); + if (components[i].isMonotonicallyIncreasing()) { + dataOut.writeLong(metadata.referenceValue); + } + } + return metadataBytes; + } + + private void encodeOne( + BitPacker bitPacker, IntBlockTermState termState, Metadata[] metadataPerComponent) { + for (int i = 0; i < components.length; i++) { + var component = components[i]; + var metadata = metadataPerComponent[i]; + long valToEncode = component.getTargetValue(termState) - metadata.referenceValue; + bitPacker.add(valToEncode, metadata.bitWidth); + } } @Override - public IntBlockTermState decode(BytesRef metadataBytes, BytesRef dataBytes) { - return null; + public IntBlockTermState decodeWithinBlock( + BytesRef metadataBytes, BytesRef dataBytes, BitUnpacker bitUnpacker, int index) { + assert metadataBytes.length == this.metadataBytesLength; + + var metadata = deserializedMetadata(metadataBytes); + + int startBitIndex = index * metadata.totalBitsPerTermState; + return extract(dataBytes, bitUnpacker, startBitIndex, metadata.metadataPerComponent); } + + private MetadataAndTotalBitsPerTermState deserializedMetadata(BytesRef metadataBytes) { + Metadata[] metadataPerComponent = new Metadata[components.length]; + ByteArrayDataInput byteArrayDataInput = + new ByteArrayDataInput(metadataBytes.bytes, metadataBytes.offset, metadataBytes.length); + int totalBitsPerTermState = 0; + for (int i = 0; i < components.length; i++) { + var component = components[i]; + byte bitWidth = byteArrayDataInput.readByte(); + long referenceValue = -1; + if 
(component.isMonotonicallyIncreasing()) { + referenceValue = byteArrayDataInput.readLong(); + } + metadataPerComponent[i] = new Metadata(bitWidth, referenceValue); + + totalBitsPerTermState += bitWidth; + } + + return new MetadataAndTotalBitsPerTermState(metadataPerComponent, totalBitsPerTermState); + } + + private IntBlockTermState extract( + BytesRef dataBytes, + BitUnpacker bitUnpacker, + int startBitIndex, + Metadata[] metadataPerComponent) { + IntBlockTermState decoded = new IntBlockTermState(); + for (int i = 0; i < components.length; i++) { + var component = components[i]; + var metadata = metadataPerComponent[i]; + long val = bitUnpacker.unpack(dataBytes, startBitIndex, metadata.bitWidth); + if (metadata.referenceValue > 0) { + val += metadata.referenceValue; + } + component.setTargetValue(decoded, val); + startBitIndex += metadata.bitWidth; + } + return decoded; + } + + private record Metadata(byte bitWidth, long referenceValue) {} + + private record MetadataAndTotalBitsPerTermState( + Metadata[] metadataPerComponent, int totalBitsPerTermState) {} } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java index 8a0bd580dd91..35fc1612790a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java @@ -17,7 +17,9 @@ package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; +import org.apache.lucene.util.BytesRef; + public interface BitUnpacker { - long unpack(byte[] data, int startBitIndex, int bitWidth); + long unpack(BytesRef bytesRef, int startBitIndex, int bitWidth); } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java 
b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java new file mode 100644 index 000000000000..92e2700c74b2 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; + +import java.util.ArrayList; +import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; +import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitPacker; +import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitUnpacker; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; + +public class TestTermStateCodecImpl extends LuceneTestCase { + + public void testEncodeDecode() { + TermStateCodecImpl codec = + new TermStateCodecImpl( + new TermStateCodecComponent[] { + TermStateCodecComponent.DocFreq.INSTANCE, TermStateCodecComponent.DocStartFP.INSTANCE, + }); + + ArrayList termStates = new ArrayList<>(); + long maxDocFreqSeen = -1; + long docStartFPBase = random().nextLong(Long.MAX_VALUE >> 1); + long maxDocStartFPDeltaSeen = -1; + for (int i = 0; i < random().nextInt(2, 256); i++) { + var termState = new IntBlockTermState(); + termState.docFreq = random().nextInt(1, Integer.MAX_VALUE); + if (i == 0) { + termState.docStartFP = docStartFPBase; + } else { + termState.docStartFP = termStates.get(i - 1).docStartFP + random().nextLong(1024); + maxDocStartFPDeltaSeen = + Math.max(maxDocStartFPDeltaSeen, termState.docStartFP - docStartFPBase); + } + maxDocFreqSeen = Math.max(maxDocFreqSeen, termState.docFreq); + termStates.add(termState); + } + + IntBlockTermState[] termStatesArray = termStates.toArray(IntBlockTermState[]::new); + + BitPerBytePacker bitPerBytePacker = new BitPerBytePacker(); + byte[] metadata = codec.encode(termStatesArray, bitPerBytePacker); + + // For the metadata, we expect + // 0: DocFreq.bitWidth, + // 1: DocStartFP.bitWidth, + // [2-10]: DocStartFP.referenceValue; + assertEquals(10, metadata.length); + assertEquals(64 - Long.numberOfLeadingZeros(maxDocFreqSeen), metadata[0]); + assertEquals(64 - 
Long.numberOfLeadingZeros(maxDocStartFPDeltaSeen), metadata[1]); + ByteArrayDataInput byteArrayDataInput = new ByteArrayDataInput(metadata, 2, 8); + assertEquals(docStartFPBase, byteArrayDataInput.readLong()); + + // Assert that each term state is the same after the encode-decode roundtrip. + BytesRef metadataBytes = new BytesRef(metadata); + BytesRef dataBytes = new BytesRef(bitPerBytePacker.getBytes()); + for (int i = 0; i < termStatesArray.length; i++) { + IntBlockTermState decoded = + codec.decodeWithinBlock(metadataBytes, dataBytes, bitPerBytePacker, i); + assertEquals(termStatesArray[i].docFreq, decoded.docFreq); + assertEquals(termStatesArray[i].docStartFP, decoded.docStartFP); + } + } +} + +/** + * A wasteful bit packer that use whole byte to keep a bit. Useful for tests. It uses little-endian + * bit order. + */ +class BitPerBytePacker implements BitPacker, BitUnpacker { + private final ArrayList buffer = new ArrayList<>(); + + private int totalNumBits = 0; + + @Override + public void add(long value, int numBits) { + assert numBits < 64; + totalNumBits += numBits; + while (numBits-- > 0) { + byte b = (byte) (value & 1L); + value = value >>> 1; + buffer.add(b); + } + } + + public byte[] getBytes() { + byte[] bytes = new byte[totalNumBits]; + int index = 0; + for (var b : buffer) { + bytes[index++] = b; + } + + return bytes; + } + + @Override + public long unpack(BytesRef bytesRef, int startBitIndex, int bitWidth) { + long res = 0; + for (int i = 0; i < bitWidth; i++) { + res |= ((long) (bytesRef.bytes[bytesRef.offset + startBitIndex + i] & 1)) << i; + } + return res; + } +} From c87713c47577ba435736e9b62e84ba58ce7ec024 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Thu, 2 Nov 2023 11:31:23 -0700 Subject: [PATCH 11/57] Add more javadoc and minor re-naming --- .../codecs/lucene90/randomaccess/TermStateCodec.java | 6 ++++-- .../codecs/lucene90/randomaccess/TermStateCodecImpl.java | 2 +- .../codecs/lucene90/randomaccess/bitpacking/BitPacker.java | 2 ++ 
.../lucene90/randomaccess/bitpacking/BitUnpacker.java | 2 ++ .../lucene90/randomaccess/TestTermStateCodecImpl.java | 2 +- 5 files changed, 10 insertions(+), 4 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java index 7d1cb0dd6ae6..9b48c00cd54d 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java @@ -30,13 +30,15 @@ interface TermStateCodec { * * @return the metadata associated with the encoded bytes */ - byte[] encode(IntBlockTermState[] inputs, BitPacker bitPacker); + byte[] encodeBlock(IntBlockTermState[] inputs, BitPacker bitPacker); /** * Decode out a {@link IntBlockTermState} with the provided bit-unpacker, metadata byte slice and * data byte slice, at the given index within an encoded block. * - *

Note: This method expects the dataBytes contains the bytes for the whole block. + *

Note: This method expects dataBytes that starts at the start of the block. Also, dataBytes + * should contain enough bytes (but not necessarily the whole block) to decode at the term state + * at `index`. * * @return the decoded term state */ diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java index 5bd3730ed3fa..0e55b1235fa7 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java @@ -45,7 +45,7 @@ public TermStateCodecImpl(TermStateCodecComponent[] components) { } @Override - public byte[] encode(IntBlockTermState[] inputs, BitPacker bitPacker) { + public byte[] encodeBlock(IntBlockTermState[] inputs, BitPacker bitPacker) { Metadata[] metadataPerComponent = getMetadataPerComponent(inputs); byte[] metadataBytes = serializeMetadata(metadataPerComponent); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPacker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPacker.java index 3841278b7840..a1828c69a032 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPacker.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPacker.java @@ -17,7 +17,9 @@ package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; +/** Interface for bit-packing */ public interface BitPacker { + /** Pack the low `numBits` bits of `value` */ void add(long value, int numBits); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java index 35fc1612790a..7c9448d893b5 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java @@ -19,7 +19,9 @@ import org.apache.lucene.util.BytesRef; +/** Interface for bit-unpacking */ public interface BitUnpacker { + /** Unpack a long in the given bytesRef from a range of bits. */ long unpack(BytesRef bytesRef, int startBitIndex, int bitWidth); } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java index 92e2700c74b2..9d7bdff06b3f 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java @@ -55,7 +55,7 @@ public void testEncodeDecode() { IntBlockTermState[] termStatesArray = termStates.toArray(IntBlockTermState[]::new); BitPerBytePacker bitPerBytePacker = new BitPerBytePacker(); - byte[] metadata = codec.encode(termStatesArray, bitPerBytePacker); + byte[] metadata = codec.encodeBlock(termStatesArray, bitPerBytePacker); // For the metadata, we expect // 0: DocFreq.bitWidth, From 322a0f0eca201861c43b3c5cb48658ac7251cc94 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Thu, 2 Nov 2023 13:53:47 -0700 Subject: [PATCH 12/57] TestTermStateCodecImpl to decode at non-block starting positions --- .../lucene90/randomaccess/TermStateCodec.java | 20 +++++++++++++++++++ .../randomaccess/TermStateCodecImpl.java | 9 +++++++++ .../randomaccess/TestTermStateCodecImpl.java | 19 ++++++++++++++++-- 3 files changed, 46 insertions(+), 2 deletions(-) diff --git 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java index 9b48c00cd54d..a203bdc180e0 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java @@ -44,4 +44,24 @@ interface TermStateCodec { */ IntBlockTermState decodeWithinBlock( BytesRef metadataBytes, BytesRef dataBytes, BitUnpacker bitUnpacker, int index); + + /** + * Decode out a {@link IntBlockTermState} with the provided bit-unpacker, metadata byte slice and + * data byte slice, starting at `startBitIndex`. + * + *

Note: The dataBytes should contain enough bits to decode out the term state. Passing more + * bytes than needed is fine but excessive ones are not used. + * + *

e.g. we want to decode a term state which contains value x, y and z, that has 18 bits in + * total. Assume x takes 4 bit, y takes 4 bit and z takes 10 bits. + * + *

Here is the visualization when we decode with startBitIndex=7 + * + *

+   *     Note: little-endian bit order
+   *     [x.......][zyyyyxxx][zzzzzzzz][.......z]
+   * 
+ */ + IntBlockTermState decodeAt( + BytesRef metadataBytes, BytesRef dataBytes, BitUnpacker bitUnpacker, int startBitIndex); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java index 0e55b1235fa7..eea9e1b149a8 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java @@ -104,6 +104,15 @@ public IntBlockTermState decodeWithinBlock( return extract(dataBytes, bitUnpacker, startBitIndex, metadata.metadataPerComponent); } + @Override + public IntBlockTermState decodeAt( + BytesRef metadataBytes, BytesRef dataBytes, BitUnpacker bitUnpacker, int startBitIndex) { + assert metadataBytes.length == this.metadataBytesLength; + + var metadata = deserializedMetadata(metadataBytes); + return extract(dataBytes, bitUnpacker, startBitIndex, metadata.metadataPerComponent); + } + private MetadataAndTotalBitsPerTermState deserializedMetadata(BytesRef metadataBytes) { Metadata[] metadataPerComponent = new Metadata[components.length]; ByteArrayDataInput byteArrayDataInput = diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java index 9d7bdff06b3f..6be829d621ff 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java @@ -61,9 +61,11 @@ public void testEncodeDecode() { // 0: DocFreq.bitWidth, // 1: DocStartFP.bitWidth, // [2-10]: DocStartFP.referenceValue; + int expectedDocFreqBitWidth = 64 - 
Long.numberOfLeadingZeros(maxDocFreqSeen); + int expectedDocStartFPBitWidth = 64 - Long.numberOfLeadingZeros(maxDocStartFPDeltaSeen); assertEquals(10, metadata.length); - assertEquals(64 - Long.numberOfLeadingZeros(maxDocFreqSeen), metadata[0]); - assertEquals(64 - Long.numberOfLeadingZeros(maxDocStartFPDeltaSeen), metadata[1]); + assertEquals(expectedDocFreqBitWidth, metadata[0]); + assertEquals(expectedDocStartFPBitWidth, metadata[1]); ByteArrayDataInput byteArrayDataInput = new ByteArrayDataInput(metadata, 2, 8); assertEquals(docStartFPBase, byteArrayDataInput.readLong()); @@ -76,6 +78,19 @@ public void testEncodeDecode() { assertEquals(termStatesArray[i].docFreq, decoded.docFreq); assertEquals(termStatesArray[i].docStartFP, decoded.docStartFP); } + + // Also test decoding that doesn't begin at the start of the block. + int pos = random().nextInt(termStatesArray.length); + int startBitIndex = random().nextInt(pos); + dataBytes = + new BytesRef( + bitPerBytePacker.getBytes(), + pos * (expectedDocFreqBitWidth + expectedDocStartFPBitWidth) - startBitIndex, + expectedDocFreqBitWidth + expectedDocStartFPBitWidth); + IntBlockTermState decoded = + codec.decodeAt(metadataBytes, dataBytes, bitPerBytePacker, startBitIndex); + assertEquals(termStatesArray[pos].docFreq, decoded.docFreq); + assertEquals(termStatesArray[pos].docStartFP, decoded.docStartFP); } } From 0976ce769cfc264f39a006f1e35071dcb9aa3cab Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 6 Nov 2023 12:15:47 -0800 Subject: [PATCH 13/57] Implement compact BitUnpacker --- .../bitpacking/BitUnpackerImpl.java | 79 +++++++++++++++ .../randomaccess/TestTermStateCodecImpl.java | 99 +++++++++---------- .../bitpacking/BitPerBytePacker.java | 89 +++++++++++++++++ .../bitpacking/TestBitUnpackerImpl.java | 62 ++++++++++++ 4 files changed, 278 insertions(+), 51 deletions(-) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpackerImpl.java create mode 100644 
lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPerBytePacker.java create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/TestBitUnpackerImpl.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpackerImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpackerImpl.java new file mode 100644 index 000000000000..b4cfd54f584a --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpackerImpl.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; + +import org.apache.lucene.util.BytesRef; + +public class BitUnpackerImpl implements BitUnpacker { + public static BitUnpackerImpl INSTANCE = new BitUnpackerImpl(); + + private BitUnpackerImpl() {} + + @Override + public long unpack(BytesRef bytesRef, int startBitIndex, int bitWidth) { + assert (startBitIndex + bitWidth) <= bytesRef.length * 8; + assert bitWidth < 64; + + int firstByteIndex = startBitIndex / 8; + int numBitsToExcludeInFirstByte = startBitIndex % 8; + int lastByteIndex = (startBitIndex + bitWidth) / 8; + int numBitsToKeepInLastByte = (startBitIndex + bitWidth) % 8; + + /* + * idea: there are two cases + * (1) when the requests bits are within the same byte; e.g. startBitIndex = 1, bitWidth = 5 + * (2) when the requests bits span across many bytes; e.g. startBitIndex = 1, bitWidth = 15 + * For (1) it is trivial, + * for (2) we can + * (2.1) read first partial bytes + * (2.2) read full bytes for those whose index is in (first, last), exclusive. + * (2.3) read the last partial bytes ( can be empty ) + */ + + // case (1) + if (firstByteIndex == lastByteIndex) { + long res = Byte.toUnsignedLong(bytesRef.bytes[bytesRef.offset + firstByteIndex]); + res &= (1L << numBitsToKeepInLastByte) - 1; + res >>>= numBitsToExcludeInFirstByte; + return res; + } + + // case (2) + long res = 0; + int totalNumBitsRead = 0; + // (2.1) read first partial bytes + res |= + Byte.toUnsignedLong(bytesRef.bytes[bytesRef.offset + firstByteIndex]) + >>> numBitsToExcludeInFirstByte; + totalNumBitsRead += 8 - numBitsToExcludeInFirstByte; + // (2.2) read full bytes for whose index is in (first, last), exclusive. 
+ for (int byteIndex = firstByteIndex + 1; byteIndex < lastByteIndex; byteIndex++) { + res |= Byte.toUnsignedLong(bytesRef.bytes[bytesRef.offset + byteIndex]) << totalNumBitsRead; + totalNumBitsRead += 8; + } + // (2.3) read the last partial bytes ( can be empty ) + if (numBitsToKeepInLastByte > 0) { + long partial = + Byte.toUnsignedLong(bytesRef.bytes[bytesRef.offset + lastByteIndex]) + & ((1L << numBitsToKeepInLastByte) - 1); + res |= partial << totalNumBitsRead; + } + + return res; + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java index 6be829d621ff..298c4a4a419d 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java @@ -19,8 +19,9 @@ import java.util.ArrayList; import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; -import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitPacker; +import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitPerBytePacker; import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitUnpacker; +import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitUnpackerImpl; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; @@ -40,7 +41,7 @@ public void testEncodeDecode() { long maxDocStartFPDeltaSeen = -1; for (int i = 0; i < random().nextInt(2, 256); i++) { var termState = new IntBlockTermState(); - termState.docFreq = random().nextInt(1, Integer.MAX_VALUE); + termState.docFreq = random().nextInt(1, 1 << random().nextInt(1, 31)); if (i == 0) { termState.docStartFP = docStartFPBase; } else { @@ 
-72,64 +73,60 @@ public void testEncodeDecode() { // Assert that each term state is the same after the encode-decode roundtrip. BytesRef metadataBytes = new BytesRef(metadata); BytesRef dataBytes = new BytesRef(bitPerBytePacker.getBytes()); - for (int i = 0; i < termStatesArray.length; i++) { - IntBlockTermState decoded = - codec.decodeWithinBlock(metadataBytes, dataBytes, bitPerBytePacker, i); - assertEquals(termStatesArray[i].docFreq, decoded.docFreq); - assertEquals(termStatesArray[i].docStartFP, decoded.docStartFP); - } + assertBlockRoundTrip(termStatesArray, codec, metadataBytes, dataBytes, bitPerBytePacker); + + // With real compact bits instead of bit-per-byte + dataBytes = new BytesRef(bitPerBytePacker.getCompactBytes()); + assertBlockRoundTrip( + termStatesArray, codec, metadataBytes, dataBytes, BitUnpackerImpl.INSTANCE); // Also test decoding that doesn't begin at the start of the block. int pos = random().nextInt(termStatesArray.length); int startBitIndex = random().nextInt(pos); + int recordSize = expectedDocFreqBitWidth + expectedDocStartFPBitWidth; + // With bit-per-byte bytes dataBytes = - new BytesRef( - bitPerBytePacker.getBytes(), - pos * (expectedDocFreqBitWidth + expectedDocStartFPBitWidth) - startBitIndex, - expectedDocFreqBitWidth + expectedDocStartFPBitWidth); - IntBlockTermState decoded = - codec.decodeAt(metadataBytes, dataBytes, bitPerBytePacker, startBitIndex); - assertEquals(termStatesArray[pos].docFreq, decoded.docFreq); - assertEquals(termStatesArray[pos].docStartFP, decoded.docStartFP); + new BytesRef(bitPerBytePacker.getBytes(), pos * recordSize - startBitIndex, recordSize); + assertDecodeAt( + codec, metadataBytes, dataBytes, bitPerBytePacker, startBitIndex, termStatesArray[pos]); + + // With compact bytes + int startByteIndex = pos * recordSize / 8; + int endByteIndex = (pos + 1) * recordSize / 8; + int length = endByteIndex - startByteIndex + ((pos + 1) * recordSize % 8 == 0 ? 
0 : 1); + dataBytes = new BytesRef(bitPerBytePacker.getCompactBytes(), startByteIndex, length); + assertDecodeAt( + codec, + metadataBytes, + dataBytes, + BitUnpackerImpl.INSTANCE, + (pos * recordSize) % 8, + termStatesArray[pos]); } -} - -/** - * A wasteful bit packer that use whole byte to keep a bit. Useful for tests. It uses little-endian - * bit order. - */ -class BitPerBytePacker implements BitPacker, BitUnpacker { - private final ArrayList buffer = new ArrayList<>(); - - private int totalNumBits = 0; - @Override - public void add(long value, int numBits) { - assert numBits < 64; - totalNumBits += numBits; - while (numBits-- > 0) { - byte b = (byte) (value & 1L); - value = value >>> 1; - buffer.add(b); - } - } - - public byte[] getBytes() { - byte[] bytes = new byte[totalNumBits]; - int index = 0; - for (var b : buffer) { - bytes[index++] = b; - } - - return bytes; + private static void assertDecodeAt( + TermStateCodecImpl codec, + BytesRef metadataBytes, + BytesRef dataBytes, + BitUnpacker bitUnpacker, + int startBitIndex, + IntBlockTermState termState) { + IntBlockTermState decoded = + codec.decodeAt(metadataBytes, dataBytes, bitUnpacker, startBitIndex); + assertEquals(termState.docFreq, decoded.docFreq); + assertEquals(termState.docStartFP, decoded.docStartFP); } - @Override - public long unpack(BytesRef bytesRef, int startBitIndex, int bitWidth) { - long res = 0; - for (int i = 0; i < bitWidth; i++) { - res |= ((long) (bytesRef.bytes[bytesRef.offset + startBitIndex + i] & 1)) << i; + private static void assertBlockRoundTrip( + IntBlockTermState[] termStatesArray, + TermStateCodecImpl codec, + BytesRef metadataBytes, + BytesRef dataBytes, + BitUnpacker bitUnpacker) { + for (int i = 0; i < termStatesArray.length; i++) { + IntBlockTermState decoded = codec.decodeWithinBlock(metadataBytes, dataBytes, bitUnpacker, i); + assertEquals(termStatesArray[i].docFreq, decoded.docFreq); + assertEquals(termStatesArray[i].docStartFP, decoded.docStartFP); } - return res; 
} } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPerBytePacker.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPerBytePacker.java new file mode 100644 index 000000000000..a1a972d5ceb7 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPerBytePacker.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; + +import java.util.ArrayList; +import org.apache.lucene.util.BytesRef; + +/** + * A wasteful bit packer that use whole byte to keep a bit. Useful for tests. It uses little-endian + * bit order. 
+ */ +public class BitPerBytePacker implements BitPacker, BitUnpacker { + private final ArrayList buffer = new ArrayList<>(); + + private int totalNumBits = 0; + + @Override + public void add(long value, int numBits) { + assert numBits < 64; + totalNumBits += numBits; + while (numBits-- > 0) { + byte b = (byte) (value & 1L); + value = value >>> 1; + buffer.add(b); + } + } + + public byte[] getBytes() { + byte[] bytes = new byte[totalNumBits]; + int index = 0; + for (var b : buffer) { + bytes[index++] = b; + } + + return bytes; + } + + public byte[] getCompactBytes() { + int len = (totalNumBits - 1) / 8 + 1; // round up + byte[] bytes = new byte[len]; + + int remainingBits = totalNumBits; + int pos = 0; + while (remainingBits >= 8) { + byte b = 0; + int base = pos * 8; + for (int i = 0; i < 8; i++) { + b |= (byte) ((buffer.get(base + i) & 1) << i); + } + bytes[pos++] = b; + remainingBits -= 8; + } + + if (remainingBits > 0) { + byte b = 0; + int base = pos * 8; + for (int i = 0; i < remainingBits; i++) { + b |= (byte) ((buffer.get(base + i) & 1) << i); + } + bytes[pos] = b; + } + + return bytes; + } + + @Override + public long unpack(BytesRef bytesRef, int startBitIndex, int bitWidth) { + long res = 0; + for (int i = 0; i < bitWidth; i++) { + res |= ((long) (bytesRef.bytes[bytesRef.offset + startBitIndex + i] & 1)) << i; + } + return res; + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/TestBitUnpackerImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/TestBitUnpackerImpl.java new file mode 100644 index 000000000000..7c493a661da1 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/TestBitUnpackerImpl.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; + +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; + +public class TestBitUnpackerImpl extends LuceneTestCase { + + public void testUnpackBasics() { + byte[] bytes = new byte[] {0x21, 0x43, 0x65, (byte) 0x87, (byte) 0xA9}; + BytesRef bytesRef = new BytesRef(bytes); + + for (int i = 1; i <= 10; i++) { + long val = BitUnpackerImpl.INSTANCE.unpack(bytesRef, (i - 1) * 4, 4); + assertEquals((long) i, val); + } + } + + public void testRandom() { + ValueAndBitWidth[] expected = + random() + .longs(1000, 0, Long.MAX_VALUE) + .mapToObj( + val -> { + int bitWidth = random().nextInt(1, 64); + val &= (1L << bitWidth) - 1; + return new ValueAndBitWidth(val, bitWidth); + }) + .toArray(ValueAndBitWidth[]::new); + + BitPerBytePacker referencePacker = new BitPerBytePacker(); + for (var x : expected) { + referencePacker.add(x.value, x.bitWidth); + } + + BytesRef bytes = new BytesRef(referencePacker.getCompactBytes()); + int startBitIndex = 0; + for (var x : expected) { + long unpacked = BitUnpackerImpl.INSTANCE.unpack(bytes, startBitIndex, x.bitWidth); + startBitIndex += x.bitWidth; + assertEquals(x.value, unpacked); + } + } + + private record ValueAndBitWidth(long value, 
int bitWidth) {} +} From a90f6085facf364487d479c08497dcac3962043e Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 6 Nov 2023 12:22:00 -0800 Subject: [PATCH 14/57] Fix typo and improve error reporting For those classes * TermType * TermsIndexBuilder --- .../sandbox/codecs/lucene90/randomaccess/TermType.java | 2 +- .../codecs/lucene90/randomaccess/TermsIndexBuilder.java | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java index d52cace8545d..793850a931fb 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java @@ -24,7 +24,7 @@ * TermType holds the classification of a term, based on how its postings are written. * *

It captures -- 1) if a term has a singleton docid (i.e. only one doc contains this term). 2) - * if the term has skip data. 3) if the term as an VINT encoded position block. + * if the term has skip data. 3) if the term has an VINT encoded position block. */ final class TermType { private static final byte SINGLETON_DOC_MASK = (byte) 1; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java index 8077de7682ce..1fea443c7c16 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java @@ -57,11 +57,13 @@ private long encode(long ord, TermType termType) { // so it looks like this |... ord ...| termType| ... hasOutput ...| // where termType takes 3 bit and hasOutput takes the lowest bit. 
The rest is taken by ord if (ord < 0) { - throw new IllegalArgumentException("can't encode negative ord"); + throw new IllegalArgumentException("can't encode negative ord: " + ord); } if (ord > MAX_ORD) { throw new IllegalArgumentException( - "Input ord is too large for TermType: " + "Input ord " + + ord + + " is too large for TermType: " + termType.getId() + ", max ord allowed is 2^60 - 1"); } From a5160abdcdbfbd00ce664c23af207971c5e98ac3 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 6 Nov 2023 14:19:23 -0800 Subject: [PATCH 15/57] Rename the module from lucene90.* to lucene99.* to reflect upstream Codec change --- lucene/sandbox/src/java/module-info.java | 4 ++-- ...RandomAccessDictionaryPostingsFormat.java} | 22 +++++++++---------- .../Lucene99RandomAccessTermsReader.java} | 4 ++-- .../Lucene99RandomAccessTermsWriter.java} | 4 ++-- .../randomaccess/TermStateCodec.java | 8 +++---- .../randomaccess/TermStateCodecComponent.java | 4 ++-- .../randomaccess/TermStateCodecImpl.java | 8 +++---- .../randomaccess/TermType.java | 4 ++-- .../randomaccess/TermsIndex.java | 2 +- .../randomaccess/TermsIndexBuilder.java | 2 +- .../randomaccess/bitpacking/BitPacker.java | 2 +- .../randomaccess/bitpacking/BitUnpacker.java | 2 +- .../bitpacking/BitUnpackerImpl.java | 2 +- .../randomaccess/bitpacking/package-info.java | 2 +- .../randomaccess/package-info.java | 4 ++-- .../TestTermStateCodecComponent.java | 4 ++-- .../randomaccess/TestTermStateCodecImpl.java | 10 ++++----- .../randomaccess/TestTermsIndexBuilder.java | 2 +- .../bitpacking/BitPerBytePacker.java | 2 +- .../bitpacking/TestBitUnpackerImpl.java | 2 +- 20 files changed, 47 insertions(+), 47 deletions(-) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90/randomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java => lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java} (76%) rename 
lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90/randomaccess/Lucene90RandomAccessTermsReader.java => lucene99/randomaccess/Lucene99RandomAccessTermsReader.java} (91%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90/randomaccess/Lucene90RandomAccessTermsWriter.java => lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java} (89%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/TermStateCodec.java (90%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/TermStateCodecComponent.java (97%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/TermStateCodecImpl.java (95%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/TermType.java (95%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/TermsIndex.java (93%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/TermsIndexBuilder.java (97%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/bitpacking/BitPacker.java (93%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/bitpacking/BitUnpacker.java (94%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/bitpacking/BitUnpackerImpl.java (97%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/bitpacking/package-info.java (93%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/package-info.java (90%) rename lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/TestTermStateCodecComponent.java (95%) rename 
lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/TestTermStateCodecImpl.java (94%) rename lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/TestTermsIndexBuilder.java (97%) rename lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/bitpacking/BitPerBytePacker.java (97%) rename lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/bitpacking/TestBitUnpackerImpl.java (97%) diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index 59331969cce1..45b66e7c353e 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -22,8 +22,8 @@ exports org.apache.lucene.payloads; exports org.apache.lucene.sandbox.codecs.idversion; - exports org.apache.lucene.sandbox.codecs.lucene90.randomaccess; - exports org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; + exports org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + exports org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; exports org.apache.lucene.sandbox.document; exports org.apache.lucene.sandbox.queries; exports org.apache.lucene.sandbox.search; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java similarity index 76% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java index 60c292706a30..59de10be73da 100644 --- 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; import org.apache.lucene.codecs.FieldsConsumer; @@ -22,26 +22,26 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat; -import org.apache.lucene.codecs.lucene90.Lucene90PostingsReader; -import org.apache.lucene.codecs.lucene90.Lucene90PostingsWriter; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.IOUtils; /** - * Similar to {@link Lucene90PostingsFormat} but with a different term dictionary implementation. + * Similar to {@link Lucene99PostingsFormat} but with a different term dictionary implementation. 
* * @lucene.experimental */ -public final class Lucene90RandomAccessDictionaryPostingsFormat extends PostingsFormat { +public final class Lucene99RandomAccessDictionaryPostingsFormat extends PostingsFormat { // Increment version to change it static final int VERSION_START = 0; static final int VERSION_CURRENT = VERSION_START; /** Creates {@code Lucene90RandomAccessDictionaryPostingsFormat} */ - public Lucene90RandomAccessDictionaryPostingsFormat() { + public Lucene99RandomAccessDictionaryPostingsFormat() { super("Lucene90RandomAccess"); } @@ -52,10 +52,10 @@ public String toString() { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new Lucene90PostingsWriter(state); + PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state); boolean success = false; try { - FieldsConsumer ret = new Lucene90RandomAccessTermsWriter(); + FieldsConsumer ret = new Lucene99RandomAccessTermsWriter(); success = true; return ret; } finally { @@ -67,10 +67,10 @@ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new Lucene90PostingsReader(state); + PostingsReaderBase postingsReader = new Lucene99PostingsReader(state); boolean success = false; try { - FieldsProducer ret = new Lucene90RandomAccessTermsReader(); + FieldsProducer ret = new Lucene99RandomAccessTermsReader(); success = true; return ret; } finally { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java similarity index 91% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsReader.java rename to 
lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java index d5214561bf26..79a63dccf265 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java @@ -15,14 +15,14 @@ * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; import java.util.Iterator; import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.index.Terms; -class Lucene90RandomAccessTermsReader extends FieldsProducer { +class Lucene99RandomAccessTermsReader extends FieldsProducer { @Override public void close() throws IOException {} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java similarity index 89% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsWriter.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java index c18a0cbbd143..87b68d2b9c63 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java @@ -14,14 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.index.Fields; -class Lucene90RandomAccessTermsWriter extends FieldsConsumer { +class Lucene99RandomAccessTermsWriter extends FieldsConsumer { @Override public void write(Fields fields, NormsProducer norms) throws IOException {} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java similarity index 90% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java index a203bdc180e0..a28fb1a94b65 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java @@ -15,11 +15,11 @@ * limitations under the License. 
*/ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; -import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; -import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitPacker; -import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitUnpacker; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPacker; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpacker; import org.apache.lucene.util.BytesRef; interface TermStateCodec { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java similarity index 97% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java index 9d93f40dc4b0..0740f44ae720 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java @@ -15,9 +15,9 @@ * limitations under the License. 
*/ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; -import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; abstract class TermStateCodecComponent { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java similarity index 95% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java index eea9e1b149a8..32ccde2fe286 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java @@ -15,11 +15,11 @@ * limitations under the License. 
*/ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; -import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; -import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitPacker; -import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitUnpacker; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPacker; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpacker; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.util.BytesRef; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java similarity index 95% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java index 793850a931fb..c7fbd6089527 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java @@ -15,10 +15,10 @@ * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.util.Objects; -import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; /** * TermType holds the classification of a term, based on how its postings are written. 
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java similarity index 93% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndex.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java index 94fce6559bc4..a4b67f527275 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import org.apache.lucene.util.fst.FST; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java similarity index 97% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java index 1fea443c7c16..9484a0505458 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; import java.util.Arrays; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPacker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java similarity index 93% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPacker.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java index a1828c69a032..a06ca746d245 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPacker.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; /** Interface for bit-packing */ public interface BitPacker { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpacker.java similarity index 94% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpacker.java index 7c9448d893b5..b5af7b40e385 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpacker.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; import org.apache.lucene.util.BytesRef; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpackerImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java similarity index 97% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpackerImpl.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java index b4cfd54f584a..44fa6af19887 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpackerImpl.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; import org.apache.lucene.util.BytesRef; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/package-info.java similarity index 93% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/package-info.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/package-info.java index 866d071788ac..8a9078ffa33c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/package-info.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/package-info.java @@ -16,4 +16,4 @@ */ /** Code for packing and unpacking sequence of non-negative integers with smaller bit width. 
*/ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/package-info.java similarity index 90% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/package-info.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/package-info.java index d5cf9583f91c..a85027e3b5e1 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/package-info.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/package-info.java @@ -16,7 +16,7 @@ */ /** - * A PostingFormat that is based on {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat} + * A PostingFormat that is based on {@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat} * but provides random access term dictionary. 
*/ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java similarity index 95% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java rename to lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java index 862996fb6c30..15a5e940986c 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java @@ -15,10 +15,10 @@ * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.util.stream.LongStream; -import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; import org.apache.lucene.tests.util.LuceneTestCase; public class TestTermStateCodecComponent extends LuceneTestCase { diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java similarity index 94% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java rename to lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java index 298c4a4a419d..1b7a20fad427 100644 --- 
a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java @@ -15,13 +15,13 @@ * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.util.ArrayList; -import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; -import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitPerBytePacker; -import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitUnpacker; -import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitUnpackerImpl; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPerBytePacker; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpacker; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpackerImpl; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermsIndexBuilder.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java similarity index 97% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermsIndexBuilder.java rename to lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java index 43f4010b1ae6..4b5cf6e58b11 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermsIndexBuilder.java +++ 
b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; import java.util.HashMap; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPerBytePacker.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java similarity index 97% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPerBytePacker.java rename to lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java index a1a972d5ceb7..37dec6131975 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPerBytePacker.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; import java.util.ArrayList; import org.apache.lucene.util.BytesRef; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/TestBitUnpackerImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java similarity index 97% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/TestBitUnpackerImpl.java rename to lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java index 7c493a661da1..f5fc4d12c143 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/TestBitUnpackerImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; From ece7710ec0a9d7bf871c0590ab8183cb65e9822a Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 6 Nov 2023 16:25:45 -0800 Subject: [PATCH 16/57] Implement compact generic byte-oriented BitPacker Also with a concrete implementation based on fixed-size byte[] --- .../randomaccess/bitpacking/BitPacker.java | 3 + .../bitpacking/BitPackerImplBase.java | 64 +++++++++++++++++++ .../FixedSizeByteArrayBitPacker.java | 41 ++++++++++++ .../bitpacking/BitPerBytePacker.java | 5 ++ .../bitpacking/TestBitPackerImpl.java | 53 +++++++++++++++ .../bitpacking/TestBitUnpackerImpl.java | 21 ++---- .../bitpacking/ValueAndBitWidth.java | 35 ++++++++++ 7 files changed, 206 insertions(+), 16 deletions(-) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/ValueAndBitWidth.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java index a06ca746d245..06dec80d70dc 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java @@ -22,4 +22,7 @@ public interface 
BitPacker { /** Pack the low `numBits` bits of `value` */ void add(long value, int numBits); + + /** Flush any pending byte */ + void flush(); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java new file mode 100644 index 000000000000..329192ed2c82 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; + +/** + * Implementation of {@link BitPacker}. The behavior that is abstracted out here is how to write a + * byte. This is useful as we can wire the byte-writing to byte[], stream or IndexInput, etc. + */ +abstract class BitPackerImplBase implements BitPacker { + private long totalNumBytesWritten; + private byte buffer; + private int bufferNumBitsUsed; + + abstract void writeByte(byte b); + + /** {@inheritDoc}. value could be larger than 2^numBits - 1 but the higher bits won't be used.
*/ + @Override + public void add(long value, int numBits) { + assert numBits < 64; + // clear bits higher than `numBits` + value &= (1L << numBits) - 1; + + while (numBits > 0) { + int bufferNumBitsRemaining = 8 - bufferNumBitsUsed; + if (numBits < bufferNumBitsRemaining) { + buffer |= (byte) (value << bufferNumBitsUsed); + bufferNumBitsUsed += numBits; + break; + } else { + long mask = (1L << bufferNumBitsRemaining) - 1; + buffer |= (byte) ((value & mask) << bufferNumBitsUsed); + numBits -= bufferNumBitsRemaining; + value >>>= bufferNumBitsRemaining; + writeByte(buffer); + totalNumBytesWritten += 1; + buffer = 0; + bufferNumBitsUsed = 0; + } + } + } + + @Override + public void flush() { + if (bufferNumBitsUsed > 0) { + writeByte(buffer); + bufferNumBitsUsed = 0; + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java new file mode 100644 index 000000000000..a8be9aca89bd --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; + +/** + * A {@link BitPacker} implementation that requires user to know the size of the resulting byte + * array upfront, in order to avoid allocation and copying for dynamically growing the array. + */ +public final class FixedSizeByteArrayBitPacker extends BitPackerImplBase { + private final byte[] bytes; + private int numBytesUsed = 0; + + public FixedSizeByteArrayBitPacker(int capacity) { + this.bytes = new byte[capacity]; + } + + @Override + void writeByte(byte b) { + assert numBytesUsed < bytes.length; + bytes[numBytesUsed++] = b; + } + + public byte[] getBytes() { + return bytes; + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java index 37dec6131975..2df2a74907e2 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java @@ -40,6 +40,11 @@ public void add(long value, int numBits) { } } + @Override + public void flush() { + // No-op as this impl writes a byte per bit + } + public byte[] getBytes() { byte[] bytes = new byte[totalNumBits]; int index = 0; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java new file mode 100644 index 000000000000..84ae93fe4e52 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java @@ -0,0 +1,53 @@ +/* + * 
Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; + +import java.util.Arrays; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestBitPackerImpl extends LuceneTestCase { + + public void testBasic() { + FixedSizeByteArrayBitPacker fixedSizeByteArrayBitPacker = new FixedSizeByteArrayBitPacker(5); + for (int i = 1; i <= 10; i++) { + fixedSizeByteArrayBitPacker.add(i, 4); + } + fixedSizeByteArrayBitPacker.flush(); + + byte[] expectedBytes = new byte[] {0x21, 0x43, 0x65, (byte) 0x87, (byte) 0xA9}; + assertArrayEquals(expectedBytes, fixedSizeByteArrayBitPacker.getBytes()); + } + + public void testRandom() { + ValueAndBitWidth[] randomInputs = ValueAndBitWidth.getRandomArray(random(), 1000); + int totalNumberBits = Arrays.stream(randomInputs).mapToInt(ValueAndBitWidth::bitWidth).sum(); + + BitPerBytePacker referencePacker = new BitPerBytePacker(); + int capacity = totalNumberBits / 8 + (totalNumberBits % 8 == 0 ? 
0 : 1); + FixedSizeByteArrayBitPacker fixedSizeByteArrayBitPacker = + new FixedSizeByteArrayBitPacker(capacity); + + for (ValueAndBitWidth x : randomInputs) { + referencePacker.add(x.value(), x.bitWidth()); + fixedSizeByteArrayBitPacker.add(x.value(), x.bitWidth()); + } + referencePacker.flush(); + fixedSizeByteArrayBitPacker.flush(); + assertArrayEquals(referencePacker.getCompactBytes(), fixedSizeByteArrayBitPacker.getBytes()); + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java index f5fc4d12c143..2cc106b669e2 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java @@ -33,30 +33,19 @@ public void testUnpackBasics() { } public void testRandom() { - ValueAndBitWidth[] expected = - random() - .longs(1000, 0, Long.MAX_VALUE) - .mapToObj( - val -> { - int bitWidth = random().nextInt(1, 64); - val &= (1L << bitWidth) - 1; - return new ValueAndBitWidth(val, bitWidth); - }) - .toArray(ValueAndBitWidth[]::new); + ValueAndBitWidth[] expected = ValueAndBitWidth.getRandomArray(random(), 1000); BitPerBytePacker referencePacker = new BitPerBytePacker(); for (var x : expected) { - referencePacker.add(x.value, x.bitWidth); + referencePacker.add(x.value(), x.bitWidth()); } BytesRef bytes = new BytesRef(referencePacker.getCompactBytes()); int startBitIndex = 0; for (var x : expected) { - long unpacked = BitUnpackerImpl.INSTANCE.unpack(bytes, startBitIndex, x.bitWidth); - startBitIndex += x.bitWidth; - assertEquals(x.value, unpacked); + long unpacked = BitUnpackerImpl.INSTANCE.unpack(bytes, startBitIndex, x.bitWidth()); + startBitIndex += x.bitWidth(); + assertEquals(x.value(), unpacked); } } - - 
private record ValueAndBitWidth(long value, int bitWidth) {} } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/ValueAndBitWidth.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/ValueAndBitWidth.java new file mode 100644 index 000000000000..40bee28660fa --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/ValueAndBitWidth.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; + +import java.util.Random; + +record ValueAndBitWidth(long value, int bitWidth) { + + static ValueAndBitWidth[] getRandomArray(Random random, int size) { + return random + .longs(size, 0, Long.MAX_VALUE) + .mapToObj( + val -> { + int bitWidth = random.nextInt(1, 64); + val &= (1L << bitWidth) - 1; + return new ValueAndBitWidth(val, bitWidth); + }) + .toArray(ValueAndBitWidth[]::new); + } +} From 0f3b5a17d8c44e738f003f1ab9a733b37235772b Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 6 Nov 2023 16:30:58 -0800 Subject: [PATCH 17/57] Fix issues identfied by precommit checks https://github.com/apache/lucene/actions/runs/6777264120/job/18420607690?pr=12688 --- .../lucene99/randomaccess/bitpacking/BitUnpackerImpl.java | 1 + .../codecs/lucene99/randomaccess/TestTermStateCodecImpl.java | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java index 44fa6af19887..84704c0b8787 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java @@ -19,6 +19,7 @@ import org.apache.lucene.util.BytesRef; +/** Implementation of {@link BitUnpacker} that works with compactly packed bits */ public class BitUnpackerImpl implements BitUnpacker { public static BitUnpackerImpl INSTANCE = new BitUnpackerImpl(); diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java index 1b7a20fad427..bd9249d48b95 100644 --- 
a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java @@ -82,7 +82,7 @@ public void testEncodeDecode() { // Also test decoding that doesn't begin at the start of the block. int pos = random().nextInt(termStatesArray.length); - int startBitIndex = random().nextInt(pos); + int startBitIndex = pos > 0 ? random().nextInt(pos) : 0; int recordSize = expectedDocFreqBitWidth + expectedDocStartFPBitWidth; // With bit-per-byte bytes dataBytes = From cc0751fc78e1a633a319371d0e7b7419f26fee9a Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 6 Nov 2023 19:43:19 -0800 Subject: [PATCH 18/57] Remove unused member field `totalNumBytesWritten` --- .../lucene99/randomaccess/bitpacking/BitPackerImplBase.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java index 329192ed2c82..5d5aea06dc57 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java @@ -22,7 +22,6 @@ * byte. This is useful as we can wire the byte-writing to byte[], stream or IndexInput, etc. 
*/ abstract class BitPackerImplBase implements BitPacker { - private long totalNumBytesWritten; private byte buffer; private int bufferNumBitsUsed; @@ -47,7 +46,6 @@ public void add(long value, int numBits) { numBits -= bufferNumBitsRemaining; value >>>= bufferNumBitsRemaining; writeByte(buffer); - totalNumBytesWritten += 1; buffer = 0; bufferNumBitsUsed = 0; } From 39e9e08f54b130974a457ec8827e0635f064b3d0 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 6 Nov 2023 21:59:43 -0800 Subject: [PATCH 19/57] Test TermStateCodecImpl with real compact bit-packer --- .../codecs/lucene99/randomaccess/TermStateCodecImpl.java | 1 + .../lucene99/randomaccess/TestTermStateCodecImpl.java | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java index 32ccde2fe286..fc3f9e5e1545 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java @@ -53,6 +53,7 @@ public byte[] encodeBlock(IntBlockTermState[] inputs, BitPacker bitPacker) { for (var termState : inputs) { encodeOne(bitPacker, termState, metadataPerComponent); } + bitPacker.flush(); return metadataBytes; } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java index bd9249d48b95..624445515459 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java @@ -22,6 +22,7 @@ import 
org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPerBytePacker; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpacker; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpackerImpl; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.FixedSizeByteArrayBitPacker; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; @@ -70,6 +71,12 @@ public void testEncodeDecode() { ByteArrayDataInput byteArrayDataInput = new ByteArrayDataInput(metadata, 2, 8); assertEquals(docStartFPBase, byteArrayDataInput.readLong()); + // Assert with real bit-packer we get the same bytes + FixedSizeByteArrayBitPacker fixedSizeByteArrayBitPacker = + new FixedSizeByteArrayBitPacker(bitPerBytePacker.getCompactBytes().length); + codec.encodeBlock(termStatesArray, fixedSizeByteArrayBitPacker); + assertArrayEquals(bitPerBytePacker.getCompactBytes(), fixedSizeByteArrayBitPacker.getBytes()); + // Assert that each term state is the same after the encode-decode roundtrip. 
BytesRef metadataBytes = new BytesRef(metadata); BytesRef dataBytes = new BytesRef(bitPerBytePacker.getBytes()); From 12f9c836f0f5dc5325b391e6b9da5002a49e16c3 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Tue, 7 Nov 2023 14:55:49 -0800 Subject: [PATCH 20/57] Implement TermStateCodecImpl.getCodec for (TermType, IndexOptions) --- .../randomaccess/TermStateCodecComponent.java | 42 ++++- .../randomaccess/TermStateCodecImpl.java | 84 +++++++++- .../lucene99/randomaccess/TermType.java | 8 +- .../randomaccess/TestTermStateCodecImpl.java | 144 ++++++++++++++++++ 4 files changed, 261 insertions(+), 17 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java index 0740f44ae720..8db1c4e81144 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java @@ -20,6 +20,16 @@ import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; abstract class TermStateCodecComponent { + private final String name; + + TermStateCodecComponent(String name) { + this.name = name; + } + + @Override + public String toString() { + return "TermStateCodecComponent{" + "name='" + name + '\'' + '}'; + } static byte getBitWidth(IntBlockTermState[] termStates, TermStateCodecComponent component) { assert termStates.length > 0; @@ -44,7 +54,9 @@ static byte getBitWidth(IntBlockTermState[] termStates, TermStateCodecComponent static final class SingletonDocId extends TermStateCodecComponent { public static SingletonDocId INSTANCE = new SingletonDocId(); - private SingletonDocId() {} + private SingletonDocId() { + super("SingletonDocId"); + } @Override public boolean isMonotonicallyIncreasing() { @@ -68,7 +80,9 @@ public void 
setTargetValue(IntBlockTermState termState, long value) { static final class DocFreq extends TermStateCodecComponent { public static DocFreq INSTANCE = new DocFreq(); - private DocFreq() {} + private DocFreq() { + super("DocFreq"); + } @Override public boolean isMonotonicallyIncreasing() { @@ -92,7 +106,9 @@ public void setTargetValue(IntBlockTermState termState, long value) { static final class TotalTermFreq extends TermStateCodecComponent { public static TotalTermFreq INSTANCE = new TotalTermFreq(); - private TotalTermFreq() {} + private TotalTermFreq() { + super("TotalTermFreq"); + } @Override public boolean isMonotonicallyIncreasing() { @@ -113,7 +129,9 @@ public void setTargetValue(IntBlockTermState termState, long value) { static final class DocStartFP extends TermStateCodecComponent { public static DocStartFP INSTANCE = new DocStartFP(); - private DocStartFP() {} + private DocStartFP() { + super("DocStartFP"); + } @Override public boolean isMonotonicallyIncreasing() { @@ -134,7 +152,9 @@ public void setTargetValue(IntBlockTermState termState, long value) { static final class PositionStartFP extends TermStateCodecComponent { public static PositionStartFP INSTANCE = new PositionStartFP(); - private PositionStartFP() {} + private PositionStartFP() { + super("PositionStartFP"); + } @Override public boolean isMonotonicallyIncreasing() { @@ -155,7 +175,9 @@ public void setTargetValue(IntBlockTermState termState, long value) { static final class PayloadStartFP extends TermStateCodecComponent { public static PayloadStartFP INSTANCE = new PayloadStartFP(); - private PayloadStartFP() {} + private PayloadStartFP() { + super("PayloadStartFP"); + } @Override public boolean isMonotonicallyIncreasing() { @@ -176,7 +198,9 @@ public void setTargetValue(IntBlockTermState termState, long value) { static final class SkipOffset extends TermStateCodecComponent { public static SkipOffset INSTANCE = new SkipOffset(); - private SkipOffset() {} + private SkipOffset() { + 
super("SkipOffset"); + } @Override public boolean isMonotonicallyIncreasing() { @@ -197,7 +221,9 @@ public void setTargetValue(IntBlockTermState termState, long value) { static final class LastPositionBlockOffset extends TermStateCodecComponent { public static LastPositionBlockOffset INSTANCE = new LastPositionBlockOffset(); - private LastPositionBlockOffset() {} + private LastPositionBlockOffset() { + super("LastPositionBlockOffset"); + } @Override public boolean isMonotonicallyIncreasing() { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java index fc3f9e5e1545..061f1c866a7b 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java @@ -17,7 +17,18 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +import java.util.ArrayList; +import java.util.Arrays; import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.DocFreq; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.DocStartFP; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.LastPositionBlockOffset; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.PayloadStartFP; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.PositionStartFP; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.SingletonDocId; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.SkipOffset; +import 
org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.TotalTermFreq; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPacker; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpacker; import org.apache.lucene.store.ByteArrayDataInput; @@ -28,11 +39,6 @@ final class TermStateCodecImpl implements TermStateCodec { private final TermStateCodecComponent[] components; private final int metadataBytesLength; - private static int getMetadataLength(TermStateCodecComponent component) { - // 1 byte for bitWidth; optionally 8 byte more for the reference value - return 1 + (component.isMonotonicallyIncreasing() ? 8 : 0); - } - public TermStateCodecImpl(TermStateCodecComponent[] components) { assert components.length > 0; @@ -44,6 +50,74 @@ public TermStateCodecImpl(TermStateCodecComponent[] components) { this.metadataBytesLength = metadataBytesLength; } + private static int getMetadataLength(TermStateCodecComponent component) { + // 1 byte for bitWidth; optionally 8 byte more for the reference value + return 1 + (component.isMonotonicallyIncreasing() ? 8 : 0); + } + + public static TermStateCodecImpl getCodec(TermType termType, IndexOptions indexOptions) { + assert indexOptions.ordinal() > IndexOptions.NONE.ordinal(); + // A term can't have skip data (has more than one block's worth of doc), + // while having a singleton doc at the same time! 
+ assert !(termType.hasSkipData() && termType.hasSingletonDoc()); + + ArrayList components = new ArrayList<>(); + // handle docs + if (termType.hasSingletonDoc()) { + components.add(SingletonDocId.INSTANCE); + } else { + components.add(DocStartFP.INSTANCE); + } + // handle skip data + if (termType.hasSkipData()) { + components.add(SkipOffset.INSTANCE); + } + // handle docFreq + boolean totalTermFeqAdded = false; + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS.ordinal()) { + if (termType.hasSingletonDoc()) { + components.add(TotalTermFreq.INSTANCE); + totalTermFeqAdded = true; + } else { + components.add(DocFreq.INSTANCE); + } + } + // handle positions + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + if (!totalTermFeqAdded) { + components.add(TotalTermFreq.INSTANCE); + } + components.add(PositionStartFP.INSTANCE); + if (termType.hasLastPositionBlockOffset()) { + components.add(LastPositionBlockOffset.INSTANCE); + } + } + // handle payload and offsets + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + components.add(PayloadStartFP.INSTANCE); + } + + return new TermStateCodecImpl(components.toArray(TermStateCodecComponent[]::new)); + } + + @Override + public String toString() { + return "TermStateCodecImpl{" + "components=" + Arrays.toString(components) + '}'; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + TermStateCodecImpl that = (TermStateCodecImpl) o; + return Arrays.equals(components, that.components); + } + + @Override + public int hashCode() { + return Arrays.hashCode(components); + } + @Override public byte[] encodeBlock(IntBlockTermState[] inputs, BitPacker bitPacker) { Metadata[] metadataPerComponent = getMetadataPerComponent(inputs); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java index c7fbd6089527..81e66540c08e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java @@ -31,7 +31,7 @@ final class TermType { private static final byte HAS_SKIP_DATA_MASK = (byte) 1 << 1; - private static final byte HAS_VINT_POSITION_BLOCK_MASK = (byte) 1 << 2; + private static final byte HAS_LAST_POSITION_BLOCK_OFFEST_MASK = (byte) 1 << 2; public static final int NUM_TOTAL_TYPES = 8; @@ -54,8 +54,8 @@ boolean hasSkipData() { return (this.flag & HAS_SKIP_DATA_MASK) > 0; } - boolean hasVintPositionBlock() { - return (this.flag & HAS_VINT_POSITION_BLOCK_MASK) > 0; + boolean hasLastPositionBlockOffset() { + return (this.flag & HAS_LAST_POSITION_BLOCK_OFFEST_MASK) > 0; } static TermType fromTermState(IntBlockTermState state) { @@ -67,7 +67,7 @@ static TermType fromTermState(IntBlockTermState state) { flag |= HAS_SKIP_DATA_MASK; } if (state.lastPosBlockOffset != -1) { - flag |= HAS_VINT_POSITION_BLOCK_MASK; + flag |= HAS_LAST_POSITION_BLOCK_OFFEST_MASK; } return new TermType(flag); } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java index 624445515459..175ac30e9407 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java @@ -19,6 +19,7 @@ import java.util.ArrayList; import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.IndexOptions; import 
org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPerBytePacker; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpacker; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpackerImpl; @@ -136,4 +137,147 @@ private static void assertBlockRoundTrip( assertEquals(termStatesArray[i].docStartFP, decoded.docStartFP); } } + + public void testGetCodec() { + for (IndexOptions indexOptions : IndexOptions.values()) { + if (indexOptions == IndexOptions.NONE) { + continue; + } + for (int i = 0; i < 8; i++) { + if ((i & 0b011) == 0b011) { + continue; + } + if ((i & 0b100) == 0b100 + && indexOptions.ordinal() < IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + continue; + } + TermType termType = TermType.fromId(i); + var expected = getExpectedCodec(termType, indexOptions); + var got = TermStateCodecImpl.getCodec(termType, indexOptions); + assertEquals(expected, got); + } + } + } + + // Enumerate the expected Codec we get for (TermType, IndexOptions) pairs. + static TermStateCodecImpl getExpectedCodec(TermType termType, IndexOptions indexOptions) { + ArrayList components = new ArrayList<>(); + // Wish I can code this better in java... 
+ switch (termType.getId()) { + // Not singleton doc; No skip data; No last position block offset + case 0b000 -> { + assert !termType.hasLastPositionBlockOffset() + && !termType.hasSkipData() + && !termType.hasSingletonDoc(); + components.add(TermStateCodecComponent.DocStartFP.INSTANCE); + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS.ordinal()) { + components.add(TermStateCodecComponent.DocFreq.INSTANCE); + } + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); + components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + } + if (indexOptions.ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + } + // Singleton doc; No skip data; No last position block offset + case 0b001 -> { + assert !termType.hasLastPositionBlockOffset() + && !termType.hasSkipData() + && termType.hasSingletonDoc(); + components.add(TermStateCodecComponent.SingletonDocId.INSTANCE); + // If field needs frequency, we need totalTermsFreq. + // Since there is only 1 doc, totalTermsFreq == docFreq. 
+ if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS.ordinal()) { + components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); + } + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + } + if (indexOptions.ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + } + + // Not Singleton doc; Has skip data; No last position block offset + case 0b010 -> { + assert !termType.hasLastPositionBlockOffset() + && termType.hasSkipData() + && !termType.hasSingletonDoc(); + components.add(TermStateCodecComponent.DocStartFP.INSTANCE); + components.add(TermStateCodecComponent.SkipOffset.INSTANCE); + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS.ordinal()) { + components.add(TermStateCodecComponent.DocFreq.INSTANCE); + } + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); + components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + } + if (indexOptions.ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + } + // Singleton doc but has skip data; Invalid state. + case 0b011, 0b111 -> { + assert termType.hasSkipData() && termType.hasSingletonDoc(); + throw new IllegalStateException( + "Unreachable. A term has skip data but also only has one doc!? 
Must be a bug"); + } + // Not singleton doc; No skip data; Has last position block offset; + case 0b100 -> { + assert termType.hasLastPositionBlockOffset() + && !termType.hasSkipData() + && !termType.hasSingletonDoc(); + assert indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal(); + components.add(TermStateCodecComponent.DocStartFP.INSTANCE); + components.add(TermStateCodecComponent.DocFreq.INSTANCE); + components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); + components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + components.add(TermStateCodecComponent.LastPositionBlockOffset.INSTANCE); + if (indexOptions.ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + } + // Singleton doc; No skip data; Has last position block offset; + case 0b101 -> { + assert termType.hasLastPositionBlockOffset() + && !termType.hasSkipData() + && termType.hasSingletonDoc(); + assert indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal(); + components.add(TermStateCodecComponent.SingletonDocId.INSTANCE); + components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); + components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + components.add(TermStateCodecComponent.LastPositionBlockOffset.INSTANCE); + if (indexOptions.ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + } + // Not singleton doc; Has skip data; Has last position block offset; + case 0b110 -> { + assert termType.hasLastPositionBlockOffset() + && termType.hasSkipData() + && !termType.hasSingletonDoc(); + assert indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal(); + components.add(TermStateCodecComponent.DocStartFP.INSTANCE); + components.add(TermStateCodecComponent.SkipOffset.INSTANCE); + 
components.add(TermStateCodecComponent.DocFreq.INSTANCE); + components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); + components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + components.add(TermStateCodecComponent.LastPositionBlockOffset.INSTANCE); + if (indexOptions.ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + } + default -> throw new IllegalStateException("Unreachable"); + } + + return new TermStateCodecImpl(components.toArray(TermStateCodecComponent[]::new)); + } } From 402965ff5d453754ec02cd15ba5ecad78c63db39 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Thu, 9 Nov 2023 13:42:35 -0800 Subject: [PATCH 21/57] Implement term (type, ord) lookup in TermsIndex --- .../codecs/lucene99/randomaccess/TermsIndex.java | 15 ++++++++++++++- .../randomaccess/TestTermsIndexBuilder.java | 11 +++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java index a4b67f527275..e43e495a48ef 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java @@ -17,6 +17,19 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +import java.io.IOException; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.Util; -record TermsIndex(FST fst) {} +record TermsIndex(FST fst) { + + TypeAndOrd getTerm(BytesRef term) throws IOException { + long encoded = Util.get(fst, term); + TermType termType = TermType.fromId((int) ((encoded & 0b1110L) >>> 1)); + long ord = encoded >>> 4; + return new TypeAndOrd(termType, ord); + } + + public record 
TypeAndOrd(TermType termType, long ord) {} +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java index 4b5cf6e58b11..7179c23d1d7e 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java @@ -22,8 +22,6 @@ import java.util.Map; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.Util; public class TestTermsIndexBuilder extends LuceneTestCase { @@ -51,15 +49,12 @@ public void testBasics() throws IOException { } TermsIndex termsIndex = builder.build(); - FST fst = termsIndex.fst(); - for (String term : test_terms) { BytesRef termBytes = new BytesRef(term); - long encoded = Util.get(fst, termBytes); + TermsIndex.TypeAndOrd typeAndOrd = termsIndex.getTerm(termBytes); - assertEquals(1L, encoded & 0b1L); - assertEquals((long) termsToType.get(term), (encoded & 0b1110L) >> 1); - assertEquals((long) termsToOrd.get(term), encoded >> 4); + assertEquals(termsToType.get(term).intValue(), typeAndOrd.termType().getId()); + assertEquals((long) termsToOrd.get(term), typeAndOrd.ord()); } } } From 3ce5ea9196e1ea56e69d228cbf2d777a5a27f114 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 13 Nov 2023 10:54:47 -0800 Subject: [PATCH 22/57] create sub-package `termdict` to hold term dictionary implementions --- lucene/sandbox/src/java/module-info.java | 1 + .../codecs/lucene99/randomaccess/TermType.java | 14 +++++++------- .../{ => termdict}/TermsIndex.java | 3 ++- .../{ => termdict}/TermsIndexBuilder.java | 3 ++- .../randomaccess/termdict/package-info.java | 18 ++++++++++++++++++ .../{ => termdict}/TestTermsIndexBuilder.java | 3 ++- 
6 files changed, 32 insertions(+), 10 deletions(-) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/{ => termdict}/TermsIndex.java (89%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/{ => termdict}/TermsIndexBuilder.java (95%) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/package-info.java rename lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/{ => termdict}/TestTermsIndexBuilder.java (93%) diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index 45b66e7c353e..2d9d6d31fc65 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -28,6 +28,7 @@ exports org.apache.lucene.sandbox.queries; exports org.apache.lucene.sandbox.search; exports org.apache.lucene.sandbox.index; + exports org.apache.lucene.sandbox.codecs.lucene99.randomaccess.termdict; provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.sandbox.codecs.idversion.IDVersionPostingsFormat; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java index 81e66540c08e..69e1150391a2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java @@ -26,7 +26,7 @@ *

It captures -- 1) if a term has a singleton docid (i.e. only one doc contains this term). 2) * if the term has skip data. 3) if the term has an VINT encoded position block. */ -final class TermType { +public final class TermType { private static final byte SINGLETON_DOC_MASK = (byte) 1; private static final byte HAS_SKIP_DATA_MASK = (byte) 1 << 1; @@ -41,24 +41,24 @@ private TermType(byte flag) { this.flag = flag; } - int getId() { + public int getId() { assert this.flag >= 0 && this.flag <= 8; return this.flag; } - boolean hasSingletonDoc() { + public boolean hasSingletonDoc() { return (this.flag & SINGLETON_DOC_MASK) > 0; } - boolean hasSkipData() { + public boolean hasSkipData() { return (this.flag & HAS_SKIP_DATA_MASK) > 0; } - boolean hasLastPositionBlockOffset() { + public boolean hasLastPositionBlockOffset() { return (this.flag & HAS_LAST_POSITION_BLOCK_OFFEST_MASK) > 0; } - static TermType fromTermState(IntBlockTermState state) { + public static TermType fromTermState(IntBlockTermState state) { byte flag = 0; if (state.singletonDocID != -1) { flag |= SINGLETON_DOC_MASK; @@ -72,7 +72,7 @@ static TermType fromTermState(IntBlockTermState state) { return new TermType(flag); } - static TermType fromId(int id) { + public static TermType fromId(int id) { if (id < 0 || id > 8) { throw new IllegalArgumentException("id must be within range [0, 8]"); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndex.java similarity index 89% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndex.java index e43e495a48ef..0c2035c6e715 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java +++ 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndex.java @@ -15,9 +15,10 @@ * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.termdict; import java.io.IOException; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermType; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.Util; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndexBuilder.java similarity index 95% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndexBuilder.java index 9484a0505458..a49d00566bd2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndexBuilder.java @@ -15,10 +15,11 @@ * limitations under the License. 
*/ -package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.termdict; import java.io.IOException; import java.util.Arrays; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermType; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.FST; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/package-info.java new file mode 100644 index 000000000000..45b415b7f510 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/package-info.java @@ -0,0 +1,18 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** Class for term dictionary implementation. 
*/ +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.termdict; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TestTermsIndexBuilder.java similarity index 93% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java rename to lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TestTermsIndexBuilder.java index 7179c23d1d7e..d1a665ed9867 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TestTermsIndexBuilder.java @@ -15,11 +15,12 @@ * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.termdict; import java.io.IOException; import java.util.HashMap; import java.util.Map; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermType; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; From fd9beca452f0fef4870414c0e841d9656c642bd3 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 13 Nov 2023 13:54:27 -0800 Subject: [PATCH 23/57] Revert "create sub-package `termdict` to hold term dictionary implementions" This reverts commit 3ce5ea9196e1ea56e69d228cbf2d777a5a27f114. 
Reverting because I want to reduce what gets exposed to the rest of the project --- lucene/sandbox/src/java/module-info.java | 1 - .../codecs/lucene99/randomaccess/TermType.java | 14 +++++++------- .../{termdict => }/TermsIndex.java | 3 +-- .../{termdict => }/TermsIndexBuilder.java | 3 +-- .../randomaccess/termdict/package-info.java | 18 ------------------ .../{termdict => }/TestTermsIndexBuilder.java | 3 +-- 6 files changed, 10 insertions(+), 32 deletions(-) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/{termdict => }/TermsIndex.java (89%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/{termdict => }/TermsIndexBuilder.java (95%) delete mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/package-info.java rename lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/{termdict => }/TestTermsIndexBuilder.java (93%) diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index 2d9d6d31fc65..45b66e7c353e 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -28,7 +28,6 @@ exports org.apache.lucene.sandbox.queries; exports org.apache.lucene.sandbox.search; exports org.apache.lucene.sandbox.index; - exports org.apache.lucene.sandbox.codecs.lucene99.randomaccess.termdict; provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.sandbox.codecs.idversion.IDVersionPostingsFormat; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java index 69e1150391a2..81e66540c08e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java @@ -26,7 
+26,7 @@ *

It captures -- 1) if a term has a singleton docid (i.e. only one doc contains this term). 2) * if the term has skip data. 3) if the term has an VINT encoded position block. */ -public final class TermType { +final class TermType { private static final byte SINGLETON_DOC_MASK = (byte) 1; private static final byte HAS_SKIP_DATA_MASK = (byte) 1 << 1; @@ -41,24 +41,24 @@ private TermType(byte flag) { this.flag = flag; } - public int getId() { + int getId() { assert this.flag >= 0 && this.flag <= 8; return this.flag; } - public boolean hasSingletonDoc() { + boolean hasSingletonDoc() { return (this.flag & SINGLETON_DOC_MASK) > 0; } - public boolean hasSkipData() { + boolean hasSkipData() { return (this.flag & HAS_SKIP_DATA_MASK) > 0; } - public boolean hasLastPositionBlockOffset() { + boolean hasLastPositionBlockOffset() { return (this.flag & HAS_LAST_POSITION_BLOCK_OFFEST_MASK) > 0; } - public static TermType fromTermState(IntBlockTermState state) { + static TermType fromTermState(IntBlockTermState state) { byte flag = 0; if (state.singletonDocID != -1) { flag |= SINGLETON_DOC_MASK; @@ -72,7 +72,7 @@ public static TermType fromTermState(IntBlockTermState state) { return new TermType(flag); } - public static TermType fromId(int id) { + static TermType fromId(int id) { if (id < 0 || id > 8) { throw new IllegalArgumentException("id must be within range [0, 8]"); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java similarity index 89% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndex.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java index 0c2035c6e715..e43e495a48ef 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndex.java +++ 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java @@ -15,10 +15,9 @@ * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.termdict; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; -import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermType; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.Util; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndexBuilder.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java similarity index 95% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndexBuilder.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java index a49d00566bd2..9484a0505458 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndexBuilder.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java @@ -15,11 +15,10 @@ * limitations under the License. 
*/ -package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.termdict; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; import java.util.Arrays; -import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermType; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.FST; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/package-info.java deleted file mode 100644 index 45b415b7f510..000000000000 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/package-info.java +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** Class for term dictionary implementation. 
*/ -package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.termdict; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TestTermsIndexBuilder.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java similarity index 93% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TestTermsIndexBuilder.java rename to lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java index d1a665ed9867..7179c23d1d7e 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TestTermsIndexBuilder.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java @@ -15,12 +15,11 @@ * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.termdict; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; import java.util.HashMap; import java.util.Map; -import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermType; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; From b3bf288dacea74e38abcf1a0d5e9256410b0fd17 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Thu, 16 Nov 2023 14:52:26 -0800 Subject: [PATCH 24/57] Setup sketch implementations for RandomAccessTermsDict This commit implemented following building blocks: * TermData -- represent the bitpacked termstate data. it exposes a get by term ordinal API. * TermDataWriter -- incrementally write termstate data as index files. misc. extended a few interfaces to expose information needed to implment term data lookup. 
--- .../randomaccess/ByteArrayByteSlice.java | 55 ++++++++ .../lucene99/randomaccess/ByteSlice.java | 32 +++++ .../RandomAccessInputByteSlice.java | 58 ++++++++ .../randomaccess/RandomAccessTermsDict.java | 22 ++++ .../lucene99/randomaccess/TermData.java | 94 +++++++++++++ .../lucene99/randomaccess/TermDataWriter.java | 94 +++++++++++++ .../lucene99/randomaccess/TermStateCodec.java | 20 ++- .../randomaccess/TermStateCodecComponent.java | 7 +- .../randomaccess/TermStateCodecImpl.java | 29 ++-- .../lucene99/randomaccess/TermsDataStore.java | 48 +++++++ .../lucene99/randomaccess/TermsImpl.java | 101 ++++++++++++++ .../lucene99/randomaccess/TermsIndex.java | 19 +++ .../lucene99/randomaccess/TermsStats.java | 29 ++++ .../randomaccess/bitpacking/BitPacker.java | 6 +- .../bitpacking/BitPackerImplBase.java | 8 +- .../bitpacking/DataOutputBitPacker.java | 44 +++++++ .../randomaccess/TestTermDataWriter.java | 124 ++++++++++++++++++ .../TestTermStateCodecComponent.java | 5 +- .../randomaccess/TestTermStateCodecImpl.java | 99 ++++++++------ .../bitpacking/BitPerBytePacker.java | 4 + .../bitpacking/TestBitPackerImpl.java | 5 +- 21 files changed, 846 insertions(+), 57 deletions(-) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java create mode 100644 
lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsDataStore.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/DataOutputBitPacker.java create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java new file mode 100644 index 000000000000..55139ebf3a32 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.BitUtil; + +final class ByteArrayByteSlice implements ByteSlice { + private final byte[] bytes; + + ByteArrayByteSlice(byte[] bytes) { + this.bytes = bytes; + } + + @Override + public long size() { + return bytes.length; + } + + @Override + public void writeAll(DataOutput output) throws IOException { + output.writeBytes(bytes, bytes.length); + } + + @Override + public long getLong(long pos) { + return (long) BitUtil.VH_LE_LONG.get(bytes, (int) pos); + } + + @Override + public byte[] getBytes(long pos, int length) { + if (length == 0) { + return new byte[0]; + } + byte[] result = new byte[length]; + System.arraycopy(bytes, (int) pos, result, 0, length); + return result; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java new file mode 100644 index 000000000000..937e915e3325 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; + +/** A slice of bytes */ +interface ByteSlice { + long size(); + + void writeAll(DataOutput output) throws IOException; + + long getLong(long pos) throws IOException; + + byte[] getBytes(long pos, int length) throws IOException; +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java new file mode 100644 index 000000000000..3d80e50dd383 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.RandomAccessInput; + +final class RandomAccessInputByteSlice implements ByteSlice { + private final RandomAccessInput randomAccessInput; + + RandomAccessInputByteSlice(RandomAccessInput randomAccessInput) { + this.randomAccessInput = randomAccessInput; + } + + @Override + public long size() { + return randomAccessInput.length(); + } + + @Override + public void writeAll(DataOutput output) throws IOException { + for (long pos = 0; pos < randomAccessInput.length(); pos++) { + // For buffered inputs and outputs this should be fine. + output.writeByte(randomAccessInput.readByte(pos)); + } + } + + @Override + public long getLong(long pos) throws IOException { + return randomAccessInput.readLong(pos); + } + + @Override + public byte[] getBytes(long pos, int length) throws IOException { + if (length == 0) { + return new byte[0]; + } + byte[] result = new byte[length]; + randomAccessInput.readBytes(pos, result, 0, length); + return result; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java new file mode 100644 index 000000000000..26451dd9f938 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +/** A term dictionary that offer random-access to read a specific term */ +record RandomAccessTermsDict( + TermsStats termsStats, TermsIndex termsIndex, TermsDataStore termsDataStore) {} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java new file mode 100644 index 000000000000..4e8f79738e59 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpackerImpl; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.BytesRef; + +/** + * Holds the bit-packed {@link IntBlockTermState} for a given {@link + * org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermType} + */ +record TermData(TermType termType, ByteSlice metadata, ByteSlice data) { + + IntBlockTermState getTermState(TermStateCodec codec, long ord) throws IOException { + long blockId = ord / TermDataWriter.NUM_TERMS_PER_BLOCK; + long metadataStartPos = blockId * (codec.getMetadataBytesLength() + 8); + long dataStartPos = metadata.getLong(metadataStartPos); + BytesRef metadataBytesRef = + new BytesRef(metadata.getBytes(metadataStartPos + 8, codec.getMetadataBytesLength())); + + int numBitsPerRecord = codec.getNumBitsPerRecord(metadataBytesRef); + int dataBitIndex = numBitsPerRecord * ((int) (ord % TermDataWriter.NUM_TERMS_PER_BLOCK)); + int startBitIndex = dataBitIndex % 8; + int numBytesToRead = (startBitIndex + numBitsPerRecord) / 8; + if ((startBitIndex + numBitsPerRecord) % 8 > 0) { + numBytesToRead += 1; + } + BytesRef dataBytesRef = + new BytesRef(data.getBytes(dataStartPos + dataBitIndex / 8, numBytesToRead)); + + return codec.decodeAt(metadataBytesRef, dataBytesRef, BitUnpackerImpl.INSTANCE, startBitIndex); + } + + static TermData deserializeOnHeap( + DataInput metaInput, DataInput metadataInput, DataInput dataInput) throws IOException { + TermType termType = TermType.fromId(metaInput.readByte()); + long metadataSize = metaInput.readVLong(); + long dataSize = metaInput.readVLong(); + + if (metadataSize > Integer.MAX_VALUE) { + throw new IllegalArgumentException( + 
"Metadata size it too large to store on heap. Must be less than " + Integer.MAX_VALUE); + } + if (dataSize > Integer.MAX_VALUE) { + throw new IllegalArgumentException( + "Data size it too large to store on heap.Must be less than " + Integer.MAX_VALUE); + } + + byte[] metadataBytes = new byte[(int) metadataSize]; + byte[] dataBytes = new byte[(int) dataSize]; + + metadataInput.readBytes(metadataBytes, 0, metadataBytes.length); + dataInput.readBytes(dataBytes, 0, dataBytes.length); + + return new TermData( + termType, new ByteArrayByteSlice(metadataBytes), new ByteArrayByteSlice(dataBytes)); + } + + static TermData deserializeOffHeap( + DataInput metaInput, IndexInput metadataInput, IndexInput dataInput) throws IOException { + TermType termType = TermType.fromId(metaInput.readByte()); + long metadataSize = metaInput.readVLong(); + long dataSize = metaInput.readVLong(); + + RandomAccessInput metadata = + metadataInput.randomAccessSlice(metadataInput.getFilePointer(), metadataSize); + metadataInput.skipBytes(metadataSize); + RandomAccessInput data = dataInput.randomAccessSlice(dataInput.getFilePointer(), dataSize); + dataInput.skipBytes(dataSize); + + return new TermData( + termType, new RandomAccessInputByteSlice(metadata), new RandomAccessInputByteSlice(data)); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java new file mode 100644 index 000000000000..09ab3cba9242 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.DataOutputBitPacker; +import org.apache.lucene.store.DataOutput; + +/** Writes TermData to */ +final class TermDataWriter { + static final int NUM_TERMS_PER_BLOCK = 256; + + private final TermStateCodec termStateCodec; + + private final IntBlockTermStateBuffer buffer = new IntBlockTermStateBuffer(NUM_TERMS_PER_BLOCK); + + private final DataOutput metadataOut; + private final DataOutputBitPacker dataOutputBitPacker; + + private long totalMetaDataBytesWritten; + + TermDataWriter(TermStateCodec termStateCodec, DataOutput metadataOut, DataOutput dataOut) { + this.termStateCodec = termStateCodec; + this.metadataOut = metadataOut; + this.dataOutputBitPacker = new DataOutputBitPacker(dataOut); + } + + void addTermState(IntBlockTermState termState) throws IOException { + buffer.add(termState); + if (buffer.numUsed == NUM_TERMS_PER_BLOCK) { + writeBlock(); + } + } + + void finish() throws IOException { + if (buffer.numUsed > 0) { + writeBlock(); + } + } + + long getTotalMetaDataBytesWritten() { + return totalMetaDataBytesWritten; + } + + long getTotalDataBytesWritten() { + return dataOutputBitPacker.getNumBytesWritten(); + } + + private void 
writeBlock() throws IOException { + metadataOut.writeLong(dataOutputBitPacker.getNumBytesWritten()); + byte[] metadata = + termStateCodec.encodeBlockUpTo(buffer.elements, buffer.numUsed, dataOutputBitPacker); + metadataOut.writeBytes(metadata, metadata.length); + totalMetaDataBytesWritten += metadata.length + 8; + buffer.clear(); + } + + /** act like a minial ArrayList, but provide access to the internal array */ + static class IntBlockTermStateBuffer { + IntBlockTermState[] elements; + int numUsed; + + IntBlockTermStateBuffer(int capacity) { + this.elements = new IntBlockTermState[capacity]; + } + + void add(IntBlockTermState termState) { + elements[numUsed++] = termState; + } + + void clear() { + for (int i = 0; i < numUsed; i++) { + elements[i] = null; + } + numUsed = 0; + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java index a28fb1a94b65..283512c7ae6a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java @@ -17,6 +17,7 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +import java.io.IOException; import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPacker; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpacker; @@ -24,13 +25,30 @@ interface TermStateCodec { + /** Get the number of bytes that the metadata per block needs. */ + int getMetadataBytesLength(); + + /** Get the number of bits per data record within the block, based on the provided metadata. 
*/ + int getNumBitsPerRecord(BytesRef metadataBytes); + /** * Encode the sequence of {@link IntBlockTermState}s with the given bitPacker into a block of * bytes. * * @return the metadata associated with the encoded bytes */ - byte[] encodeBlock(IntBlockTermState[] inputs, BitPacker bitPacker); + default byte[] encodeBlock(IntBlockTermState[] inputs, BitPacker bitPacker) throws IOException { + return encodeBlockUpTo(inputs, inputs.length, bitPacker); + } + + /** + * Encode the sequence of {@link IntBlockTermState}s up to length, with the given bitPacker into a + * block of bytes. + * + * @return the metadata associated with the encoded bytes + */ + byte[] encodeBlockUpTo(IntBlockTermState[] inputs, int upto, BitPacker bitPacker) + throws IOException; /** * Decode out a {@link IntBlockTermState} with the provided bit-unpacker, metadata byte slice and diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java index 8db1c4e81144..8545cce8e8c3 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java @@ -31,14 +31,17 @@ public String toString() { return "TermStateCodecComponent{" + "name='" + name + '\'' + '}'; } - static byte getBitWidth(IntBlockTermState[] termStates, TermStateCodecComponent component) { + static byte getBitWidth( + IntBlockTermState[] termStates, int upTo, TermStateCodecComponent component) { assert termStates.length > 0; + assert upTo > 0 && upTo <= termStates.length; long maxValSeen = -1; long referenceValue = component.isMonotonicallyIncreasing() ? 
component.getTargetValue(termStates[0]) : 0; - for (var termState : termStates) { + for (int i = 0; i < upTo; i++) { + var termState = termStates[i]; maxValSeen = Math.max(maxValSeen, component.getTargetValue(termState) - referenceValue); } return (byte) (64 - Long.numberOfLeadingZeros(maxValSeen)); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java index 061f1c866a7b..3dc0a69f0c05 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java @@ -17,6 +17,7 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; @@ -50,6 +51,16 @@ public TermStateCodecImpl(TermStateCodecComponent[] components) { this.metadataBytesLength = metadataBytesLength; } + @Override + public int getMetadataBytesLength() { + return metadataBytesLength; + } + + @Override + public int getNumBitsPerRecord(BytesRef metadataBytes) { + return deserializedMetadata(metadataBytes).totalBitsPerTermState; + } + private static int getMetadataLength(TermStateCodecComponent component) { // 1 byte for bitWidth; optionally 8 byte more for the reference value return 1 + (component.isMonotonicallyIncreasing() ? 
8 : 0); @@ -119,24 +130,25 @@ public int hashCode() { } @Override - public byte[] encodeBlock(IntBlockTermState[] inputs, BitPacker bitPacker) { - Metadata[] metadataPerComponent = getMetadataPerComponent(inputs); + public byte[] encodeBlockUpTo(IntBlockTermState[] inputs, int uptop, BitPacker bitPacker) + throws IOException { + Metadata[] metadataPerComponent = getMetadataPerComponent(inputs, uptop); byte[] metadataBytes = serializeMetadata(metadataPerComponent); // Encode inputs via the bitpacker - for (var termState : inputs) { - encodeOne(bitPacker, termState, metadataPerComponent); + for (int i = 0; i < uptop; i++) { + encodeOne(bitPacker, inputs[i], metadataPerComponent); } bitPacker.flush(); return metadataBytes; } - private Metadata[] getMetadataPerComponent(IntBlockTermState[] inputs) { - Metadata[] metadataPerComponent = new Metadata[components.length]; + private Metadata[] getMetadataPerComponent(IntBlockTermState[] inputs, int upTo) { + Metadata[] metadataPerComponent = new Metadata[upTo]; for (int i = 0; i < components.length; i++) { var component = components[i]; - byte bitWidth = TermStateCodecComponent.getBitWidth(inputs, component); + byte bitWidth = TermStateCodecComponent.getBitWidth(inputs, upTo, component); long referenceValue = component.isMonotonicallyIncreasing() ? 
component.getTargetValue(inputs[0]) : 0L; metadataPerComponent[i] = new Metadata(bitWidth, referenceValue); @@ -159,7 +171,8 @@ private byte[] serializeMetadata(Metadata[] metadataPerComponent) { } private void encodeOne( - BitPacker bitPacker, IntBlockTermState termState, Metadata[] metadataPerComponent) { + BitPacker bitPacker, IntBlockTermState termState, Metadata[] metadataPerComponent) + throws IOException { for (int i = 0; i < components.length; i++) { var component = components[i]; var metadata = metadataPerComponent[i]; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsDataStore.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsDataStore.java new file mode 100644 index 000000000000..1717d26aa780 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsDataStore.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.util.Arrays; + +/** Holds all {@link TermData} for all {@link TermType} for a field. 
*/ +class TermsDataStore { + private final TermData[] dataPerTermType; + + private TermsDataStore(TermData[] dataPerTermType) { + this.dataPerTermType = dataPerTermType; + } + + static class Builder { + private final TermData[] dataPerTermType; + + Builder() { + dataPerTermType = new TermData[TermType.NUM_TOTAL_TYPES]; + Arrays.fill(dataPerTermType, null); + } + + void add(TermData termData) { + assert dataPerTermType[termData.termType().getId()] == null; + + dataPerTermType[termData.termType().getId()] = termData; + } + + TermsDataStore build() { + return new TermsDataStore(dataPerTermType); + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java new file mode 100644 index 000000000000..29a9c4124e7b --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.CompiledAutomaton; + +class TermsImpl extends Terms { + private final FieldInfo fieldInfo; + + private final RandomAccessTermsDict termsDict; + + public TermsImpl(TermsStats stats, FieldInfo fieldInfo, RandomAccessTermsDict termsDict) { + this.fieldInfo = fieldInfo; + this.termsDict = termsDict; + } + + @Override + public long size() throws IOException { + return termsDict.termsStats().size(); + } + + @Override + public long getSumTotalTermFreq() throws IOException { + return termsDict.termsStats().sumTotalTermFreq(); + } + + @Override + public long getSumDocFreq() throws IOException { + return termsDict.termsStats().sumDocFreq(); + } + + @Override + public int getDocCount() throws IOException { + return termsDict.termsStats().docCount(); + } + + @Override + public boolean hasFreqs() { + return fieldInfo.getIndexOptions().ordinal() >= IndexOptions.DOCS_AND_FREQS.ordinal(); + } + + @Override + public boolean hasOffsets() { + return fieldInfo.getIndexOptions().ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal(); + } + + @Override + public boolean hasPositions() { + return fieldInfo.getIndexOptions().ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal(); + } + + @Override + public boolean hasPayloads() { + return fieldInfo.hasPayloads(); + } + + @Override + public BytesRef getMin() throws IOException { + return termsDict.termsStats().minTerm(); + } + + @Override + public BytesRef getMax() throws IOException { + return termsDict.termsStats().maxTerm(); + } + + @Override + public TermsEnum iterator() throws IOException { + // TODO: implement me + return null; + } + + @Override + 
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + // TODO: implement me + return null; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java index e43e495a48ef..917989b51b43 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java @@ -18,8 +18,12 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.OffHeapFSTStore; +import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.Util; record TermsIndex(FST fst) { @@ -32,4 +36,19 @@ TypeAndOrd getTerm(BytesRef term) throws IOException { } public record TypeAndOrd(TermType termType, long ord) {} + + public void serialize(DataOutput metaOut, DataOutput dataOut) throws IOException { + fst.save(metaOut, dataOut); + } + + public TermsIndex deserialize(DataInput metaIn, DataInput dataIn, boolean loadOffHeap) + throws IOException { + FST fst; + if (loadOffHeap) { + fst = new FST<>(metaIn, dataIn, PositiveIntOutputs.getSingleton(), new OffHeapFSTStore()); + } else { + fst = new FST<>(metaIn, dataIn, PositiveIntOutputs.getSingleton()); + } + return new TermsIndex(fst); + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java new file mode 100644 index 000000000000..d5156d7455e9 --- /dev/null +++ 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import org.apache.lucene.util.BytesRef; + +/** Data class that holds starts for term stats for a field */ +record TermsStats( + long size, + long sumTotalTermFreq, + long sumDocFreq, + int docCount, + BytesRef minTerm, + BytesRef maxTerm) {} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java index 06dec80d70dc..1ad8b0fb36e8 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java @@ -17,12 +17,14 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; +import java.io.IOException; + /** Interface for bit-packing */ public interface BitPacker { /** Pack the low `numBits` bits of `value` */ - void add(long value, int numBits); + 
void add(long value, int numBits) throws IOException; /** Flush any pending byte */ - void flush(); + void flush() throws IOException; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java index 5d5aea06dc57..dc405c717072 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java @@ -17,6 +17,8 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; +import java.io.IOException; + /** * Implementation of {@link BitPacker}. The behavior the is abstracted out here is how to write a * byte. This is useful as we can wire the byte-writing to byte[], stream or IndexInput, etc. @@ -25,11 +27,11 @@ abstract class BitPackerImplBase implements BitPacker { private byte buffer; private int bufferNumBitsUsed; - abstract void writeByte(byte b); + abstract void writeByte(byte b) throws IOException; /** {@inheritDoc}. value could be larger than 2^numBits - 1 but the higher bits won't be used. 
*/ @Override - public void add(long value, int numBits) { + public void add(long value, int numBits) throws IOException { assert numBits < 64; // clear bits higher than `numBits` value &= (1L << numBits) - 1; @@ -53,7 +55,7 @@ public void add(long value, int numBits) { } @Override - public void flush() { + public void flush() throws IOException { if (bufferNumBitsUsed > 0) { writeByte(buffer); bufferNumBitsUsed = 0; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/DataOutputBitPacker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/DataOutputBitPacker.java new file mode 100644 index 000000000000..8e92b9faa326 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/DataOutputBitPacker.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; + +/** + * A {@link BitPacker} implementation that writes to a {@link org.apache.lucene.store.DataOutput} + */ +public final class DataOutputBitPacker extends BitPackerImplBase { + private final DataOutput dataOut; + + private long numBytesWritten; + + public DataOutputBitPacker(DataOutput dataOut) { + this.dataOut = dataOut; + } + + @Override + void writeByte(byte b) throws IOException { + dataOut.writeByte(b); + numBytesWritten++; + } + + public long getNumBytesWritten() { + return numBytesWritten; + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java new file mode 100644 index 000000000000..aab496a41937 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TestTermStateCodecImpl.TermStateTestFixture; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPerBytePacker; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestTermDataWriter extends LuceneTestCase { + + public void testWriterAndDeserialize() throws IOException { + TermStateTestFixture testFixture = TestTermStateCodecImpl.getTermStateTestFixture(777); + TermType expectedTermType = TermType.fromId(7); + + try (Directory testDir = newDirectory()) { + IndexOutput metaOut = testDir.createOutput("segment_meta", IOContext.DEFAULT); + IndexOutput metadataOut = testDir.createOutput("term_meta_1", IOContext.DEFAULT); + IndexOutput dataOut = testDir.createOutput("term_data_11", IOContext.DEFAULT); + TermDataWriter writer = new TermDataWriter(testFixture.codec(), metadataOut, dataOut); + for (var termState : testFixture.termStatesArray()) { + writer.addTermState(termState); + } + writer.finish(); + metaOut.writeByte((byte) expectedTermType.getId()); + metaOut.writeVLong(writer.getTotalMetaDataBytesWritten()); + metaOut.writeVLong(writer.getTotalDataBytesWritten()); + metaOut.close(); + metadataOut.close(); + dataOut.close(); + + BitPerBytePacker referenceBitPacker = new BitPerBytePacker(); + // total size 777; there will be 4 blocks total. + // The extra 8 byte per block is the long offset for where the block starts within data bytes. 
+ byte[] expectedMetadata = new byte[(testFixture.codec().getMetadataBytesLength() + 8) * 4]; + ByteArrayDataOutput expectedMetadataOut = new ByteArrayDataOutput(expectedMetadata); + for (int start = 0; + start < testFixture.termStatesArray().length; + start += TermDataWriter.NUM_TERMS_PER_BLOCK) { + expectedMetadataOut.writeLong(referenceBitPacker.getCompactBytes().length); + byte[] metadata = + testFixture + .codec() + .encodeBlock( + Arrays.copyOfRange( + testFixture.termStatesArray(), + start, + Math.min( + start + TermDataWriter.NUM_TERMS_PER_BLOCK, + testFixture.termStatesArray().length)), + referenceBitPacker); + expectedMetadataOut.writeBytes(metadata, 0, metadata.length); + } + ByteSlice expectedDataSlice = new ByteArrayByteSlice(referenceBitPacker.getCompactBytes()); + ByteSlice expectedMetadataSlice = new ByteArrayByteSlice(expectedMetadata); + TermData expected = new TermData(expectedTermType, expectedMetadataSlice, expectedDataSlice); + + IndexInput metaIn = testDir.openInput("segment_meta", IOContext.DEFAULT); + IndexInput metadataIn = testDir.openInput("term_meta_1", IOContext.DEFAULT); + IndexInput dataIn = testDir.openInput("term_data_11", IOContext.DEFAULT); + + TermData actual = + TermData.deserializeOnHeap(metaIn.clone(), metadataIn.clone(), dataIn.clone()); + assertEquals(expected.termType().getId(), actual.termType().getId()); + assertByteSlice(expected.metadata(), actual.metadata()); + assertByteSlice(expected.data(), actual.data()); + testDecodeTermState(testFixture, actual); + + actual = TermData.deserializeOffHeap(metaIn.clone(), metadataIn.clone(), dataIn.clone()); + assertEquals(expected.termType().getId(), actual.termType().getId()); + assertByteSlice(expected.metadata(), actual.metadata()); + assertByteSlice(expected.data(), actual.data()); + testDecodeTermState(testFixture, actual); + + metaIn.close(); + metadataIn.close(); + dataIn.close(); + } + } + + private static void testDecodeTermState(TermStateTestFixture testFixture, TermData 
actual) + throws IOException { + for (int i = 0; i < testFixture.termStatesArray().length; i++) { + IntBlockTermState expectedTermState = testFixture.termStatesArray()[i]; + IntBlockTermState decoded = actual.getTermState(testFixture.codec(), i); + assertEquals(expectedTermState.docFreq, decoded.docFreq); + assertEquals(expectedTermState.docStartFP, decoded.docStartFP); + } + } + + private static void assertByteSlice(ByteSlice expected, ByteSlice actual) throws IOException { + assertEquals(expected.size(), actual.size()); + byte[] bytesExpected = new byte[(int) expected.size()]; + ByteArrayDataOutput out = new ByteArrayDataOutput(bytesExpected); + expected.writeAll(out); + + byte[] bytesActual = new byte[(int) actual.size()]; + ByteArrayDataOutput out1 = new ByteArrayDataOutput(bytesActual); + actual.writeAll(out1); + assertArrayEquals(bytesExpected, bytesActual); + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java index 15a5e940986c..330017025cd6 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java @@ -40,7 +40,8 @@ public void testGetBitWidth() { .toArray(IntBlockTermState[]::new); byte bitWidth = - TermStateCodecComponent.getBitWidth(termStates, TermStateCodecComponent.DocFreq.INSTANCE); + TermStateCodecComponent.getBitWidth( + termStates, termStates.length, TermStateCodecComponent.DocFreq.INSTANCE); assertEquals(expectedMaxBits, bitWidth); } @@ -68,7 +69,7 @@ public void testGetBitWidthWithIncreasingValues() { byte bitWidth = TermStateCodecComponent.getBitWidth( - termStates, TermStateCodecComponent.DocStartFP.INSTANCE); + termStates, termStates.length, 
TermStateCodecComponent.DocStartFP.INSTANCE); assertEquals(expectedMaxBits, bitWidth); } } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java index 175ac30e9407..a747b24a3144 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java @@ -17,6 +17,7 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +import java.io.IOException; import java.util.ArrayList; import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; import org.apache.lucene.index.IndexOptions; @@ -30,73 +31,60 @@ public class TestTermStateCodecImpl extends LuceneTestCase { - public void testEncodeDecode() { - TermStateCodecImpl codec = - new TermStateCodecImpl( - new TermStateCodecComponent[] { - TermStateCodecComponent.DocFreq.INSTANCE, TermStateCodecComponent.DocStartFP.INSTANCE, - }); - - ArrayList termStates = new ArrayList<>(); - long maxDocFreqSeen = -1; - long docStartFPBase = random().nextLong(Long.MAX_VALUE >> 1); - long maxDocStartFPDeltaSeen = -1; - for (int i = 0; i < random().nextInt(2, 256); i++) { - var termState = new IntBlockTermState(); - termState.docFreq = random().nextInt(1, 1 << random().nextInt(1, 31)); - if (i == 0) { - termState.docStartFP = docStartFPBase; - } else { - termState.docStartFP = termStates.get(i - 1).docStartFP + random().nextLong(1024); - maxDocStartFPDeltaSeen = - Math.max(maxDocStartFPDeltaSeen, termState.docStartFP - docStartFPBase); - } - maxDocFreqSeen = Math.max(maxDocFreqSeen, termState.docFreq); - termStates.add(termState); - } - - IntBlockTermState[] termStatesArray = termStates.toArray(IntBlockTermState[]::new); + public void testEncodeDecode() throws IOException { + 
TermStateTestFixture result = getTermStateTestFixture(256); BitPerBytePacker bitPerBytePacker = new BitPerBytePacker(); - byte[] metadata = codec.encodeBlock(termStatesArray, bitPerBytePacker); + byte[] metadata = result.codec().encodeBlock(result.termStatesArray(), bitPerBytePacker); // For the metadata, we expect // 0: DocFreq.bitWidth, // 1: DocStartFP.bitWidth, // [2-10]: DocStartFP.referenceValue; - int expectedDocFreqBitWidth = 64 - Long.numberOfLeadingZeros(maxDocFreqSeen); - int expectedDocStartFPBitWidth = 64 - Long.numberOfLeadingZeros(maxDocStartFPDeltaSeen); + int expectedDocFreqBitWidth = 64 - Long.numberOfLeadingZeros(result.maxDocFreqSeen()); + int expectedDocStartFPBitWidth = + 64 - Long.numberOfLeadingZeros(result.maxDocStartFPDeltaSeen()); assertEquals(10, metadata.length); assertEquals(expectedDocFreqBitWidth, metadata[0]); assertEquals(expectedDocStartFPBitWidth, metadata[1]); ByteArrayDataInput byteArrayDataInput = new ByteArrayDataInput(metadata, 2, 8); - assertEquals(docStartFPBase, byteArrayDataInput.readLong()); + assertEquals(result.docStartFPBase(), byteArrayDataInput.readLong()); // Assert with real bit-packer we get the same bytes FixedSizeByteArrayBitPacker fixedSizeByteArrayBitPacker = new FixedSizeByteArrayBitPacker(bitPerBytePacker.getCompactBytes().length); - codec.encodeBlock(termStatesArray, fixedSizeByteArrayBitPacker); + result.codec().encodeBlock(result.termStatesArray(), fixedSizeByteArrayBitPacker); assertArrayEquals(bitPerBytePacker.getCompactBytes(), fixedSizeByteArrayBitPacker.getBytes()); // Assert that each term state is the same after the encode-decode roundtrip. 
BytesRef metadataBytes = new BytesRef(metadata); BytesRef dataBytes = new BytesRef(bitPerBytePacker.getBytes()); - assertBlockRoundTrip(termStatesArray, codec, metadataBytes, dataBytes, bitPerBytePacker); + assertBlockRoundTrip( + result.termStatesArray(), result.codec(), metadataBytes, dataBytes, bitPerBytePacker); // With real compact bits instead of bit-per-byte dataBytes = new BytesRef(bitPerBytePacker.getCompactBytes()); assertBlockRoundTrip( - termStatesArray, codec, metadataBytes, dataBytes, BitUnpackerImpl.INSTANCE); + result.termStatesArray(), + result.codec(), + metadataBytes, + dataBytes, + BitUnpackerImpl.INSTANCE); // Also test decoding that doesn't begin at the start of the block. - int pos = random().nextInt(termStatesArray.length); + int pos = random().nextInt(result.termStatesArray().length); int startBitIndex = pos > 0 ? random().nextInt(pos) : 0; int recordSize = expectedDocFreqBitWidth + expectedDocStartFPBitWidth; // With bit-per-byte bytes dataBytes = new BytesRef(bitPerBytePacker.getBytes(), pos * recordSize - startBitIndex, recordSize); assertDecodeAt( - codec, metadataBytes, dataBytes, bitPerBytePacker, startBitIndex, termStatesArray[pos]); + result.codec(), + metadataBytes, + dataBytes, + bitPerBytePacker, + startBitIndex, + result.termStatesArray()[pos]); // With compact bytes int startByteIndex = pos * recordSize / 8; @@ -104,14 +92,51 @@ public void testEncodeDecode() { int length = endByteIndex - startByteIndex + ((pos + 1) * recordSize % 8 == 0 ? 
0 : 1); dataBytes = new BytesRef(bitPerBytePacker.getCompactBytes(), startByteIndex, length); assertDecodeAt( - codec, + result.codec(), metadataBytes, dataBytes, BitUnpackerImpl.INSTANCE, (pos * recordSize) % 8, - termStatesArray[pos]); + result.termStatesArray()[pos]); } + public static TermStateTestFixture getTermStateTestFixture(int size) { + TermStateCodecImpl codec = + new TermStateCodecImpl( + new TermStateCodecComponent[] { + TermStateCodecComponent.DocFreq.INSTANCE, TermStateCodecComponent.DocStartFP.INSTANCE, + }); + + ArrayList termStates = new ArrayList<>(); + long maxDocFreqSeen = -1; + long docStartFPBase = random().nextLong(Long.MAX_VALUE >> 1); + long maxDocStartFPDeltaSeen = -1; + for (int i = 0; i < size; i++) { + var termState = new IntBlockTermState(); + termState.docFreq = random().nextInt(1, 1 << random().nextInt(1, 31)); + if (i == 0) { + termState.docStartFP = docStartFPBase; + } else { + termState.docStartFP = termStates.get(i - 1).docStartFP + random().nextLong(1024); + maxDocStartFPDeltaSeen = + Math.max(maxDocStartFPDeltaSeen, termState.docStartFP - docStartFPBase); + } + maxDocFreqSeen = Math.max(maxDocFreqSeen, termState.docFreq); + termStates.add(termState); + } + + IntBlockTermState[] termStatesArray = termStates.toArray(IntBlockTermState[]::new); + return new TermStateTestFixture( + codec, maxDocFreqSeen, docStartFPBase, maxDocStartFPDeltaSeen, termStatesArray); + } + + public record TermStateTestFixture( + TermStateCodecImpl codec, + long maxDocFreqSeen, + long docStartFPBase, + long maxDocStartFPDeltaSeen, + IntBlockTermState[] termStatesArray) {} + private static void assertDecodeAt( TermStateCodecImpl codec, BytesRef metadataBytes, diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java index 2df2a74907e2..b1bf4bfa463e 100644 --- 
a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java @@ -56,6 +56,10 @@ public byte[] getBytes() { } public byte[] getCompactBytes() { + if (totalNumBits == 0) { + return new byte[0]; + } + int len = (totalNumBits - 1) / 8 + 1; // round up byte[] bytes = new byte[len]; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java index 84ae93fe4e52..9f50777176ac 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java @@ -17,12 +17,13 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; +import java.io.IOException; import java.util.Arrays; import org.apache.lucene.tests.util.LuceneTestCase; public class TestBitPackerImpl extends LuceneTestCase { - public void testBasic() { + public void testBasic() throws IOException { FixedSizeByteArrayBitPacker fixedSizeByteArrayBitPacker = new FixedSizeByteArrayBitPacker(5); for (int i = 1; i <= 10; i++) { fixedSizeByteArrayBitPacker.add(i, 4); @@ -33,7 +34,7 @@ public void testBasic() { assertArrayEquals(expectedBytes, fixedSizeByteArrayBitPacker.getBytes()); } - public void testRandom() { + public void testRandom() throws IOException { ValueAndBitWidth[] randomInputs = ValueAndBitWidth.getRandomArray(random(), 1000); int totalNumberBits = Arrays.stream(randomInputs).mapToInt(ValueAndBitWidth::bitWidth).sum(); From 877d1cf3cda468cc0d52655feadbc3d86dade3ea Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Thu, 16 Nov 2023 14:59:01 -0800 Subject: [PATCH 25/57] remove 
unneeded initialization of int to 0. --- .../randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java index a8be9aca89bd..b40075a1ee8d 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java @@ -23,7 +23,7 @@ */ public final class FixedSizeByteArrayBitPacker extends BitPackerImplBase { private final byte[] bytes; - private int numBytesUsed = 0; + private int numBytesUsed; public FixedSizeByteArrayBitPacker(int capacity) { this.bytes = new byte[capacity]; From 2cdfb04a2dedcbe978371d4074e37deeb6240c0c Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Thu, 16 Nov 2023 15:27:10 -0800 Subject: [PATCH 26/57] Support serialize/deserialize for TermsStats --- .../lucene99/randomaccess/TermsStats.java | 43 +++++++++++- .../lucene99/randomaccess/TestTermsStats.java | 66 +++++++++++++++++++ 2 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java index d5156d7455e9..af58932483c2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java @@ -17,6 +17,10 @@ package 
org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +import java.io.IOException; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.BytesRef; /** Data class that holds starts for term stats for a field */ @@ -26,4 +30,41 @@ record TermsStats( long sumDocFreq, int docCount, BytesRef minTerm, - BytesRef maxTerm) {} + BytesRef maxTerm) { + + void serialize(DataOutput output) throws IOException { + output.writeVLong(size); + output.writeVLong(sumTotalTermFreq); + output.writeVLong(sumDocFreq); + output.writeVInt(docCount); + writeBytesRef(output, minTerm); + writeBytesRef(output, maxTerm); + } + + static TermsStats deserialize(DataInput input) throws IOException { + return new TermsStats( + input.readVLong(), + input.readVLong(), + input.readVLong(), + input.readVInt(), + readBytesRef(input), + readBytesRef(input)); + } + + static void writeBytesRef(DataOutput output, BytesRef bytes) throws IOException { + output.writeVInt(bytes.length); + output.writeBytes(bytes.bytes, bytes.offset, bytes.length); + } + + static BytesRef readBytesRef(DataInput input) throws IOException { + int numBytes = input.readVInt(); + if (numBytes < 0) { + throw new CorruptIndexException("invalid bytes length: " + numBytes, input); + } + + byte[] bytes = new byte[numBytes]; + input.readBytes(bytes, 0, numBytes); + + return new BytesRef(bytes); + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java new file mode 100644 index 000000000000..b7ca5f2efbe4 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; + + +public class TestTermsStats extends LuceneTestCase { + + public void testRoundTrip() throws IOException { + TermsStats expected = makeRandom(); + + try (Directory dir = newDirectory()) { + IndexOutput output = dir.createOutput("terms_stats", IOContext.DEFAULT); + expected.serialize(output); + output.close(); + + IndexInput input = dir.openInput("terms_stats", IOContext.DEFAULT); + TermsStats actual = TermsStats.deserialize(input); + + assertEquals(expected, actual); + input.close(); + } + } + + private TermsStats makeRandom() { + byte[] minBytes = getRandomBytes(); + byte[] maxBytes = getRandomBytes(); + return new TermsStats( + random().nextLong(1, Long.MAX_VALUE), + random().nextLong(1, Long.MAX_VALUE), + random().nextLong(1, Long.MAX_VALUE), + random().nextInt(1, Integer.MAX_VALUE), + new BytesRef(minBytes), + new BytesRef(maxBytes) + ); + } + + private static byte[] getRandomBytes() { + byte[] minBytes = 
new byte[random().nextInt(100)]; + random().nextBytes(minBytes); + return minBytes; + } +} \ No newline at end of file From 777c40d3bad78320f25c11c398712b3a07ea0789 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Thu, 16 Nov 2023 15:28:00 -0800 Subject: [PATCH 27/57] Explictlty mark the generic type arugment of FSTCompiler in TermsIndexBuilder Some platform+jdk can't inference the type. See: https://github.com/apache/lucene/actions/runs/6897462250/job/18765715011?pr=12688 --- .../sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java index 9484a0505458..f552adba433c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java @@ -35,7 +35,7 @@ final class TermsIndexBuilder { private final long[] countPerType = new long[TermType.NUM_TOTAL_TYPES]; private final FSTCompiler fstCompiler = - new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()); + new FSTCompiler(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()); TermsIndexBuilder() { Arrays.fill(countPerType, -1); From 8a0b1ccd558227680fce1837e198cf89ce6e0c11 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Fri, 17 Nov 2023 16:42:27 -0800 Subject: [PATCH 28/57] Implement writing random-access term dictionary * RandomAccessTermsDictWriter writes to index files * RandomAccessTermsDict deserializes from index files and support a lookup API --- .../randomaccess/RandomAccessTermsDict.java | 60 +++++- .../RandomAccessTermsDictWriter.java | 182 +++++++++++++++++ .../lucene99/randomaccess/TermData.java | 9 +- .../lucene99/randomaccess/TermDataReader.java | 74 +++++++ 
.../lucene99/randomaccess/TermDataWriter.java | 2 +- .../randomaccess/TermStateCodecImpl.java | 19 +- .../lucene99/randomaccess/TermsDataStore.java | 48 ----- .../lucene99/randomaccess/TermsImpl.java | 2 +- .../lucene99/randomaccess/TermsIndex.java | 6 +- .../randomaccess/TermsIndexBuilder.java | 4 +- .../lucene99/randomaccess/TermsStats.java | 3 + .../TestRandomAccessTermsDictWriter.java | 184 ++++++++++++++++++ .../randomaccess/TestTermDataWriter.java | 6 +- .../randomaccess/TestTermStateCodecImpl.java | 10 +- .../lucene99/randomaccess/TestTermsStats.java | 62 +++--- 15 files changed, 554 insertions(+), 117 deletions(-) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java delete mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsDataStore.java create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java index 26451dd9f938..39947f9ff78c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java @@ -17,6 +17,64 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +import java.io.IOException; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.BytesRef; + /** 
A term dictionary that offer random-access to read a specific term */ record RandomAccessTermsDict( - TermsStats termsStats, TermsIndex termsIndex, TermsDataStore termsDataStore) {} + TermsStats termsStats, TermsIndex termsIndex, TermDataReader termDataReader) { + + IntBlockTermState getTermState(BytesRef term) throws IOException { + TermsIndex.TypeAndOrd typeAndOrd = termsIndex.getTerm(term); + return termDataReader.getTermState(typeAndOrd.termType(), typeAndOrd.ord()); + } + + static RandomAccessTermsDict deserialize( + IndexOptionsProvider indexOptionsProvider, + DataInput metaInput, + DataInput termIndexInput, + TermDataInputProvider termDataInputProvider) + throws IOException { + + // (1) deserialize field stats + TermsStats termsStats = TermsStats.deserialize(metaInput); + IndexOptions indexOptions = indexOptionsProvider.getIndexOptions(termsStats.fieldNumber()); + + // (2) deserialize terms index + TermsIndex termsIndex = + TermsIndex.deserialize(metaInput, termIndexInput, /* load off heap */ true); + + // (3) deserialize all the term data by each TermType + // (3.1) number of unique TermType this field has + int numTermTypes = metaInput.readByte(); + + // (3.2) read per TermType + TermDataReader.Builder termDataReaderBuilder = new TermDataReader.Builder(indexOptions); + for (int i = 0; i < numTermTypes; i++) { + TermType termType = TermType.fromId(metaInput.readByte()); + TermDataInput termDataInput = termDataInputProvider.getTermDataInputForType(termType); + termDataReaderBuilder.readOne( + termType, metaInput, termDataInput.metadataInput, termDataInput.dataInput); + } + + return new RandomAccessTermsDict(termsStats, termsIndex, termDataReaderBuilder.build()); + } + + @FunctionalInterface + interface IndexOptionsProvider { + + IndexOptions getIndexOptions(int fieldNumber); + } + + record TermDataInput(IndexInput metadataInput, IndexInput dataInput) {} + + @FunctionalInterface + interface TermDataInputProvider { + + TermDataInput 
getTermDataInputForType(TermType termType) throws IOException; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java new file mode 100644 index 000000000000..a89f6e94c6de --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java @@ -0,0 +1,182 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; + +/** Class to write the index files for one field. 
*/ +final class RandomAccessTermsDictWriter { + /** externally provided * */ + private final int filedNumber; + + private final IndexOptions indexOptions; + private final DataOutput metaOutput; + + private final DataOutput indexOutput; + + private final TermDataOutputProvider termDataOutputProvider; + + /** Internal states below * */ + private final TermDataOutput[] termDataOutputPerType = + new TermDataOutput[TermType.NUM_TOTAL_TYPES]; + + private final TermsIndexBuilder termsIndexBuilder = new TermsIndexBuilder(); + + private final TermDataWriter[] termDataWriterPerType = + new TermDataWriter[TermType.NUM_TOTAL_TYPES]; + + private final TermStatsTracker termStatsTracker; + + private BytesRef previousTerm; + + RandomAccessTermsDictWriter( + int filedNumber, + IndexOptions indexOptions, + DataOutput metaOutput, + DataOutput indexOutput, + TermDataOutputProvider termDataOutputProvider) { + this.filedNumber = filedNumber; + this.indexOptions = indexOptions; + this.metaOutput = metaOutput; + this.indexOutput = indexOutput; + this.termDataOutputProvider = termDataOutputProvider; + this.termStatsTracker = new TermStatsTracker(filedNumber); + } + + void add(BytesRef term, IntBlockTermState termState) throws IOException { + TermType termType = TermType.fromTermState(termState); + if (previousTerm == null) { + // first term, which is also the minimum term + termStatsTracker.setMinTerm(term); + } + termStatsTracker.recordTerm(termState); + previousTerm = term; + termsIndexBuilder.addTerm(term, termType); + TermDataWriter termDataWriter = getTermDataWriterForType(termType); + termDataWriter.addTermState(termState); + } + + private TermDataWriter getTermDataWriterForType(TermType termType) throws IOException { + if (termDataWriterPerType[termType.getId()] != null) { + return termDataWriterPerType[termType.getId()]; + } + + TermDataOutput termDataOutput = getTermDataOutput(termType); + TermDataWriter termDataWriter = + new TermDataWriter( + 
TermStateCodecImpl.getCodec(termType, indexOptions), + termDataOutput.metadataOutput(), + termDataOutput.dataOutput()); + termDataWriterPerType[termType.getId()] = termDataWriter; + return termDataWriter; + } + + private TermDataOutput getTermDataOutput(TermType termType) throws IOException { + if (termDataOutputPerType[termType.getId()] == null) { + termDataOutputPerType[termType.getId()] = + termDataOutputProvider.getTermDataOutputForType(termType); + } + return termDataOutputPerType[termType.getId()]; + } + + void finish(int docCount) throws IOException { + // finish up TermsStats for this field + termStatsTracker.setMaxTerm(previousTerm); + termStatsTracker.setDocCount(docCount); + TermsStats termsStats = termStatsTracker.finish(); + // (1) Write field metadata + termsStats.serialize(metaOutput); + + // (2) serialize the term index + termsIndexBuilder.build().serialize(metaOutput, indexOutput); + + // (3) serialize information needed to decode per-TermType TermData + // (3.1) number of unique TermTypes this field has + int numTermTypesSeen = 0; + for (var termDataWriter : termDataWriterPerType) { + if (termDataWriter != null) { + numTermTypesSeen += 1; + } + } + metaOutput.writeByte((byte) numTermTypesSeen); + + // (3.2) (termType, metadataLength, dataLength) for each TermData + for (int i = 0; i < termDataWriterPerType.length; i++) { + var termDataWriter = termDataWriterPerType[i]; + if (termDataWriter != null) { + termDataWriter.finish(); + metaOutput.writeByte((byte) i); + metaOutput.writeVLong(termDataWriter.getTotalMetaDataBytesWritten()); + metaOutput.writeVLong(termDataWriter.getTotalDataBytesWritten()); + } + } + } + + record TermDataOutput(IndexOutput metadataOutput, IndexOutput dataOutput) {} + + @FunctionalInterface + static interface TermDataOutputProvider { + + TermDataOutput getTermDataOutputForType(TermType termType) throws IOException; + } + + static final class TermStatsTracker { + final int fieldNumber; + long size; + long sumTotalTermFreq; + 
long sumDocFreq; + int docCount; + BytesRef minTerm; + BytesRef maxTerm; + + TermStatsTracker(int fieldNumber) { + this.fieldNumber = fieldNumber; + } + + void recordTerm(IntBlockTermState termState) { + size += 1; + sumDocFreq += termState.docFreq; + sumTotalTermFreq += termState.totalTermFreq; + } + + void setDocCount(int docCount) { + this.docCount = docCount; + } + + void setMinTerm(BytesRef minTerm) { + this.minTerm = minTerm; + } + + void setMaxTerm(BytesRef maxTerm) { + this.maxTerm = maxTerm; + } + + TermsStats finish() { + assert docCount > 0 && minTerm != null && maxTerm != null; + + return new TermsStats( + fieldNumber, size, sumTotalTermFreq, sumDocFreq, docCount, minTerm, maxTerm); + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java index 4e8f79738e59..9c74ffc83835 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java @@ -29,7 +29,7 @@ * Holds the bit-packed {@link IntBlockTermState} for a given {@link * org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermType} */ -record TermData(TermType termType, ByteSlice metadata, ByteSlice data) { +record TermData(ByteSlice metadata, ByteSlice data) { IntBlockTermState getTermState(TermStateCodec codec, long ord) throws IOException { long blockId = ord / TermDataWriter.NUM_TERMS_PER_BLOCK; @@ -53,7 +53,6 @@ IntBlockTermState getTermState(TermStateCodec codec, long ord) throws IOExceptio static TermData deserializeOnHeap( DataInput metaInput, DataInput metadataInput, DataInput dataInput) throws IOException { - TermType termType = TermType.fromId(metaInput.readByte()); long metadataSize = metaInput.readVLong(); long dataSize = metaInput.readVLong(); @@ -72,13 +71,11 @@ static TermData deserializeOnHeap( 
metadataInput.readBytes(metadataBytes, 0, metadataBytes.length); dataInput.readBytes(dataBytes, 0, dataBytes.length); - return new TermData( - termType, new ByteArrayByteSlice(metadataBytes), new ByteArrayByteSlice(dataBytes)); + return new TermData(new ByteArrayByteSlice(metadataBytes), new ByteArrayByteSlice(dataBytes)); } static TermData deserializeOffHeap( DataInput metaInput, IndexInput metadataInput, IndexInput dataInput) throws IOException { - TermType termType = TermType.fromId(metaInput.readByte()); long metadataSize = metaInput.readVLong(); long dataSize = metaInput.readVLong(); @@ -89,6 +86,6 @@ static TermData deserializeOffHeap( dataInput.skipBytes(dataSize); return new TermData( - termType, new RandomAccessInputByteSlice(metadata), new RandomAccessInputByteSlice(data)); + new RandomAccessInputByteSlice(metadata), new RandomAccessInputByteSlice(data)); } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java new file mode 100644 index 000000000000..3a7ebd1e8a7c --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.IndexInput; + +/** + * Holds all {@link TermData} per {@link TermType} for a field. Also manages the proper codec needed + * per TermType. + */ +record TermDataReader(TermDataAndCodec[] termDataAndCodecs) { + + IntBlockTermState getTermState(TermType termType, long ord) throws IOException { + assert termDataAndCodecs[termType.getId()] != null; + var dataAndCodec = termDataAndCodecs[termType.getId()]; + IntBlockTermState termState = dataAndCodec.termData.getTermState(dataAndCodec.codec, ord); + + // need to filling some default values for the term state + // in order to meet the expectations of the postings reader + if (termType.hasSingletonDoc()) { + termState.docFreq = 1; + } + if (termType.hasSkipData() == false) { + termState.skipOffset = -1; + } + if (termType.hasLastPositionBlockOffset() == false) { + termState.lastPosBlockOffset = -1; + } + + return termState; + } + + static class Builder { + final IndexOptions indexOptions; + final TermDataAndCodec[] termDataAndCodecs = new TermDataAndCodec[TermType.NUM_TOTAL_TYPES]; + + Builder(IndexOptions indexOptions) { + this.indexOptions = indexOptions; + } + + void readOne( + TermType termType, DataInput metaIn, IndexInput termMetadataIn, IndexInput termDataIn) + throws IOException { + TermData termData = TermData.deserializeOffHeap(metaIn, termMetadataIn, termDataIn); + TermStateCodec codec = TermStateCodecImpl.getCodec(termType, indexOptions); + termDataAndCodecs[termType.getId()] = new TermDataAndCodec(termData, codec); + } + + TermDataReader build() { + return new TermDataReader(termDataAndCodecs); + } + } 
+ + record TermDataAndCodec(TermData termData, TermStateCodec codec) {} +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java index 09ab3cba9242..d69c45de9abc 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java @@ -22,7 +22,7 @@ import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.DataOutputBitPacker; import org.apache.lucene.store.DataOutput; -/** Writes TermData to */ +/** Writes TermData to two separate {@link DataOutput} one for metadata, another for term data */ final class TermDataWriter { static final int NUM_TERMS_PER_BLOCK = 256; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java index 3dc0a69f0c05..734e24a7a057 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java @@ -73,31 +73,24 @@ public static TermStateCodecImpl getCodec(TermType termType, IndexOptions indexO assert !(termType.hasSkipData() && termType.hasSingletonDoc()); ArrayList components = new ArrayList<>(); - // handle docs + // handle docs and docFreq if (termType.hasSingletonDoc()) { components.add(SingletonDocId.INSTANCE); } else { components.add(DocStartFP.INSTANCE); + components.add(DocFreq.INSTANCE); } // handle skip data if (termType.hasSkipData()) { components.add(SkipOffset.INSTANCE); } - // handle docFreq - boolean totalTermFeqAdded = false; + + // handle freq if (indexOptions.ordinal() >= 
IndexOptions.DOCS_AND_FREQS.ordinal()) { - if (termType.hasSingletonDoc()) { - components.add(TotalTermFreq.INSTANCE); - totalTermFeqAdded = true; - } else { - components.add(DocFreq.INSTANCE); - } + components.add(TotalTermFreq.INSTANCE); } // handle positions if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { - if (!totalTermFeqAdded) { - components.add(TotalTermFreq.INSTANCE); - } components.add(PositionStartFP.INSTANCE); if (termType.hasLastPositionBlockOffset()) { components.add(LastPositionBlockOffset.INSTANCE); @@ -145,7 +138,7 @@ public byte[] encodeBlockUpTo(IntBlockTermState[] inputs, int uptop, BitPacker b } private Metadata[] getMetadataPerComponent(IntBlockTermState[] inputs, int upTo) { - Metadata[] metadataPerComponent = new Metadata[upTo]; + Metadata[] metadataPerComponent = new Metadata[components.length]; for (int i = 0; i < components.length; i++) { var component = components[i]; byte bitWidth = TermStateCodecComponent.getBitWidth(inputs, upTo, component); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsDataStore.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsDataStore.java deleted file mode 100644 index 1717d26aa780..000000000000 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsDataStore.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; - -import java.util.Arrays; - -/** Holds all {@link TermData} for all {@link TermType} for a field. */ -class TermsDataStore { - private final TermData[] dataPerTermType; - - private TermsDataStore(TermData[] dataPerTermType) { - this.dataPerTermType = dataPerTermType; - } - - static class Builder { - private final TermData[] dataPerTermType; - - Builder() { - dataPerTermType = new TermData[TermType.NUM_TOTAL_TYPES]; - Arrays.fill(dataPerTermType, null); - } - - void add(TermData termData) { - assert dataPerTermType[termData.termType().getId()] == null; - - dataPerTermType[termData.termType().getId()] = termData; - } - - TermsDataStore build() { - return new TermsDataStore(dataPerTermType); - } - } -} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java index 29a9c4124e7b..edbf1141457f 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -25,7 +25,7 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.CompiledAutomaton; -class TermsImpl extends Terms { +final class TermsImpl extends Terms { private final FieldInfo fieldInfo; private final RandomAccessTermsDict termsDict; diff --git 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java index 917989b51b43..ce53493b8522 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java @@ -35,13 +35,13 @@ TypeAndOrd getTerm(BytesRef term) throws IOException { return new TypeAndOrd(termType, ord); } - public record TypeAndOrd(TermType termType, long ord) {} + record TypeAndOrd(TermType termType, long ord) {} - public void serialize(DataOutput metaOut, DataOutput dataOut) throws IOException { + void serialize(DataOutput metaOut, DataOutput dataOut) throws IOException { fst.save(metaOut, dataOut); } - public TermsIndex deserialize(DataInput metaIn, DataInput dataIn, boolean loadOffHeap) + static TermsIndex deserialize(DataInput metaIn, DataInput dataIn, boolean loadOffHeap) throws IOException { FST fst; if (loadOffHeap) { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java index f552adba433c..824803847b2d 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java @@ -31,11 +31,11 @@ * ordinals are scoped to type (not global). 
*/ final class TermsIndexBuilder { - private static long MAX_ORD = (1L << 60) - 1; + private static final long MAX_ORD = (1L << 60) - 1; private final long[] countPerType = new long[TermType.NUM_TOTAL_TYPES]; private final FSTCompiler fstCompiler = - new FSTCompiler(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()); + new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()); TermsIndexBuilder() { Arrays.fill(countPerType, -1); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java index af58932483c2..0c65f2e04d39 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java @@ -25,6 +25,7 @@ /** Data class that holds starts for term stats for a field */ record TermsStats( + int fieldNumber, long size, long sumTotalTermFreq, long sumDocFreq, @@ -33,6 +34,7 @@ record TermsStats( BytesRef maxTerm) { void serialize(DataOutput output) throws IOException { + output.writeVInt(fieldNumber); output.writeVLong(size); output.writeVLong(sumTotalTermFreq); output.writeVLong(sumDocFreq); @@ -43,6 +45,7 @@ void serialize(DataOutput output) throws IOException { static TermsStats deserialize(DataInput input) throws IOException { return new TermsStats( + input.readVInt(), input.readVLong(), input.readVLong(), input.readVLong(), diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java new file mode 100644 index 000000000000..179175f7398f --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java @@ -0,0 
+1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.RandomAccessTermsDict.TermDataInput; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.RandomAccessTermsDict.TermDataInputProvider; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.RandomAccessTermsDictWriter.TermDataOutput; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.RandomAccessTermsDictWriter.TermDataOutputProvider; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BitUtil; +import org.apache.lucene.util.BytesRef; + +public class TestRandomAccessTermsDictWriter extends LuceneTestCase { + + public void testBuildIndexAndRead() throws 
IOException { + try (Directory testDir = newDirectory()) { + IndexOutput metaOut = testDir.createOutput("segment_meta", IOContext.DEFAULT); + IndexOutput termIndexOut = testDir.createOutput("term_index", IOContext.DEFAULT); + HashMap termDataOutputsMap = new HashMap<>(); + TermDataOutputProvider outputProvider = + termType -> + termDataOutputsMap.computeIfAbsent( + termType, + t -> { + try { + return new TermDataOutput( + testDir.createOutput("term_meta_" + t.getId(), IOContext.DEFAULT), + testDir.createOutput("term_data_" + t.getId(), IOContext.DEFAULT)); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + + int fieldNumber = random().nextInt(0, 10); + IndexOptions indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; + RandomAccessTermsDictWriter randomAccessTermsDictWriter = + new RandomAccessTermsDictWriter( + fieldNumber, indexOptions, metaOut, termIndexOut, outputProvider); + + TermAndState[] expectedTermAndState = getRandoms(1000, 2000); + int expectedDocCount = random().nextInt(1, 2000); + + for (var x : expectedTermAndState) { + randomAccessTermsDictWriter.add(x.term, x.state); + } + randomAccessTermsDictWriter.finish(expectedDocCount); + + metaOut.close(); + termIndexOut.close(); + for (var e : termDataOutputsMap.values()) { + e.dataOutput().close(); + e.metadataOutput().close(); + } + + IndexInput metaInput = testDir.openInput("segment_meta", IOContext.READ); + IndexInput termIndexInput = testDir.openInput("term_index", IOContext.LOAD); + HashMap termDataInputsMap = new HashMap<>(); + TermDataInputProvider termDataInputProvider = + termType -> + termDataInputsMap.computeIfAbsent( + termType, + t -> { + try { + return new TermDataInput( + testDir.openInput("term_meta_" + t.getId(), IOContext.LOAD), + testDir.openInput("term_data_" + t.getId(), IOContext.LOAD)); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + RandomAccessTermsDict deserialized = + RandomAccessTermsDict.deserialize( + 
_fieldNumber -> indexOptions, metaInput, termIndexInput, termDataInputProvider); + + assertEquals(fieldNumber, deserialized.termsStats().fieldNumber()); + assertEquals(expectedDocCount, deserialized.termsStats().docCount()); + assertEquals(expectedTermAndState.length, deserialized.termsStats().size()); + assertEquals( + Arrays.stream(expectedTermAndState).mapToLong(x -> x.state.docFreq).sum(), + deserialized.termsStats().sumDocFreq()); + assertEquals( + Arrays.stream(expectedTermAndState).mapToLong(x -> x.state.totalTermFreq).sum(), + deserialized.termsStats().sumTotalTermFreq()); + assertEquals(expectedTermAndState.length, deserialized.termsStats().size()); + assertEquals(expectedTermAndState[0].term, deserialized.termsStats().minTerm()); + assertEquals(expectedTermAndState[expectedTermAndState.length - 1].term, deserialized.termsStats().maxTerm()); + + for (var x : expectedTermAndState) { + IntBlockTermState expectedState = x.state; + IntBlockTermState actualState = deserialized.getTermState(x.term); + if (expectedState.singletonDocID != -1) { + assertEquals(expectedState.singletonDocID, actualState.singletonDocID); + } else { + assertEquals(expectedState.docStartFP, actualState.docStartFP); + } + assertEquals(expectedState.docFreq, actualState.docFreq); + assertEquals(expectedState.totalTermFreq, actualState.totalTermFreq); + assertEquals(expectedState.skipOffset, actualState.skipOffset); + assertEquals(expectedState.posStartFP, actualState.posStartFP); + assertEquals(expectedState.payStartFP, actualState.payStartFP); + assertEquals(expectedState.lastPosBlockOffset, actualState.lastPosBlockOffset); + } + + metaInput.close(); + termIndexInput.close(); + for (var e : termDataInputsMap.values()) { + e.metadataInput().close(); + e.dataInput().close(); + } + } + } + + TermAndState[] getRandoms(int size, int maxDoc) { + IntBlockTermState lastTermState = null; + + ArrayList result = new ArrayList<>(size); + for (int i = 0; i < size; i++) { + byte[] termBytes = new 
byte[4]; + BitUtil.VH_BE_INT.set(termBytes, 0, i); + + IntBlockTermState termState = new IntBlockTermState(); + termState.docFreq = random().nextInt(1, 100); + if (termState.docFreq == 1) { + termState.singletonDocID = random().nextInt(0, maxDoc); + } else { + termState.singletonDocID = -1; + } + if (lastTermState == null) { + termState.docStartFP = 0; + termState.posStartFP = 0; + termState.payStartFP = 0; + } else { + termState.docStartFP = lastTermState.docStartFP; + termState.posStartFP = lastTermState.posStartFP; + termState.payStartFP = lastTermState.payStartFP; + termState.docStartFP += termState.docFreq == 1 ? 0 : random().nextLong(1, 256); + termState.posStartFP += random().nextLong(1, 256); + termState.payStartFP += random().nextLong(1, 256); + } + termState.totalTermFreq = random().nextLong(termState.docFreq, 1000); + if (termState.docFreq > 1 && random().nextBoolean()) { + termState.skipOffset = random().nextLong(1, 256); + } else { + termState.skipOffset = -1; + } + if (random().nextBoolean()) { + termState.lastPosBlockOffset = random().nextLong(1, 256); + } else { + termState.lastPosBlockOffset = -1; + } + lastTermState = termState; + result.add(new TermAndState(new BytesRef(termBytes), termState)); + } + + return result.toArray(TermAndState[]::new); + } + + record TermAndState(BytesRef term, IntBlockTermState state) {} +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java index aab496a41937..2ddc5b4ee67f 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java @@ -33,7 +33,6 @@ public class TestTermDataWriter extends LuceneTestCase { public void testWriterAndDeserialize() throws IOException { TermStateTestFixture 
testFixture = TestTermStateCodecImpl.getTermStateTestFixture(777); - TermType expectedTermType = TermType.fromId(7); try (Directory testDir = newDirectory()) { IndexOutput metaOut = testDir.createOutput("segment_meta", IOContext.DEFAULT); @@ -44,7 +43,6 @@ public void testWriterAndDeserialize() throws IOException { writer.addTermState(termState); } writer.finish(); - metaOut.writeByte((byte) expectedTermType.getId()); metaOut.writeVLong(writer.getTotalMetaDataBytesWritten()); metaOut.writeVLong(writer.getTotalDataBytesWritten()); metaOut.close(); @@ -75,7 +73,7 @@ public void testWriterAndDeserialize() throws IOException { } ByteSlice expectedDataSlice = new ByteArrayByteSlice(referenceBitPacker.getCompactBytes()); ByteSlice expectedMetadataSlice = new ByteArrayByteSlice(expectedMetadata); - TermData expected = new TermData(expectedTermType, expectedMetadataSlice, expectedDataSlice); + TermData expected = new TermData(expectedMetadataSlice, expectedDataSlice); IndexInput metaIn = testDir.openInput("segment_meta", IOContext.DEFAULT); IndexInput metadataIn = testDir.openInput("term_meta_1", IOContext.DEFAULT); @@ -83,13 +81,11 @@ public void testWriterAndDeserialize() throws IOException { TermData actual = TermData.deserializeOnHeap(metaIn.clone(), metadataIn.clone(), dataIn.clone()); - assertEquals(expected.termType().getId(), actual.termType().getId()); assertByteSlice(expected.metadata(), actual.metadata()); assertByteSlice(expected.data(), actual.data()); testDecodeTermState(testFixture, actual); actual = TermData.deserializeOffHeap(metaIn.clone(), metadataIn.clone(), dataIn.clone()); - assertEquals(expected.termType().getId(), actual.termType().getId()); assertByteSlice(expected.metadata(), actual.metadata()); assertByteSlice(expected.data(), actual.data()); testDecodeTermState(testFixture, actual); diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java 
b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java index a747b24a3144..db7630f1f35a 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java @@ -195,11 +195,11 @@ static TermStateCodecImpl getExpectedCodec(TermType termType, IndexOptions index && !termType.hasSkipData() && !termType.hasSingletonDoc(); components.add(TermStateCodecComponent.DocStartFP.INSTANCE); + components.add(TermStateCodecComponent.DocFreq.INSTANCE); if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS.ordinal()) { - components.add(TermStateCodecComponent.DocFreq.INSTANCE); + components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); } if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { - components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); } if (indexOptions.ordinal() @@ -233,12 +233,12 @@ static TermStateCodecImpl getExpectedCodec(TermType termType, IndexOptions index && termType.hasSkipData() && !termType.hasSingletonDoc(); components.add(TermStateCodecComponent.DocStartFP.INSTANCE); + components.add(TermStateCodecComponent.DocFreq.INSTANCE); components.add(TermStateCodecComponent.SkipOffset.INSTANCE); if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS.ordinal()) { - components.add(TermStateCodecComponent.DocFreq.INSTANCE); + components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); } if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { - components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); } if (indexOptions.ordinal() @@ -290,8 +290,8 @@ static TermStateCodecImpl getExpectedCodec(TermType termType, IndexOptions index && 
!termType.hasSingletonDoc(); assert indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal(); components.add(TermStateCodecComponent.DocStartFP.INSTANCE); - components.add(TermStateCodecComponent.SkipOffset.INSTANCE); components.add(TermStateCodecComponent.DocFreq.INSTANCE); + components.add(TermStateCodecComponent.SkipOffset.INSTANCE); components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); components.add(TermStateCodecComponent.LastPositionBlockOffset.INSTANCE); diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java index b7ca5f2efbe4..8937c5f9e319 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java @@ -17,6 +17,7 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +import java.io.IOException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; @@ -24,43 +25,40 @@ import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; -import java.io.IOException; - - public class TestTermsStats extends LuceneTestCase { - public void testRoundTrip() throws IOException { - TermsStats expected = makeRandom(); + public void testRoundTrip() throws IOException { + TermsStats expected = makeRandom(); - try (Directory dir = newDirectory()) { - IndexOutput output = dir.createOutput("terms_stats", IOContext.DEFAULT); - expected.serialize(output); - output.close(); + try (Directory dir = newDirectory()) { + IndexOutput output = dir.createOutput("terms_stats", IOContext.DEFAULT); + expected.serialize(output); + output.close(); - IndexInput input = 
dir.openInput("terms_stats", IOContext.DEFAULT); - TermsStats actual = TermsStats.deserialize(input); + IndexInput input = dir.openInput("terms_stats", IOContext.DEFAULT); + TermsStats actual = TermsStats.deserialize(input); - assertEquals(expected, actual); - input.close(); - } + assertEquals(expected, actual); + input.close(); } + } - private TermsStats makeRandom() { - byte[] minBytes = getRandomBytes(); - byte[] maxBytes = getRandomBytes(); - return new TermsStats( - random().nextLong(1, Long.MAX_VALUE), - random().nextLong(1, Long.MAX_VALUE), - random().nextLong(1, Long.MAX_VALUE), - random().nextInt(1, Integer.MAX_VALUE), - new BytesRef(minBytes), - new BytesRef(maxBytes) - ); - } + private TermsStats makeRandom() { + byte[] minBytes = getRandomBytes(); + byte[] maxBytes = getRandomBytes(); + return new TermsStats( + random().nextInt(1, Integer.MAX_VALUE), + random().nextLong(1, Long.MAX_VALUE), + random().nextLong(1, Long.MAX_VALUE), + random().nextLong(1, Long.MAX_VALUE), + random().nextInt(1, Integer.MAX_VALUE), + new BytesRef(minBytes), + new BytesRef(maxBytes)); + } - private static byte[] getRandomBytes() { - byte[] minBytes = new byte[random().nextInt(100)]; - random().nextBytes(minBytes); - return minBytes; - } -} \ No newline at end of file + private static byte[] getRandomBytes() { + byte[] minBytes = new byte[random().nextInt(100)]; + random().nextBytes(minBytes); + return minBytes; + } +} From 83968309e767a039b33b0b6742bbaa8fda089cd0 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Fri, 17 Nov 2023 16:55:34 -0800 Subject: [PATCH 29/57] Fix build after merging from apache:main --- .../randomaccess/RandomAccessTermsDictWriter.java | 1 - .../lucene99/randomaccess/TermsIndexBuilder.java | 2 +- .../TestRandomAccessTermsDictWriter.java | 12 +++++++----- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java index a89f6e94c6de..fab30774c665 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java @@ -18,7 +18,6 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; -import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat; import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.store.DataOutput; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java index 824803847b2d..d142420d4470 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java @@ -35,7 +35,7 @@ final class TermsIndexBuilder { private final long[] countPerType = new long[TermType.NUM_TOTAL_TYPES]; private final FSTCompiler fstCompiler = - new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()); + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()).build(); TermsIndexBuilder() { Arrays.fill(countPerType, -1); diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java index 179175f7398f..b9d2a444894d 100644 --- 
a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java @@ -101,14 +101,16 @@ public void testBuildIndexAndRead() throws IOException { assertEquals(expectedDocCount, deserialized.termsStats().docCount()); assertEquals(expectedTermAndState.length, deserialized.termsStats().size()); assertEquals( - Arrays.stream(expectedTermAndState).mapToLong(x -> x.state.docFreq).sum(), - deserialized.termsStats().sumDocFreq()); + Arrays.stream(expectedTermAndState).mapToLong(x -> x.state.docFreq).sum(), + deserialized.termsStats().sumDocFreq()); assertEquals( - Arrays.stream(expectedTermAndState).mapToLong(x -> x.state.totalTermFreq).sum(), - deserialized.termsStats().sumTotalTermFreq()); + Arrays.stream(expectedTermAndState).mapToLong(x -> x.state.totalTermFreq).sum(), + deserialized.termsStats().sumTotalTermFreq()); assertEquals(expectedTermAndState.length, deserialized.termsStats().size()); assertEquals(expectedTermAndState[0].term, deserialized.termsStats().minTerm()); - assertEquals(expectedTermAndState[expectedTermAndState.length - 1].term, deserialized.termsStats().maxTerm()); + assertEquals( + expectedTermAndState[expectedTermAndState.length - 1].term, + deserialized.termsStats().maxTerm()); for (var x : expectedTermAndState) { IntBlockTermState expectedState = x.state; From 96d6e3320b290aa654ea5dec53c22c51c25ca894 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Sat, 18 Nov 2023 16:03:26 -0800 Subject: [PATCH 30/57] Test serialize/deserialize multiple fields' term dictionary Also fix a bug in loading term index FST offheap. 
--- .../lucene99/randomaccess/TermsIndex.java | 4 +- .../TestRandomAccessTermsDictWriter.java | 135 ++++++++++++------ 2 files changed, 91 insertions(+), 48 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java index ce53493b8522..d0a4c0c4c56b 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java @@ -45,7 +45,9 @@ static TermsIndex deserialize(DataInput metaIn, DataInput dataIn, boolean loadOf throws IOException { FST fst; if (loadOffHeap) { - fst = new FST<>(metaIn, dataIn, PositiveIntOutputs.getSingleton(), new OffHeapFSTStore()); + var fstStore = new OffHeapFSTStore(); + fst = new FST<>(metaIn, dataIn, PositiveIntOutputs.getSingleton(), fstStore); + dataIn.skipBytes(fstStore.size()); } else { fst = new FST<>(metaIn, dataIn, PositiveIntOutputs.getSingleton()); } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java index b9d2a444894d..d4b1f94aab04 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java @@ -36,8 +36,9 @@ import org.apache.lucene.util.BytesRef; public class TestRandomAccessTermsDictWriter extends LuceneTestCase { + int nextFieldNumber; - public void testBuildIndexAndRead() throws IOException { + public void testBuildIndexAndReadMultipleFields() throws IOException { try (Directory testDir = newDirectory()) { IndexOutput metaOut = 
testDir.createOutput("segment_meta", IOContext.DEFAULT); IndexOutput termIndexOut = testDir.createOutput("term_index", IOContext.DEFAULT); @@ -56,19 +57,10 @@ public void testBuildIndexAndRead() throws IOException { } }); - int fieldNumber = random().nextInt(0, 10); - IndexOptions indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; - RandomAccessTermsDictWriter randomAccessTermsDictWriter = - new RandomAccessTermsDictWriter( - fieldNumber, indexOptions, metaOut, termIndexOut, outputProvider); - - TermAndState[] expectedTermAndState = getRandoms(1000, 2000); - int expectedDocCount = random().nextInt(1, 2000); - - for (var x : expectedTermAndState) { - randomAccessTermsDictWriter.add(x.term, x.state); + ExpectedResults[] manyExpectedResults = new ExpectedResults[random().nextInt(1, 20)]; + for (int i = 0; i < manyExpectedResults.length; i++) { + manyExpectedResults[i] = indexOneField(metaOut, termIndexOut, outputProvider); } - randomAccessTermsDictWriter.finish(expectedDocCount); metaOut.close(); termIndexOut.close(); @@ -93,39 +85,10 @@ public void testBuildIndexAndRead() throws IOException { throw new RuntimeException(e); } }); - RandomAccessTermsDict deserialized = - RandomAccessTermsDict.deserialize( - _fieldNumber -> indexOptions, metaInput, termIndexInput, termDataInputProvider); - - assertEquals(fieldNumber, deserialized.termsStats().fieldNumber()); - assertEquals(expectedDocCount, deserialized.termsStats().docCount()); - assertEquals(expectedTermAndState.length, deserialized.termsStats().size()); - assertEquals( - Arrays.stream(expectedTermAndState).mapToLong(x -> x.state.docFreq).sum(), - deserialized.termsStats().sumDocFreq()); - assertEquals( - Arrays.stream(expectedTermAndState).mapToLong(x -> x.state.totalTermFreq).sum(), - deserialized.termsStats().sumTotalTermFreq()); - assertEquals(expectedTermAndState.length, deserialized.termsStats().size()); - assertEquals(expectedTermAndState[0].term, deserialized.termsStats().minTerm()); - 
assertEquals( - expectedTermAndState[expectedTermAndState.length - 1].term, - deserialized.termsStats().maxTerm()); - - for (var x : expectedTermAndState) { - IntBlockTermState expectedState = x.state; - IntBlockTermState actualState = deserialized.getTermState(x.term); - if (expectedState.singletonDocID != -1) { - assertEquals(expectedState.singletonDocID, actualState.singletonDocID); - } else { - assertEquals(expectedState.docStartFP, actualState.docStartFP); - } - assertEquals(expectedState.docFreq, actualState.docFreq); - assertEquals(expectedState.totalTermFreq, actualState.totalTermFreq); - assertEquals(expectedState.skipOffset, actualState.skipOffset); - assertEquals(expectedState.posStartFP, actualState.posStartFP); - assertEquals(expectedState.payStartFP, actualState.payStartFP); - assertEquals(expectedState.lastPosBlockOffset, actualState.lastPosBlockOffset); + + for (var expectedResult : manyExpectedResults) { + assertDeserializedMatchingExpected( + expectedResult, metaInput, termIndexInput, termDataInputProvider); } metaInput.close(); @@ -137,7 +100,85 @@ public void testBuildIndexAndRead() throws IOException { } } - TermAndState[] getRandoms(int size, int maxDoc) { + private static void assertDeserializedMatchingExpected( + ExpectedResults result, + IndexInput metaInput, + IndexInput termIndexInput, + TermDataInputProvider termDataInputProvider) + throws IOException { + RandomAccessTermsDict deserialized = + RandomAccessTermsDict.deserialize( + _fieldNumber -> result.indexOptions(), + metaInput, + termIndexInput, + termDataInputProvider); + + assertEquals(result.fieldNumber(), deserialized.termsStats().fieldNumber()); + assertEquals(result.expectedDocCount(), deserialized.termsStats().docCount()); + assertEquals(result.expectedTermAndState().length, deserialized.termsStats().size()); + assertEquals( + Arrays.stream(result.expectedTermAndState()).mapToLong(x -> x.state.docFreq).sum(), + deserialized.termsStats().sumDocFreq()); + assertEquals( + 
Arrays.stream(result.expectedTermAndState()).mapToLong(x -> x.state.totalTermFreq).sum(), + deserialized.termsStats().sumTotalTermFreq()); + assertEquals(result.expectedTermAndState().length, deserialized.termsStats().size()); + assertEquals(result.expectedTermAndState()[0].term, deserialized.termsStats().minTerm()); + assertEquals( + result.expectedTermAndState()[result.expectedTermAndState().length - 1].term, + deserialized.termsStats().maxTerm()); + + for (var x : result.expectedTermAndState()) { + IntBlockTermState expectedState = x.state; + IntBlockTermState actualState = deserialized.getTermState(x.term); + if (expectedState.singletonDocID != -1) { + assertEquals(expectedState.singletonDocID, actualState.singletonDocID); + } else { + assertEquals(expectedState.docStartFP, actualState.docStartFP); + } + assertEquals(expectedState.docFreq, actualState.docFreq); + if (result.indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS.ordinal()) { + assertEquals(expectedState.totalTermFreq, actualState.totalTermFreq); + } + assertEquals(expectedState.skipOffset, actualState.skipOffset); + if (result.indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + assertEquals(expectedState.posStartFP, actualState.posStartFP); + assertEquals(expectedState.lastPosBlockOffset, actualState.lastPosBlockOffset); + } + if (result.indexOptions.ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + assertEquals(expectedState.payStartFP, actualState.payStartFP); + } + } + } + + private ExpectedResults indexOneField( + IndexOutput metaOut, IndexOutput termIndexOut, TermDataOutputProvider outputProvider) + throws IOException { + int fieldNumber = nextFieldNumber++; + IndexOptions indexOptions = + IndexOptions.values()[random().nextInt(1, IndexOptions.values().length)]; + RandomAccessTermsDictWriter randomAccessTermsDictWriter = + new RandomAccessTermsDictWriter( + fieldNumber, indexOptions, metaOut, termIndexOut, outputProvider); + + 
TermAndState[] expectedTermAndState = getRandoms(1000, 2000); + int expectedDocCount = random().nextInt(1, 2000); + + for (var x : expectedTermAndState) { + randomAccessTermsDictWriter.add(x.term, x.state); + } + randomAccessTermsDictWriter.finish(expectedDocCount); + return new ExpectedResults(fieldNumber, indexOptions, expectedTermAndState, expectedDocCount); + } + + private record ExpectedResults( + int fieldNumber, + IndexOptions indexOptions, + TermAndState[] expectedTermAndState, + int expectedDocCount) {} + + static TermAndState[] getRandoms(int size, int maxDoc) { IntBlockTermState lastTermState = null; ArrayList result = new ArrayList<>(size); From 622e56fe72c6a1eef6b11d6fb89ee62c33fbe98d Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 20 Nov 2023 10:37:25 -0800 Subject: [PATCH 31/57] Remove unused member in RandomAccessTermsDictWriter --- .../lucene99/randomaccess/RandomAccessTermsDictWriter.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java index fab30774c665..030d144e60d2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java @@ -27,8 +27,6 @@ /** Class to write the index files for one field. 
*/ final class RandomAccessTermsDictWriter { /** externally provided * */ - private final int filedNumber; - private final IndexOptions indexOptions; private final DataOutput metaOutput; @@ -55,7 +53,6 @@ final class RandomAccessTermsDictWriter { DataOutput metaOutput, DataOutput indexOutput, TermDataOutputProvider termDataOutputProvider) { - this.filedNumber = filedNumber; this.indexOptions = indexOptions; this.metaOutput = metaOutput; this.indexOutput = indexOutput; From cf1104d675e3749614d0396b8b165c32ba9a064b Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 20 Nov 2023 14:38:25 -0800 Subject: [PATCH 32/57] Implement Lucene99RandomAccessTermsReader/Writer --- ...9RandomAccessDictionaryPostingsFormat.java | 21 +- .../Lucene99RandomAccessTermsReader.java | 170 ++++++++++++++- .../Lucene99RandomAccessTermsWriter.java | 202 +++++++++++++++++- .../RandomAccessTermsDictWriter.java | 1 + .../lucene99/randomaccess/TermsImpl.java | 2 +- .../randomaccess/TestTermDataWriter.java | 4 +- 6 files changed, 381 insertions(+), 19 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java index 59de10be73da..4b616486cad0 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java @@ -20,8 +20,6 @@ import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.PostingsReaderBase; -import org.apache.lucene.codecs.PostingsWriterBase; import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat; import 
org.apache.lucene.codecs.lucene99.Lucene99PostingsReader; import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter; @@ -35,6 +33,15 @@ * @lucene.experimental */ public final class Lucene99RandomAccessDictionaryPostingsFormat extends PostingsFormat { + static String TERM_DICT_META_HEADER_CODEC_NAME = "RandomAccessTermsDict"; + static String TERM_INDEX_HEADER_CODEC_NAME = "RandomAccessTermsDictIndex"; + static String TERM_DATA_META_HEADER_CODEC_NAME_PREFIX = "RandomAccessTermsDictTermDataMeta"; + static String TERM_DATA_HEADER_CODEC_NAME_PREFIX = "RandomAccessTermsDictTermData"; + + static String TERM_DICT_META_INFO_EXTENSION = "tmeta"; + static String TERM_INDEX_EXTENSION = "tidx"; + static String TERM_DATA_META_EXTENSION_PREFIX = "tdm"; + static String TERM_DATA_EXTENSION_PREFIX = "tdd"; // Increment version to change it static final int VERSION_START = 0; @@ -42,7 +49,7 @@ public final class Lucene99RandomAccessDictionaryPostingsFormat extends Postings /** Creates {@code Lucene90RandomAccessDictionaryPostingsFormat} */ public Lucene99RandomAccessDictionaryPostingsFormat() { - super("Lucene90RandomAccess"); + super("Lucene99RandomAccess"); } @Override @@ -52,10 +59,10 @@ public String toString() { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state); + Lucene99PostingsWriter postingsWriter = new Lucene99PostingsWriter(state); boolean success = false; try { - FieldsConsumer ret = new Lucene99RandomAccessTermsWriter(); + FieldsConsumer ret = new Lucene99RandomAccessTermsWriter(state, postingsWriter); success = true; return ret; } finally { @@ -67,10 +74,10 @@ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new Lucene99PostingsReader(state); + Lucene99PostingsReader postingsReader = new 
Lucene99PostingsReader(state); boolean success = false; try { - FieldsProducer ret = new Lucene99RandomAccessTermsReader(); + FieldsProducer ret = new Lucene99RandomAccessTermsReader(postingsReader, state); success = true; return ret; } finally { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java index 79a63dccf265..fac2f6e7675e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java @@ -17,30 +17,188 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +import static org.apache.lucene.sandbox.codecs.lucene99.randomaccess.Lucene99RandomAccessDictionaryPostingsFormat.*; +import static org.apache.lucene.sandbox.codecs.lucene99.randomaccess.Lucene99RandomAccessDictionaryPostingsFormat.TERM_DATA_HEADER_CODEC_NAME_PREFIX; + +import java.io.Closeable; import java.io.IOException; +import java.util.HashMap; import java.util.Iterator; +import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.Terms; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.IOUtils; + +final class Lucene99RandomAccessTermsReader extends FieldsProducer { + private final Lucene99PostingsReader postingsReader; + private final SegmentReadState segmentReadState; + + private final IndexFilesManager indexFilesManager; + + private final HashMap perFieldTermDict; + + 
Lucene99RandomAccessTermsReader( + Lucene99PostingsReader postingsReader, SegmentReadState segmentReadState) throws IOException { + this.postingsReader = postingsReader; + this.segmentReadState = segmentReadState; + this.indexFilesManager = new IndexFilesManager(); + this.perFieldTermDict = new HashMap<>(); + boolean success = false; + try { + int numFields = indexFilesManager.metaInfoIn.readVInt(); + assert numFields > 0; + for (int i = 0; i < numFields; i++) { + RandomAccessTermsDict termsDict = + RandomAccessTermsDict.deserialize( + fieldNumber -> segmentReadState.fieldInfos.fieldInfo(fieldNumber).getIndexOptions(), + indexFilesManager.metaInfoIn, + indexFilesManager.termIndexIn, + indexFilesManager); + FieldInfo fieldInfo = + segmentReadState.fieldInfos.fieldInfo(termsDict.termsStats().fieldNumber()); + String fieldName = fieldInfo.name; + perFieldTermDict.put(fieldName, new TermsImpl(fieldInfo, termsDict)); + } + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this); + } + } + } -class Lucene99RandomAccessTermsReader extends FieldsProducer { @Override - public void close() throws IOException {} + public void close() throws IOException { + try { + IOUtils.close(indexFilesManager); + } finally { + // The per-field term dictionary would be invalid once the underlying index files have been + // closed. 
+ perFieldTermDict.clear(); + } + } @Override - public void checkIntegrity() throws IOException {} + public void checkIntegrity() throws IOException { + // Integrity is already checked in indexFilesManager + } @Override public Iterator iterator() { - return null; + return perFieldTermDict.keySet().iterator(); } @Override public Terms terms(String field) throws IOException { - return null; + return perFieldTermDict.get(field); } @Override public int size() { - return 0; + return perFieldTermDict.size(); + } + + class IndexFilesManager implements RandomAccessTermsDict.TermDataInputProvider, Closeable { + private final IndexInput metaInfoIn; + + private final IndexInput termIndexIn; + + private final HashMap termDataInputPerType; + + public IndexFilesManager() throws IOException { + metaInfoIn = initMetaInfoInput(); + termIndexIn = initTermIndexInput(); + termDataInputPerType = new HashMap<>(); + } + + private IndexInput initMetaInfoInput() throws IOException { + final IndexInput tmp; + tmp = openAndChecksumIndexInputSafe(TERM_DICT_META_INFO_EXTENSION, false); + + checkHeader(tmp, TERM_DICT_META_HEADER_CODEC_NAME); + postingsReader.init(tmp, segmentReadState); + postingsReader.checkIntegrity(); + return tmp; + } + + private IndexInput initTermIndexInput() throws IOException { + final IndexInput tmp = openAndChecksumIndexInputSafe(TERM_INDEX_EXTENSION, true); + checkHeader(tmp, TERM_INDEX_HEADER_CODEC_NAME); + return tmp; + } + + private RandomAccessTermsDict.TermDataInput openTermDataInput(TermType termType) + throws IOException { + final IndexInput metaTmp; + final IndexInput dataTmp; + metaTmp = + openAndChecksumIndexInputSafe(TERM_DATA_META_EXTENSION_PREFIX + termType.getId(), true); + checkHeader(metaTmp, TERM_DATA_META_HEADER_CODEC_NAME_PREFIX + termType.getId()); + + dataTmp = openAndChecksumIndexInputSafe(TERM_DATA_EXTENSION_PREFIX + termType.getId(), true); + checkHeader(dataTmp, TERM_DATA_HEADER_CODEC_NAME_PREFIX + termType.getId()); + + return new 
RandomAccessTermsDict.TermDataInput(metaTmp, dataTmp); + } + + /** + * Open an IndexInput for a segment local name. The IndexInput will be closed if there was any + * error happened during open and verification. + */ + private IndexInput openAndChecksumIndexInputSafe( + String segmentLocalName, boolean needRandomAcees) throws IOException { + String name = + IndexFileNames.segmentFileName( + segmentReadState.segmentInfo.name, segmentReadState.segmentSuffix, segmentLocalName); + + boolean success = false; + IndexInput input = null; + try { + input = + segmentReadState.directory.openInput( + name, needRandomAcees ? IOContext.LOAD : IOContext.READ); + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(input); + } + } + CodecUtil.checksumEntireFile(input); + return input; + } + + private void checkHeader(IndexInput input, String headerName) throws IOException { + CodecUtil.checkIndexHeader( + input, + headerName, + Lucene99RandomAccessDictionaryPostingsFormat.VERSION_START, + Lucene99RandomAccessDictionaryPostingsFormat.VERSION_CURRENT, + segmentReadState.segmentInfo.getId(), + segmentReadState.segmentSuffix); + } + + @Override + public RandomAccessTermsDict.TermDataInput getTermDataInputForType(TermType termType) + throws IOException { + RandomAccessTermsDict.TermDataInput current = termDataInputPerType.get(termType); + if (current == null) { + current = openTermDataInput(termType); + termDataInputPerType.put(termType, current); + } + return current; + } + + @Override + public void close() throws IOException { + IOUtils.close(metaInfoIn, termIndexIn); + for (var x : termDataInputPerType.values()) { + IOUtils.close(x.metadataInput(), x.dataInput()); + } + } } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java index 87b68d2b9c63..b38c724839fa 
100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java @@ -16,15 +16,211 @@ */ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +import static org.apache.lucene.sandbox.codecs.lucene99.randomaccess.Lucene99RandomAccessDictionaryPostingsFormat.*; + import java.io.IOException; +import java.util.HashMap; +import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.NormsProducer; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter; +import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.Fields; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.RandomAccessTermsDictWriter.TermDataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IOUtils; + +final class Lucene99RandomAccessTermsWriter extends FieldsConsumer { + + private final SegmentWriteState segmentWriteState; + + private final Lucene99PostingsWriter postingsWriter; + + private final IndexFilesManager indexFilesManager; + + private boolean closed; + + public Lucene99RandomAccessTermsWriter( + SegmentWriteState segmentWriteState, Lucene99PostingsWriter postingsWriter) + throws IOException { + this.segmentWriteState = segmentWriteState; + this.postingsWriter = postingsWriter; + this.indexFilesManager = new IndexFilesManager(); + } -class Lucene99RandomAccessTermsWriter extends FieldsConsumer { @Override - public void write(Fields 
fields, NormsProducer norms) throws IOException {} + public void write(Fields fields, NormsProducer norms) throws IOException { + HashMap nonEmptyFields = new HashMap<>(); + for (String field : fields) { + Terms terms = fields.terms(field); + if (terms != null) { + nonEmptyFields.put(field, terms); + } + } + indexFilesManager.metaInfoOut.writeVInt(nonEmptyFields.size()); + + FixedBitSet docSeen = new FixedBitSet(segmentWriteState.segmentInfo.maxDoc()); + for (var entry : nonEmptyFields.entrySet()) { + TermsEnum termsEnum = entry.getValue().iterator(); + FieldInfo fieldInfo = segmentWriteState.fieldInfos.fieldInfo(entry.getKey()); + RandomAccessTermsDictWriter termsDictWriter = + new RandomAccessTermsDictWriter( + fieldInfo.number, + fieldInfo.getIndexOptions(), + indexFilesManager.metaInfoOut, + indexFilesManager.termIndexOut, + indexFilesManager); + postingsWriter.setField(fieldInfo); + + docSeen.clear(); + while (true) { + BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + + IntBlockTermState termState = + (IntBlockTermState) postingsWriter.writeTerm(term, termsEnum, docSeen, norms); + termsDictWriter.add(term, termState); + } + termsDictWriter.finish(docSeen.cardinality()); + } + } @Override - public void close() throws IOException {} + public void close() throws IOException { + if (closed) { + return; + } + indexFilesManager.close(); + closed = true; + } + + /** + * Manages the output index files needed. It handles adding indexing header on creation and footer + * upon closing. + */ + class IndexFilesManager implements RandomAccessTermsDictWriter.TermDataOutputProvider { + + private final IndexOutput metaInfoOut; + + private final IndexOutput termIndexOut; + + private final HashMap termDataOutputPerType; + + public IndexFilesManager() throws IOException { + metaInfoOut = initMetaInfoOutput(); + termIndexOut = initTermIndexOutput(); + // populate the per-TermType term data outputs on-demand. 
+ termDataOutputPerType = new HashMap<>(); + } + + private IndexOutput initMetaInfoOutput() throws IOException { + final IndexOutput tmp; + tmp = getIndexOutputSafe(TERM_DICT_META_INFO_EXTENSION); + writeHeader(tmp, TERM_DICT_META_HEADER_CODEC_NAME); + postingsWriter.init(tmp, segmentWriteState); + return tmp; + } + + private IndexOutput initTermIndexOutput() throws IOException { + final IndexOutput tmp = getIndexOutputSafe(TERM_INDEX_EXTENSION); + writeHeader(tmp, TERM_INDEX_HEADER_CODEC_NAME); + return tmp; + } + + private TermDataOutput initTermDataOutput(TermType termType) throws IOException { + final IndexOutput metaTmp; + final IndexOutput dataTmp; + metaTmp = getIndexOutputSafe(TERM_DATA_META_EXTENSION_PREFIX + termType.getId()); + writeHeader(metaTmp, TERM_DATA_META_HEADER_CODEC_NAME_PREFIX + termType.getId()); + + dataTmp = getIndexOutputSafe(TERM_DATA_EXTENSION_PREFIX + termType.getId()); + writeHeader(dataTmp, TERM_DATA_HEADER_CODEC_NAME_PREFIX + termType.getId()); + + return new TermDataOutput(metaTmp, dataTmp); + } + + /** + * Get an IndexOutput for a segment local name. The output will be closed if there was any error + * happened during creation. 
+ */ + private IndexOutput getIndexOutputSafe(String segmentLocalName) throws IOException { + String name = + IndexFileNames.segmentFileName( + segmentWriteState.segmentInfo.name, + segmentWriteState.segmentSuffix, + segmentLocalName); + + boolean success = false; + IndexOutput output = null; + try { + output = segmentWriteState.directory.createOutput(name, segmentWriteState.context); + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(output); + } + } + return output; + } + + private void writeHeader(IndexOutput output, String headerName) throws IOException { + CodecUtil.writeIndexHeader( + output, + headerName, + Lucene99RandomAccessDictionaryPostingsFormat.VERSION_CURRENT, + segmentWriteState.segmentInfo.getId(), + segmentWriteState.segmentSuffix); + } + + @Override + public TermDataOutput getTermDataOutputForType(TermType termType) throws IOException { + TermDataOutput current = termDataOutputPerType.get(termType); + if (current == null) { + current = initTermDataOutput(termType); + termDataOutputPerType.put(termType, current); + } + return current; + } + + /** + * Write footers for all created index files and close them. + * + *

Assume all index files are valid upto time of calling. + */ + void close() throws IOException { + boolean success = false; + try { + CodecUtil.writeFooter(metaInfoOut); + CodecUtil.writeFooter(termIndexOut); + for (var termDataOutput : termDataOutputPerType.values()) { + CodecUtil.writeFooter(termDataOutput.metadataOutput()); + CodecUtil.writeFooter(termDataOutput.dataOutput()); + } + success = true; + } finally { + if (success) { + IOUtils.close(metaInfoOut, termIndexOut); + for (var termDataOutput : termDataOutputPerType.values()) { + IOUtils.close(termDataOutput.metadataOutput()); + IOUtils.close(termDataOutput.dataOutput()); + } + } else { + IOUtils.closeWhileHandlingException(metaInfoOut, termIndexOut); + for (var termDataOutput : termDataOutputPerType.values()) { + IOUtils.closeWhileHandlingException( + termDataOutput.metadataOutput(), termDataOutput.dataOutput()); + } + } + } + } + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java index 030d144e60d2..e039eb8b7a49 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java @@ -28,6 +28,7 @@ final class RandomAccessTermsDictWriter { /** externally provided * */ private final IndexOptions indexOptions; + private final DataOutput metaOutput; private final DataOutput indexOutput; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java index edbf1141457f..8a91ce7fd2c6 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java +++ 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -30,7 +30,7 @@ final class TermsImpl extends Terms { private final RandomAccessTermsDict termsDict; - public TermsImpl(TermsStats stats, FieldInfo fieldInfo, RandomAccessTermsDict termsDict) { + public TermsImpl(FieldInfo fieldInfo, RandomAccessTermsDict termsDict) { this.fieldInfo = fieldInfo; this.termsDict = termsDict; } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java index 2ddc5b4ee67f..6b316330ecad 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java @@ -18,7 +18,6 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; -import java.util.Arrays; import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TestTermStateCodecImpl.TermStateTestFixture; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPerBytePacker; @@ -28,6 +27,7 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.ArrayUtil; public class TestTermDataWriter extends LuceneTestCase { @@ -62,7 +62,7 @@ public void testWriterAndDeserialize() throws IOException { testFixture .codec() .encodeBlock( - Arrays.copyOfRange( + ArrayUtil.copyOfSubArray( testFixture.termStatesArray(), start, Math.min( From 4df3ad1fca0a5c1269ad126a1c22c742add05d66 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 20 Nov 2023 15:04:09 -0800 Subject: [PATCH 33/57] Move the package from sandbox to codecs Currently the tests can 
exercise PostingsFormats or other SPI provided implementation from sandbox. --- lucene/codecs/src/java/module-info.java | 4 +++- .../codecs/lucene99/randomaccess/ByteArrayByteSlice.java | 0 .../sandbox/codecs/lucene99/randomaccess/ByteSlice.java | 0 .../Lucene99RandomAccessDictionaryPostingsFormat.java | 0 .../randomaccess/Lucene99RandomAccessTermsReader.java | 0 .../randomaccess/Lucene99RandomAccessTermsWriter.java | 0 .../lucene99/randomaccess/RandomAccessInputByteSlice.java | 0 .../codecs/lucene99/randomaccess/RandomAccessTermsDict.java | 0 .../lucene99/randomaccess/RandomAccessTermsDictWriter.java | 0 .../lucene/sandbox/codecs/lucene99/randomaccess/TermData.java | 0 .../sandbox/codecs/lucene99/randomaccess/TermDataReader.java | 0 .../sandbox/codecs/lucene99/randomaccess/TermDataWriter.java | 0 .../sandbox/codecs/lucene99/randomaccess/TermStateCodec.java | 0 .../codecs/lucene99/randomaccess/TermStateCodecComponent.java | 0 .../codecs/lucene99/randomaccess/TermStateCodecImpl.java | 0 .../lucene/sandbox/codecs/lucene99/randomaccess/TermType.java | 0 .../sandbox/codecs/lucene99/randomaccess/TermsImpl.java | 0 .../sandbox/codecs/lucene99/randomaccess/TermsIndex.java | 0 .../codecs/lucene99/randomaccess/TermsIndexBuilder.java | 0 .../sandbox/codecs/lucene99/randomaccess/TermsStats.java | 0 .../codecs/lucene99/randomaccess/bitpacking/BitPacker.java | 0 .../lucene99/randomaccess/bitpacking/BitPackerImplBase.java | 0 .../codecs/lucene99/randomaccess/bitpacking/BitUnpacker.java | 0 .../lucene99/randomaccess/bitpacking/BitUnpackerImpl.java | 0 .../lucene99/randomaccess/bitpacking/DataOutputBitPacker.java | 0 .../randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java | 0 .../codecs/lucene99/randomaccess/bitpacking/package-info.java | 0 .../sandbox/codecs/lucene99/randomaccess/package-info.java | 0 .../META-INF/services/org.apache.lucene.codecs.PostingsFormat | 1 + .../randomaccess/TestRandomAccessTermsDictWriter.java | 0 
.../codecs/lucene99/randomaccess/TestTermDataWriter.java | 0 .../lucene99/randomaccess/TestTermStateCodecComponent.java | 0 .../codecs/lucene99/randomaccess/TestTermStateCodecImpl.java | 0 .../codecs/lucene99/randomaccess/TestTermsIndexBuilder.java | 0 .../sandbox/codecs/lucene99/randomaccess/TestTermsStats.java | 0 .../lucene99/randomaccess/bitpacking/BitPerBytePacker.java | 0 .../lucene99/randomaccess/bitpacking/TestBitPackerImpl.java | 0 .../lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java | 0 .../lucene99/randomaccess/bitpacking/ValueAndBitWidth.java | 0 lucene/sandbox/src/java/module-info.java | 2 -- 40 files changed, 4 insertions(+), 3 deletions(-) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java (100%) rename lucene/{sandbox => 
codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpacker.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/DataOutputBitPacker.java (100%) rename lucene/{sandbox => 
codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/package-info.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/package-info.java (100%) rename lucene/{sandbox => codecs}/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java (100%) rename lucene/{sandbox => codecs}/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java (100%) rename lucene/{sandbox => codecs}/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java (100%) rename lucene/{sandbox => codecs}/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java (100%) rename lucene/{sandbox => codecs}/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java (100%) rename lucene/{sandbox => codecs}/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java (100%) rename lucene/{sandbox => codecs}/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java (100%) rename lucene/{sandbox => codecs}/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java (100%) rename lucene/{sandbox => codecs}/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java (100%) rename lucene/{sandbox => codecs}/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/ValueAndBitWidth.java (100%) diff --git a/lucene/codecs/src/java/module-info.java b/lucene/codecs/src/java/module-info.java index 73f53fbf96b9..bc327c8debbd 100644 --- a/lucene/codecs/src/java/module-info.java +++ b/lucene/codecs/src/java/module-info.java @@ -33,7 +33,9 @@ 
org.apache.lucene.codecs.memory.DirectPostingsFormat, org.apache.lucene.codecs.memory.FSTPostingsFormat, org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat, - org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat; + org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat, + org.apache.lucene.sandbox.codecs.lucene99.randomaccess + .Lucene99RandomAccessDictionaryPostingsFormat; provides org.apache.lucene.codecs.Codec with org.apache.lucene.codecs.simpletext.SimpleTextCodec; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java rename to 
lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java similarity index 100% rename from 
lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java rename to 
lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java diff --git 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java similarity 
index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpacker.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpacker.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpacker.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpacker.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/DataOutputBitPacker.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/DataOutputBitPacker.java similarity index 
100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/DataOutputBitPacker.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/DataOutputBitPacker.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/package-info.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/package-info.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/package-info.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/package-info.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/package-info.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/package-info.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/package-info.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/package-info.java diff --git a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat index 09f2491c8012..e060907b8032 100644 --- 
a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat +++ b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat @@ -19,3 +19,4 @@ org.apache.lucene.codecs.memory.DirectPostingsFormat org.apache.lucene.codecs.memory.FSTPostingsFormat org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat +org.apache.lucene.sandbox.codecs.lucene99.randomaccess.Lucene99RandomAccessDictionaryPostingsFormat diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java similarity index 100% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java rename to lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java similarity index 100% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java rename to lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java similarity index 100% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java rename to 
lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java similarity index 100% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java rename to lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java similarity index 100% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java rename to lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java similarity index 100% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java rename to lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java similarity index 100% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java rename to 
lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java similarity index 100% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java rename to lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java similarity index 100% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java rename to lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/ValueAndBitWidth.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/ValueAndBitWidth.java similarity index 100% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/ValueAndBitWidth.java rename to lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/ValueAndBitWidth.java diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index 45b66e7c353e..c51a25691ef2 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -22,8 +22,6 @@ exports org.apache.lucene.payloads; exports org.apache.lucene.sandbox.codecs.idversion; - exports 
org.apache.lucene.sandbox.codecs.lucene99.randomaccess; - exports org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; exports org.apache.lucene.sandbox.document; exports org.apache.lucene.sandbox.queries; exports org.apache.lucene.sandbox.search; From f57ddbb7ad1336acef87dc95ffe5582ff1a95a07 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 20 Nov 2023 15:06:34 -0800 Subject: [PATCH 34/57] Fix bugs in writing Lucene99RandomAccessTermsW bug 1: negative sumTotalTermFeqs. bug 2: not closing the postings reader/writer. --- .../randomaccess/Lucene99RandomAccessTermsReader.java | 2 +- .../randomaccess/Lucene99RandomAccessTermsWriter.java | 8 +++++--- .../randomaccess/RandomAccessTermsDictWriter.java | 4 +++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java index fac2f6e7675e..ed185751475e 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java @@ -75,7 +75,7 @@ final class Lucene99RandomAccessTermsReader extends FieldsProducer { @Override public void close() throws IOException { try { - IOUtils.close(indexFilesManager); + IOUtils.close(indexFilesManager, postingsReader); } finally { // The per-field term dictionary would be invalid once the underlying index files have been // closed. 
diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java index b38c724839fa..192702b7580d 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java @@ -18,6 +18,7 @@ import static org.apache.lucene.sandbox.codecs.lucene99.randomaccess.Lucene99RandomAccessDictionaryPostingsFormat.*; +import java.io.Closeable; import java.io.IOException; import java.util.HashMap; import org.apache.lucene.codecs.CodecUtil; @@ -99,7 +100,8 @@ public void close() throws IOException { if (closed) { return; } - indexFilesManager.close(); + IOUtils.close(indexFilesManager, postingsWriter); + closed = true; } @@ -107,7 +109,7 @@ public void close() throws IOException { * Manages the output index files needed. It handles adding indexing header on creation and footer * upon closing. */ - class IndexFilesManager implements RandomAccessTermsDictWriter.TermDataOutputProvider { + class IndexFilesManager implements RandomAccessTermsDictWriter.TermDataOutputProvider, Closeable { private final IndexOutput metaInfoOut; @@ -196,7 +198,7 @@ public TermDataOutput getTermDataOutputForType(TermType termType) throws IOExcep * *

Assume all index files are valid upto time of calling. */ - void close() throws IOException { + public void close() throws IOException { boolean success = false; try { CodecUtil.writeFooter(metaInfoOut); diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java index e039eb8b7a49..6ea363bb2a0d 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java @@ -154,7 +154,9 @@ static final class TermStatsTracker { void recordTerm(IntBlockTermState termState) { size += 1; sumDocFreq += termState.docFreq; - sumTotalTermFreq += termState.totalTermFreq; + if (termState.totalTermFreq > 0) { + sumTotalTermFreq += termState.totalTermFreq; + } } void setDocCount(int docCount) { From c66808dcde15e2fa3e1447e313d9eb7396ce0ccf Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Tue, 21 Nov 2023 15:53:50 -0800 Subject: [PATCH 35/57] Implement TermsEnum for Lucene99RandomAccess --- lucene/codecs/src/java/module-info.java | 2 + .../Lucene99RandomAccessTermsReader.java | 20 ++- .../Lucene99RandomAccessTermsWriter.java | 6 +- .../randomaccess/RandomAccessTermsDict.java | 23 ++- .../RandomAccessTermsDictWriter.java | 34 +++- .../lucene99/randomaccess/TermData.java | 29 +++- .../lucene99/randomaccess/TermDataReader.java | 20 ++- .../randomaccess/TermStateCodecImpl.java | 10 +- .../lucene99/randomaccess/TermsImpl.java | 164 +++++++++++++++++- .../lucene99/randomaccess/TermsIndex.java | 16 +- .../lucene99/randomaccess/TermsStats.java | 26 ++- .../bitpacking/BitUnpackerImpl.java | 4 + .../TestRandomAccessTermsDictWriter.java | 22 ++- .../randomaccess/TestTermDataWriter.java | 12 +- .../randomaccess/TestTermStateCodecImpl.java | 36 +++- 15 files 
changed, 355 insertions(+), 69 deletions(-) diff --git a/lucene/codecs/src/java/module-info.java b/lucene/codecs/src/java/module-info.java index bc327c8debbd..a128950ddb56 100644 --- a/lucene/codecs/src/java/module-info.java +++ b/lucene/codecs/src/java/module-info.java @@ -26,6 +26,8 @@ exports org.apache.lucene.codecs.simpletext; exports org.apache.lucene.codecs.uniformsplit; exports org.apache.lucene.codecs.uniformsplit.sharedterms; + exports org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + exports org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat, diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java index ed185751475e..ab3285af297f 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java @@ -29,6 +29,7 @@ import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.Terms; import org.apache.lucene.store.IOContext; @@ -56,14 +57,25 @@ final class Lucene99RandomAccessTermsReader extends FieldsProducer { for (int i = 0; i < numFields; i++) { RandomAccessTermsDict termsDict = RandomAccessTermsDict.deserialize( - fieldNumber -> segmentReadState.fieldInfos.fieldInfo(fieldNumber).getIndexOptions(), + new RandomAccessTermsDict.IndexOptionsProvider() { + @Override + public IndexOptions getIndexOptions(int fieldNumber) { + return 
segmentReadState.fieldInfos.fieldInfo(fieldNumber).getIndexOptions(); + } + + @Override + public boolean hasPayloads(int fieldNumber) { + return segmentReadState.fieldInfos.fieldInfo(fieldNumber).hasPayloads(); + } + }, indexFilesManager.metaInfoIn, indexFilesManager.termIndexIn, indexFilesManager); FieldInfo fieldInfo = segmentReadState.fieldInfos.fieldInfo(termsDict.termsStats().fieldNumber()); String fieldName = fieldInfo.name; - perFieldTermDict.put(fieldName, new TermsImpl(fieldInfo, termsDict)); + perFieldTermDict.put(fieldName, new TermsImpl(fieldInfo, termsDict, postingsReader)); + success = true; } } finally { if (!success) { @@ -151,7 +163,7 @@ private RandomAccessTermsDict.TermDataInput openTermDataInput(TermType termType) * error happened during open and verification. */ private IndexInput openAndChecksumIndexInputSafe( - String segmentLocalName, boolean needRandomAcees) throws IOException { + String segmentLocalName, boolean needRandomAccess) throws IOException { String name = IndexFileNames.segmentFileName( segmentReadState.segmentInfo.name, segmentReadState.segmentSuffix, segmentLocalName); @@ -161,7 +173,7 @@ private IndexInput openAndChecksumIndexInputSafe( try { input = segmentReadState.directory.openInput( - name, needRandomAcees ? IOContext.LOAD : IOContext.READ); + name, needRandomAccess ? 
IOContext.LOAD : IOContext.READ); success = true; } finally { if (!success) { diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java index 192702b7580d..bc6aebf1a8de 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java @@ -75,6 +75,7 @@ public void write(Fields fields, NormsProducer norms) throws IOException { new RandomAccessTermsDictWriter( fieldInfo.number, fieldInfo.getIndexOptions(), + fieldInfo.hasPayloads(), indexFilesManager.metaInfoOut, indexFilesManager.termIndexOut, indexFilesManager); @@ -89,7 +90,10 @@ public void write(Fields fields, NormsProducer norms) throws IOException { IntBlockTermState termState = (IntBlockTermState) postingsWriter.writeTerm(term, termsEnum, docSeen, norms); - termsDictWriter.add(term, termState); + // TermState can be null + if (termState != null) { + termsDictWriter.add(term, termState); + } } termsDictWriter.finish(docSeen.cardinality()); } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java index 39947f9ff78c..1d1c3e194f40 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java @@ -26,11 +26,14 @@ /** A term dictionary that offer random-access to read a specific term */ record RandomAccessTermsDict( - TermsStats termsStats, TermsIndex termsIndex, TermDataReader termDataReader) { + TermsStats termsStats, + 
TermsIndex termsIndex, + TermDataReader termDataReader, + IndexOptions indexOptions) { IntBlockTermState getTermState(BytesRef term) throws IOException { TermsIndex.TypeAndOrd typeAndOrd = termsIndex.getTerm(term); - return termDataReader.getTermState(typeAndOrd.termType(), typeAndOrd.ord()); + return termDataReader.getTermState(typeAndOrd.termType(), typeAndOrd.ord(), indexOptions); } static RandomAccessTermsDict deserialize( @@ -43,17 +46,21 @@ static RandomAccessTermsDict deserialize( // (1) deserialize field stats TermsStats termsStats = TermsStats.deserialize(metaInput); IndexOptions indexOptions = indexOptionsProvider.getIndexOptions(termsStats.fieldNumber()); + boolean hasPayloads = indexOptionsProvider.hasPayloads(termsStats.fieldNumber()); // (2) deserialize terms index - TermsIndex termsIndex = - TermsIndex.deserialize(metaInput, termIndexInput, /* load off heap */ true); + TermsIndex termsIndex = null; + if (termsStats.size() > 0) { + termsIndex = TermsIndex.deserialize(metaInput, termIndexInput, /* load off heap */ true); + } // (3) deserialize all the term data by each TermType // (3.1) number of unique TermType this field has int numTermTypes = metaInput.readByte(); // (3.2) read per TermType - TermDataReader.Builder termDataReaderBuilder = new TermDataReader.Builder(indexOptions); + TermDataReader.Builder termDataReaderBuilder = + new TermDataReader.Builder(indexOptions, hasPayloads); for (int i = 0; i < numTermTypes; i++) { TermType termType = TermType.fromId(metaInput.readByte()); TermDataInput termDataInput = termDataInputProvider.getTermDataInputForType(termType); @@ -61,13 +68,15 @@ static RandomAccessTermsDict deserialize( termType, metaInput, termDataInput.metadataInput, termDataInput.dataInput); } - return new RandomAccessTermsDict(termsStats, termsIndex, termDataReaderBuilder.build()); + return new RandomAccessTermsDict( + termsStats, termsIndex, termDataReaderBuilder.build(), indexOptions); } - @FunctionalInterface interface 
IndexOptionsProvider { IndexOptions getIndexOptions(int fieldNumber); + + boolean hasPayloads(int fieldNumber); } record TermDataInput(IndexInput metadataInput, IndexInput dataInput) {} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java index 6ea363bb2a0d..6a8a4a6a5f74 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java @@ -23,14 +23,15 @@ import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; /** Class to write the index files for one field. */ final class RandomAccessTermsDictWriter { /** externally provided * */ private final IndexOptions indexOptions; + private final boolean hasPayloads; private final DataOutput metaOutput; - private final DataOutput indexOutput; private final TermDataOutputProvider termDataOutputProvider; @@ -46,15 +47,17 @@ final class RandomAccessTermsDictWriter { private final TermStatsTracker termStatsTracker; - private BytesRef previousTerm; + private BytesRefBuilder previousTerm; RandomAccessTermsDictWriter( int filedNumber, IndexOptions indexOptions, + boolean hasPayloads, DataOutput metaOutput, DataOutput indexOutput, TermDataOutputProvider termDataOutputProvider) { this.indexOptions = indexOptions; + this.hasPayloads = hasPayloads; this.metaOutput = metaOutput; this.indexOutput = indexOutput; this.termDataOutputProvider = termDataOutputProvider; @@ -66,9 +69,22 @@ void add(BytesRef term, IntBlockTermState termState) throws IOException { if (previousTerm == null) { // first term, which is also the minimum term termStatsTracker.setMinTerm(term); + 
previousTerm = new BytesRefBuilder(); + } + + /* There is interesting conventions to follow... + *

+     *     org.apache.lucene.index.CheckIndex$CheckIndexException:
+     *     field "id" hasFreqs is false, but TermsEnum.totalTermFreq()=0 (should be 1)
+     * 
+ */ + // for field that do not have freq enabled, as if each posting only has one occurrence. + if (indexOptions.ordinal() < IndexOptions.DOCS_AND_FREQS.ordinal()) { + termState.totalTermFreq = termState.docFreq; } + termStatsTracker.recordTerm(termState); - previousTerm = term; + previousTerm.copyBytes(term); termsIndexBuilder.addTerm(term, termType); TermDataWriter termDataWriter = getTermDataWriterForType(termType); termDataWriter.addTermState(termState); @@ -82,7 +98,7 @@ private TermDataWriter getTermDataWriterForType(TermType termType) throws IOExce TermDataOutput termDataOutput = getTermDataOutput(termType); TermDataWriter termDataWriter = new TermDataWriter( - TermStateCodecImpl.getCodec(termType, indexOptions), + TermStateCodecImpl.getCodec(termType, indexOptions, hasPayloads), termDataOutput.metadataOutput(), termDataOutput.dataOutput()); termDataWriterPerType[termType.getId()] = termDataWriter; @@ -99,7 +115,9 @@ private TermDataOutput getTermDataOutput(TermType termType) throws IOException { void finish(int docCount) throws IOException { // finish up TermsStats for this field - termStatsTracker.setMaxTerm(previousTerm); + if (previousTerm != null) { + termStatsTracker.setMaxTerm(previousTerm.toBytesRef()); + } termStatsTracker.setDocCount(docCount); TermsStats termsStats = termStatsTracker.finish(); // (1) Write field metadata @@ -164,16 +182,14 @@ void setDocCount(int docCount) { } void setMinTerm(BytesRef minTerm) { - this.minTerm = minTerm; + this.minTerm = BytesRef.deepCopyOf(minTerm); } void setMaxTerm(BytesRef maxTerm) { - this.maxTerm = maxTerm; + this.maxTerm = BytesRef.deepCopyOf(maxTerm); } TermsStats finish() { - assert docCount > 0 && minTerm != null && maxTerm != null; - return new TermsStats( fieldNumber, size, sumTotalTermFreq, sumDocFreq, docCount, minTerm, maxTerm); } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java 
b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java index 9c74ffc83835..3860ba1a3f4b 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java @@ -22,16 +22,18 @@ import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpackerImpl; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.RandomAccessInput; import org.apache.lucene.util.BytesRef; /** * Holds the bit-packed {@link IntBlockTermState} for a given {@link * org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermType} */ -record TermData(ByteSlice metadata, ByteSlice data) { +record TermData(ByteSliceProvider metadataProvider, ByteSliceProvider dataProvider) { IntBlockTermState getTermState(TermStateCodec codec, long ord) throws IOException { + var metadata = metadataProvider.newByteSlice(); + var data = dataProvider.newByteSlice(); + long blockId = ord / TermDataWriter.NUM_TERMS_PER_BLOCK; long metadataStartPos = blockId * (codec.getMetadataBytesLength() + 8); long dataStartPos = metadata.getLong(metadataStartPos); @@ -71,21 +73,30 @@ static TermData deserializeOnHeap( metadataInput.readBytes(metadataBytes, 0, metadataBytes.length); dataInput.readBytes(dataBytes, 0, dataBytes.length); - return new TermData(new ByteArrayByteSlice(metadataBytes), new ByteArrayByteSlice(dataBytes)); + return new TermData( + () -> new ByteArrayByteSlice(metadataBytes), () -> new ByteArrayByteSlice(dataBytes)); } static TermData deserializeOffHeap( DataInput metaInput, IndexInput metadataInput, IndexInput dataInput) throws IOException { - long metadataSize = metaInput.readVLong(); - long dataSize = metaInput.readVLong(); + final long metadataSize = metaInput.readVLong(); + final long dataSize = metaInput.readVLong(); + + final long metadataStart = 
metadataInput.getFilePointer(); + final long dataStart = dataInput.getFilePointer(); - RandomAccessInput metadata = - metadataInput.randomAccessSlice(metadataInput.getFilePointer(), metadataSize); metadataInput.skipBytes(metadataSize); - RandomAccessInput data = dataInput.randomAccessSlice(dataInput.getFilePointer(), dataSize); dataInput.skipBytes(dataSize); return new TermData( - new RandomAccessInputByteSlice(metadata), new RandomAccessInputByteSlice(data)); + () -> + new RandomAccessInputByteSlice( + metadataInput.randomAccessSlice(metadataStart, metadataSize)), + () -> new RandomAccessInputByteSlice(dataInput.randomAccessSlice(dataStart, dataSize))); + } + + @FunctionalInterface + interface ByteSliceProvider { + ByteSlice newByteSlice() throws IOException; } } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java index 3a7ebd1e8a7c..fd5a44fc76b1 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java @@ -29,7 +29,8 @@ */ record TermDataReader(TermDataAndCodec[] termDataAndCodecs) { - IntBlockTermState getTermState(TermType termType, long ord) throws IOException { + IntBlockTermState getTermState(TermType termType, long ord, IndexOptions indexOptions) + throws IOException { assert termDataAndCodecs[termType.getId()] != null; var dataAndCodec = termDataAndCodecs[termType.getId()]; IntBlockTermState termState = dataAndCodec.termData.getTermState(dataAndCodec.codec, ord); @@ -46,22 +47,35 @@ IntBlockTermState getTermState(TermType termType, long ord) throws IOException { termState.lastPosBlockOffset = -1; } + /* There is interesting conventions to follow... + *
+     *     org.apache.lucene.index.CheckIndex$CheckIndexException:
+     *     field "id" hasFreqs is false, but TermsEnum.totalTermFreq()=0 (should be 1)
+     * 
+ */ + // for field that do not have freq enabled, as if each posting only has one occurrence. + if (indexOptions.ordinal() < IndexOptions.DOCS_AND_FREQS.ordinal()) { + termState.totalTermFreq = termState.docFreq; + } + return termState; } static class Builder { final IndexOptions indexOptions; + final boolean hasPayloads; final TermDataAndCodec[] termDataAndCodecs = new TermDataAndCodec[TermType.NUM_TOTAL_TYPES]; - Builder(IndexOptions indexOptions) { + Builder(IndexOptions indexOptions, boolean hasPayloads) { this.indexOptions = indexOptions; + this.hasPayloads = hasPayloads; } void readOne( TermType termType, DataInput metaIn, IndexInput termMetadataIn, IndexInput termDataIn) throws IOException { TermData termData = TermData.deserializeOffHeap(metaIn, termMetadataIn, termDataIn); - TermStateCodec codec = TermStateCodecImpl.getCodec(termType, indexOptions); + TermStateCodec codec = TermStateCodecImpl.getCodec(termType, indexOptions, hasPayloads); termDataAndCodecs[termType.getId()] = new TermDataAndCodec(termData, codec); } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java index 734e24a7a057..d1a8392a37a9 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java @@ -66,12 +66,17 @@ private static int getMetadataLength(TermStateCodecComponent component) { return 1 + (component.isMonotonicallyIncreasing() ? 
8 : 0); } - public static TermStateCodecImpl getCodec(TermType termType, IndexOptions indexOptions) { + public static TermStateCodecImpl getCodec( + TermType termType, IndexOptions indexOptions, boolean hasPayloads) { assert indexOptions.ordinal() > IndexOptions.NONE.ordinal(); // A term can't have skip data (has more than one block's worth of doc), // while having a singleton doc at the same time! assert !(termType.hasSkipData() && termType.hasSingletonDoc()); + // Can't have payload for index options that is less than POSITIONS + assert indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal() + || !hasPayloads; + ArrayList components = new ArrayList<>(); // handle docs and docFreq if (termType.hasSingletonDoc()) { @@ -92,6 +97,9 @@ public static TermStateCodecImpl getCodec(TermType termType, IndexOptions indexO // handle positions if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { components.add(PositionStartFP.INSTANCE); + if (hasPayloads) { + components.add(PayloadStartFP.INSTANCE); + } if (termType.hasLastPositionBlockOffset()) { components.add(LastPositionBlockOffset.INSTANCE); } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java index 8a91ce7fd2c6..36e861aaf6f8 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -18,21 +18,33 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader; import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.IndexOptions; +import 
org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.TermState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.fst.BytesRefFSTEnum; final class TermsImpl extends Terms { private final FieldInfo fieldInfo; private final RandomAccessTermsDict termsDict; - public TermsImpl(FieldInfo fieldInfo, RandomAccessTermsDict termsDict) { + private final Lucene99PostingsReader lucene99PostingsReader; + + public TermsImpl( + FieldInfo fieldInfo, + RandomAccessTermsDict termsDict, + Lucene99PostingsReader lucene99PostingsReader) { this.fieldInfo = fieldInfo; this.termsDict = termsDict; + this.lucene99PostingsReader = lucene99PostingsReader; } @Override @@ -89,13 +101,149 @@ public BytesRef getMax() throws IOException { @Override public TermsEnum iterator() throws IOException { - // TODO: implement me - return null; + if (size() == 0) { + return TermsEnum.EMPTY; + } + return new RandomAccessTermsEnum(); } - @Override - public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { - // TODO: implement me - return null; + // TODO: implement a more efficient version via FST + // @Override + // public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException + // { + // return null; + // } + + final class RandomAccessTermsEnum extends TermsEnum { + private AttributeSource attrs; + + private BytesRef term; + + private boolean isTermStateCurrent; + + private IntBlockTermState termState; + + private final BytesRefFSTEnum fstEnum; + + private BytesRefFSTEnum.InputOutput fstSeekState; + + // Only set when seekExact(term, state) is called, because that will update + // the termState but leave the fstSeekState out of sync. + // We need to re-seek in next() calls to catch up to that term. 
+ private boolean needReSeekInNext; + + RandomAccessTermsEnum() { + termState = (IntBlockTermState) lucene99PostingsReader.newTermState(); + fstEnum = new BytesRefFSTEnum<>(termsDict.termsIndex().fst()); + } + + void updateTermStateIfNeeded() throws IOException { + if (!isTermStateCurrent && !needReSeekInNext) { + TermsIndex.TypeAndOrd typeAndOrd = TermsIndex.decodeLong(fstSeekState.output); + termState = + termsDict + .termDataReader() + .getTermState(typeAndOrd.termType(), typeAndOrd.ord(), fieldInfo.getIndexOptions()); + isTermStateCurrent = true; + } + } + + @Override + public AttributeSource attributes() { + if (attrs == null) { + attrs = new AttributeSource(); + } + return attrs; + } + + @Override + public boolean seekExact(BytesRef text) throws IOException { + fstSeekState = fstEnum.seekExact(text); + term = fstSeekState == null ? null : fstSeekState.input; + isTermStateCurrent = false; + needReSeekInNext = false; + return term != null; + } + + @Override + public SeekStatus seekCeil(BytesRef text) throws IOException { + fstSeekState = fstEnum.seekCeil(text); + term = fstSeekState == null ? null : fstSeekState.input; + isTermStateCurrent = false; + needReSeekInNext = false; + if (term == null) { + return SeekStatus.END; + } + return text.equals(term) ? 
SeekStatus.FOUND : SeekStatus.NOT_FOUND; + } + + @Override + public void seekExact(BytesRef target, TermState state) throws IOException { + if (!target.equals(term)) { + assert state instanceof IntBlockTermState; + termState.copyFrom(state); + term = BytesRef.deepCopyOf(target); + isTermStateCurrent = true; + needReSeekInNext = true; + } + } + + @Override + public BytesRef term() throws IOException { + return term; + } + + @Override + public int docFreq() throws IOException { + updateTermStateIfNeeded(); + return termState.docFreq; + } + + @Override + public long totalTermFreq() throws IOException { + updateTermStateIfNeeded(); + return termState.totalTermFreq; + } + + @Override + public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { + updateTermStateIfNeeded(); + return lucene99PostingsReader.postings(fieldInfo, termState, reuse, flags); + } + + @Override + public ImpactsEnum impacts(int flags) throws IOException { + updateTermStateIfNeeded(); + return lucene99PostingsReader.impacts(fieldInfo, termState, flags); + } + + @Override + public TermState termState() throws IOException { + updateTermStateIfNeeded(); + return termState.clone(); + } + + @Override + public BytesRef next() throws IOException { + if (needReSeekInNext) { + fstSeekState = fstEnum.seekExact(term); + assert fstSeekState != null; + } + fstSeekState = fstEnum.next(); + term = fstSeekState == null ? 
null : fstSeekState.input; + isTermStateCurrent = false; + needReSeekInNext = false; + return term; + } + + @Override + public long ord() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void seekExact(long ord) throws IOException { + throw new UnsupportedOperationException("By ord lookup not supported."); + } } } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java index d0a4c0c4c56b..9474a82bef78 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java @@ -30,15 +30,21 @@ record TermsIndex(FST fst) { TypeAndOrd getTerm(BytesRef term) throws IOException { long encoded = Util.get(fst, term); - TermType termType = TermType.fromId((int) ((encoded & 0b1110L) >>> 1)); - long ord = encoded >>> 4; - return new TypeAndOrd(termType, ord); + return decodeLong(encoded); } record TypeAndOrd(TermType termType, long ord) {} void serialize(DataOutput metaOut, DataOutput dataOut) throws IOException { - fst.save(metaOut, dataOut); + if (fst != null) { + fst.save(metaOut, dataOut); + } + } + + static TypeAndOrd decodeLong(long encoded) { + TermType termType = TermType.fromId((int) ((encoded & 0b1110L) >>> 1)); + long ord = encoded >>> 4; + return new TypeAndOrd(termType, ord); } static TermsIndex deserialize(DataInput metaIn, DataInput dataIn, boolean loadOffHeap) @@ -46,7 +52,7 @@ static TermsIndex deserialize(DataInput metaIn, DataInput dataIn, boolean loadOf FST fst; if (loadOffHeap) { var fstStore = new OffHeapFSTStore(); - fst = new FST<>(metaIn, dataIn, PositiveIntOutputs.getSingleton(), fstStore); + fst = new FST<>(metaIn, dataIn.clone(), PositiveIntOutputs.getSingleton(), fstStore); dataIn.skipBytes(fstStore.size()); } else { fst = new 
FST<>(metaIn, dataIn, PositiveIntOutputs.getSingleton()); diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java index 0c65f2e04d39..b1881475f74e 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java @@ -39,19 +39,27 @@ void serialize(DataOutput output) throws IOException { output.writeVLong(sumTotalTermFreq); output.writeVLong(sumDocFreq); output.writeVInt(docCount); - writeBytesRef(output, minTerm); - writeBytesRef(output, maxTerm); + if (minTerm != null) { + writeBytesRef(output, minTerm); + } + if (maxTerm != null) { + writeBytesRef(output, maxTerm); + } } static TermsStats deserialize(DataInput input) throws IOException { + int fieldNumber = input.readVInt(); + long size = input.readVLong(); + long sumTotalTermFreq = input.readVLong(); + long sumDocFreq = input.readVLong(); + int docCount = input.readVInt(); + BytesRef minTerm = null, maxTerm = null; + if (size > 0) { + minTerm = readBytesRef(input); + maxTerm = readBytesRef(input); + } return new TermsStats( - input.readVInt(), - input.readVLong(), - input.readVLong(), - input.readVLong(), - input.readVInt(), - readBytesRef(input), - readBytesRef(input)); + fieldNumber, size, sumTotalTermFreq, sumDocFreq, docCount, minTerm, maxTerm); } static void writeBytesRef(DataOutput output, BytesRef bytes) throws IOException { diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java index 84704c0b8787..d3a5ab210776 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java +++ 
b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java @@ -30,6 +30,10 @@ public long unpack(BytesRef bytesRef, int startBitIndex, int bitWidth) { assert (startBitIndex + bitWidth) <= bytesRef.length * 8; assert bitWidth < 64; + if (bitWidth == 0) { + return 0; + } + int firstByteIndex = startBitIndex / 8; int numBitsToExcludeInFirstByte = startBitIndex % 8; int lastByteIndex = (startBitIndex + bitWidth) / 8; diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java index d4b1f94aab04..226a4700813c 100644 --- a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java @@ -108,7 +108,17 @@ private static void assertDeserializedMatchingExpected( throws IOException { RandomAccessTermsDict deserialized = RandomAccessTermsDict.deserialize( - _fieldNumber -> result.indexOptions(), + new RandomAccessTermsDict.IndexOptionsProvider() { + @Override + public IndexOptions getIndexOptions(int fieldNumber) { + return result.indexOptions; + } + + @Override + public boolean hasPayloads(int fieldNumber) { + return result.hasPayloads(); + } + }, metaInput, termIndexInput, termDataInputProvider); @@ -158,9 +168,13 @@ private ExpectedResults indexOneField( int fieldNumber = nextFieldNumber++; IndexOptions indexOptions = IndexOptions.values()[random().nextInt(1, IndexOptions.values().length)]; + boolean hasPayloads = random().nextBoolean(); + if (indexOptions.ordinal() < IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + hasPayloads = false; + } RandomAccessTermsDictWriter randomAccessTermsDictWriter = new RandomAccessTermsDictWriter( - fieldNumber, indexOptions, 
metaOut, termIndexOut, outputProvider); + fieldNumber, indexOptions, hasPayloads, metaOut, termIndexOut, outputProvider); TermAndState[] expectedTermAndState = getRandoms(1000, 2000); int expectedDocCount = random().nextInt(1, 2000); @@ -169,12 +183,14 @@ private ExpectedResults indexOneField( randomAccessTermsDictWriter.add(x.term, x.state); } randomAccessTermsDictWriter.finish(expectedDocCount); - return new ExpectedResults(fieldNumber, indexOptions, expectedTermAndState, expectedDocCount); + return new ExpectedResults( + fieldNumber, indexOptions, hasPayloads, expectedTermAndState, expectedDocCount); } private record ExpectedResults( int fieldNumber, IndexOptions indexOptions, + boolean hasPayloads, TermAndState[] expectedTermAndState, int expectedDocCount) {} diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java index 6b316330ecad..e0cd887c10c6 100644 --- a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java @@ -73,7 +73,7 @@ public void testWriterAndDeserialize() throws IOException { } ByteSlice expectedDataSlice = new ByteArrayByteSlice(referenceBitPacker.getCompactBytes()); ByteSlice expectedMetadataSlice = new ByteArrayByteSlice(expectedMetadata); - TermData expected = new TermData(expectedMetadataSlice, expectedDataSlice); + TermData expected = new TermData(() -> expectedMetadataSlice, () -> expectedDataSlice); IndexInput metaIn = testDir.openInput("segment_meta", IOContext.DEFAULT); IndexInput metadataIn = testDir.openInput("term_meta_1", IOContext.DEFAULT); @@ -81,13 +81,15 @@ public void testWriterAndDeserialize() throws IOException { TermData actual = TermData.deserializeOnHeap(metaIn.clone(), metadataIn.clone(), dataIn.clone()); - 
assertByteSlice(expected.metadata(), actual.metadata()); - assertByteSlice(expected.data(), actual.data()); + assertByteSlice( + expected.metadataProvider().newByteSlice(), actual.metadataProvider().newByteSlice()); + assertByteSlice(expected.dataProvider().newByteSlice(), actual.dataProvider().newByteSlice()); testDecodeTermState(testFixture, actual); actual = TermData.deserializeOffHeap(metaIn.clone(), metadataIn.clone(), dataIn.clone()); - assertByteSlice(expected.metadata(), actual.metadata()); - assertByteSlice(expected.data(), actual.data()); + assertByteSlice( + expected.metadataProvider().newByteSlice(), actual.metadataProvider().newByteSlice()); + assertByteSlice(expected.dataProvider().newByteSlice(), actual.dataProvider().newByteSlice()); testDecodeTermState(testFixture, actual); metaIn.close(); diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java index db7630f1f35a..f9d1c416cda7 100644 --- a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java @@ -176,16 +176,24 @@ public void testGetCodec() { && indexOptions.ordinal() < IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { continue; } - TermType termType = TermType.fromId(i); - var expected = getExpectedCodec(termType, indexOptions); - var got = TermStateCodecImpl.getCodec(termType, indexOptions); - assertEquals(expected, got); + for (int dice = 0; dice < 2; dice++) { + boolean hasPayloads = dice == 0; + if (hasPayloads + && indexOptions.ordinal() < IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + continue; + } + TermType termType = TermType.fromId(i); + var expected = getExpectedCodec(termType, indexOptions, hasPayloads); + var got = 
TermStateCodecImpl.getCodec(termType, indexOptions, hasPayloads); + assertEquals(expected, got); + } } } } // Enumerate the expected Codec we get for (TermType, IndexOptions) pairs. - static TermStateCodecImpl getExpectedCodec(TermType termType, IndexOptions indexOptions) { + static TermStateCodecImpl getExpectedCodec( + TermType termType, IndexOptions indexOptions, boolean hasPayloads) { ArrayList components = new ArrayList<>(); // Wish I can code this better in java... switch (termType.getId()) { @@ -201,6 +209,9 @@ static TermStateCodecImpl getExpectedCodec(TermType termType, IndexOptions index } if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + if (hasPayloads) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } } if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { @@ -220,6 +231,9 @@ static TermStateCodecImpl getExpectedCodec(TermType termType, IndexOptions index } if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + if (hasPayloads) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } } if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { @@ -240,6 +254,9 @@ static TermStateCodecImpl getExpectedCodec(TermType termType, IndexOptions index } if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + if (hasPayloads) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } } if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { @@ -262,6 +279,9 @@ static TermStateCodecImpl getExpectedCodec(TermType termType, IndexOptions index components.add(TermStateCodecComponent.DocFreq.INSTANCE); 
components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + if (hasPayloads) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } components.add(TermStateCodecComponent.LastPositionBlockOffset.INSTANCE); if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { @@ -277,6 +297,9 @@ static TermStateCodecImpl getExpectedCodec(TermType termType, IndexOptions index components.add(TermStateCodecComponent.SingletonDocId.INSTANCE); components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + if (hasPayloads) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } components.add(TermStateCodecComponent.LastPositionBlockOffset.INSTANCE); if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { @@ -294,6 +317,9 @@ static TermStateCodecImpl getExpectedCodec(TermType termType, IndexOptions index components.add(TermStateCodecComponent.SkipOffset.INSTANCE); components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + if (hasPayloads) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } components.add(TermStateCodecComponent.LastPositionBlockOffset.INSTANCE); if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { From ea572a31bd42c5f3a8901540e33afe63f9f71ba9 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Tue, 21 Nov 2023 23:54:34 -0800 Subject: [PATCH 36/57] Fix bugs found in tests 1. handle terms with no docs (due to deletes) 2. carefully handle create/open index files to make sure not files left unclosed even in case of errors. 
--- .../Lucene99RandomAccessTermsReader.java | 57 +++++-- .../Lucene99RandomAccessTermsWriter.java | 144 ++++++++++-------- .../lucene99/randomaccess/TermsImpl.java | 3 - 3 files changed, 118 insertions(+), 86 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java index ab3285af297f..4079b0e5d779 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java @@ -22,6 +22,7 @@ import java.io.Closeable; import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import org.apache.lucene.codecs.CodecUtil; @@ -44,14 +45,26 @@ final class Lucene99RandomAccessTermsReader extends FieldsProducer { private final HashMap perFieldTermDict; + private boolean closed; + Lucene99RandomAccessTermsReader( Lucene99PostingsReader postingsReader, SegmentReadState segmentReadState) throws IOException { this.postingsReader = postingsReader; this.segmentReadState = segmentReadState; - this.indexFilesManager = new IndexFilesManager(); this.perFieldTermDict = new HashMap<>(); boolean success = false; + IndexFilesManager tmpIndexFilesManager = null; try { + boolean indexManagerInitSuccess = false; + try { + tmpIndexFilesManager = new IndexFilesManager(); + this.indexFilesManager = tmpIndexFilesManager; + indexManagerInitSuccess = true; + } finally { + if (!indexManagerInitSuccess) { + IOUtils.closeWhileHandlingException(tmpIndexFilesManager); + } + } int numFields = indexFilesManager.metaInfoIn.readVInt(); assert numFields > 0; for (int i = 0; i < numFields; i++) { @@ -71,12 +84,15 @@ public boolean hasPayloads(int fieldNumber) { indexFilesManager.metaInfoIn, 
indexFilesManager.termIndexIn, indexFilesManager); - FieldInfo fieldInfo = - segmentReadState.fieldInfos.fieldInfo(termsDict.termsStats().fieldNumber()); - String fieldName = fieldInfo.name; - perFieldTermDict.put(fieldName, new TermsImpl(fieldInfo, termsDict, postingsReader)); - success = true; + + if (termsDict.termsStats().size() > 0) { + FieldInfo fieldInfo = + segmentReadState.fieldInfos.fieldInfo(termsDict.termsStats().fieldNumber()); + String fieldName = fieldInfo.name; + perFieldTermDict.put(fieldName, new TermsImpl(fieldInfo, termsDict, postingsReader)); + } } + success = true; } finally { if (!success) { IOUtils.closeWhileHandlingException(this); @@ -86,11 +102,15 @@ public boolean hasPayloads(int fieldNumber) { @Override public void close() throws IOException { + if (closed) { + return; + } try { IOUtils.close(indexFilesManager, postingsReader); } finally { // The per-field term dictionary would be invalid once the underlying index files have been // closed. + closed = true; perFieldTermDict.clear(); } } @@ -116,22 +136,25 @@ public int size() { } class IndexFilesManager implements RandomAccessTermsDict.TermDataInputProvider, Closeable { - private final IndexInput metaInfoIn; + private IndexInput metaInfoIn; - private final IndexInput termIndexIn; + private IndexInput termIndexIn; private final HashMap termDataInputPerType; + private boolean closed; + + private final ArrayList openedInputs; + public IndexFilesManager() throws IOException { + termDataInputPerType = new HashMap<>(); + openedInputs = new ArrayList<>(); metaInfoIn = initMetaInfoInput(); termIndexIn = initTermIndexInput(); - termDataInputPerType = new HashMap<>(); } private IndexInput initMetaInfoInput() throws IOException { - final IndexInput tmp; - tmp = openAndChecksumIndexInputSafe(TERM_DICT_META_INFO_EXTENSION, false); - + final IndexInput tmp = openAndChecksumIndexInputSafe(TERM_DICT_META_INFO_EXTENSION, false); checkHeader(tmp, TERM_DICT_META_HEADER_CODEC_NAME); 
postingsReader.init(tmp, segmentReadState); postingsReader.checkIntegrity(); @@ -174,10 +197,11 @@ private IndexInput openAndChecksumIndexInputSafe( input = segmentReadState.directory.openInput( name, needRandomAccess ? IOContext.LOAD : IOContext.READ); + openedInputs.add(input); success = true; } finally { if (!success) { - IOUtils.closeWhileHandlingException(input); + IOUtils.closeWhileHandlingException(input, this); } } CodecUtil.checksumEntireFile(input); @@ -207,10 +231,11 @@ public RandomAccessTermsDict.TermDataInput getTermDataInputForType(TermType term @Override public void close() throws IOException { - IOUtils.close(metaInfoIn, termIndexIn); - for (var x : termDataInputPerType.values()) { - IOUtils.close(x.metadataInput(), x.dataInput()); + if (this.closed) { + return; } + this.closed = true; + IOUtils.close(openedInputs); } } } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java index bc6aebf1a8de..3fd7fdcf111c 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java @@ -20,6 +20,7 @@ import java.io.Closeable; import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.FieldsConsumer; @@ -53,7 +54,17 @@ public Lucene99RandomAccessTermsWriter( throws IOException { this.segmentWriteState = segmentWriteState; this.postingsWriter = postingsWriter; - this.indexFilesManager = new IndexFilesManager(); + IndexFilesManager tmpIndexFilesManager = null; + boolean indexManagerInitSuccess = false; + try { + tmpIndexFilesManager = new IndexFilesManager(); + this.indexFilesManager = 
tmpIndexFilesManager; + indexManagerInitSuccess = true; + } finally { + if (!indexManagerInitSuccess) { + IOUtils.closeWhileHandlingException(tmpIndexFilesManager, this); + } + } } @Override @@ -65,37 +76,48 @@ public void write(Fields fields, NormsProducer norms) throws IOException { nonEmptyFields.put(field, terms); } } - indexFilesManager.metaInfoOut.writeVInt(nonEmptyFields.size()); - - FixedBitSet docSeen = new FixedBitSet(segmentWriteState.segmentInfo.maxDoc()); - for (var entry : nonEmptyFields.entrySet()) { - TermsEnum termsEnum = entry.getValue().iterator(); - FieldInfo fieldInfo = segmentWriteState.fieldInfos.fieldInfo(entry.getKey()); - RandomAccessTermsDictWriter termsDictWriter = - new RandomAccessTermsDictWriter( - fieldInfo.number, - fieldInfo.getIndexOptions(), - fieldInfo.hasPayloads(), - indexFilesManager.metaInfoOut, - indexFilesManager.termIndexOut, - indexFilesManager); - postingsWriter.setField(fieldInfo); - - docSeen.clear(); - while (true) { - BytesRef term = termsEnum.next(); - if (term == null) { - break; - } + boolean success = false; + try { + indexFilesManager.writeAllHeaders(); + postingsWriter.init(indexFilesManager.metaInfoOut, segmentWriteState); + indexFilesManager.metaInfoOut.writeVInt(nonEmptyFields.size()); + + FixedBitSet docSeen = new FixedBitSet(segmentWriteState.segmentInfo.maxDoc()); + for (var entry : nonEmptyFields.entrySet()) { + TermsEnum termsEnum = entry.getValue().iterator(); + FieldInfo fieldInfo = segmentWriteState.fieldInfos.fieldInfo(entry.getKey()); + RandomAccessTermsDictWriter termsDictWriter = + new RandomAccessTermsDictWriter( + fieldInfo.number, + fieldInfo.getIndexOptions(), + fieldInfo.hasPayloads(), + indexFilesManager.metaInfoOut, + indexFilesManager.termIndexOut, + indexFilesManager); + postingsWriter.setField(fieldInfo); + + docSeen.clear(); + while (true) { + BytesRef term = termsEnum.next(); + if (term == null) { + break; + } - IntBlockTermState termState = - (IntBlockTermState) 
postingsWriter.writeTerm(term, termsEnum, docSeen, norms); - // TermState can be null - if (termState != null) { - termsDictWriter.add(term, termState); + IntBlockTermState termState = + (IntBlockTermState) postingsWriter.writeTerm(term, termsEnum, docSeen, norms); + // TermState can be null + if (termState != null) { + termsDictWriter.add(term, termState); + } } + termsDictWriter.finish(docSeen.cardinality()); + } + indexFilesManager.writeAllFooters(); + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this); } - termsDictWriter.finish(docSeen.cardinality()); } } @@ -121,25 +143,24 @@ class IndexFilesManager implements RandomAccessTermsDictWriter.TermDataOutputPro private final HashMap termDataOutputPerType; + private boolean closed; + + private final ArrayList openedOutputs; + public IndexFilesManager() throws IOException { - metaInfoOut = initMetaInfoOutput(); - termIndexOut = initTermIndexOutput(); // populate the per-TermType term data outputs on-demand. 
termDataOutputPerType = new HashMap<>(); + openedOutputs = new ArrayList<>(); + metaInfoOut = initMetaInfoOutput(); + termIndexOut = initTermIndexOutput(); } private IndexOutput initMetaInfoOutput() throws IOException { - final IndexOutput tmp; - tmp = getIndexOutputSafe(TERM_DICT_META_INFO_EXTENSION); - writeHeader(tmp, TERM_DICT_META_HEADER_CODEC_NAME); - postingsWriter.init(tmp, segmentWriteState); - return tmp; + return getIndexOutputSafe(TERM_DICT_META_INFO_EXTENSION); } private IndexOutput initTermIndexOutput() throws IOException { - final IndexOutput tmp = getIndexOutputSafe(TERM_INDEX_EXTENSION); - writeHeader(tmp, TERM_INDEX_HEADER_CODEC_NAME); - return tmp; + return getIndexOutputSafe(TERM_INDEX_EXTENSION); } private TermDataOutput initTermDataOutput(TermType termType) throws IOException { @@ -169,6 +190,7 @@ private IndexOutput getIndexOutputSafe(String segmentLocalName) throws IOExcepti IndexOutput output = null; try { output = segmentWriteState.directory.createOutput(name, segmentWriteState.context); + openedOutputs.add(output); success = true; } finally { if (!success) { @@ -187,6 +209,17 @@ private void writeHeader(IndexOutput output, String headerName) throws IOExcepti segmentWriteState.segmentSuffix); } + private void writeAllHeaders() throws IOException { + writeHeader(metaInfoOut, TERM_DICT_META_HEADER_CODEC_NAME); + writeHeader(termIndexOut, TERM_INDEX_HEADER_CODEC_NAME); + } + + private void writeAllFooters() throws IOException { + for (var x : openedOutputs) { + CodecUtil.writeFooter(x); + } + } + @Override public TermDataOutput getTermDataOutputForType(TermType termType) throws IOException { TermDataOutput current = termDataOutputPerType.get(termType); @@ -197,36 +230,13 @@ public TermDataOutput getTermDataOutputForType(TermType termType) throws IOExcep return current; } - /** - * Write footers for all created index files and close them. - * - *

Assume all index files are valid upto time of calling. - */ + @Override public void close() throws IOException { - boolean success = false; - try { - CodecUtil.writeFooter(metaInfoOut); - CodecUtil.writeFooter(termIndexOut); - for (var termDataOutput : termDataOutputPerType.values()) { - CodecUtil.writeFooter(termDataOutput.metadataOutput()); - CodecUtil.writeFooter(termDataOutput.dataOutput()); - } - success = true; - } finally { - if (success) { - IOUtils.close(metaInfoOut, termIndexOut); - for (var termDataOutput : termDataOutputPerType.values()) { - IOUtils.close(termDataOutput.metadataOutput()); - IOUtils.close(termDataOutput.dataOutput()); - } - } else { - IOUtils.closeWhileHandlingException(metaInfoOut, termIndexOut); - for (var termDataOutput : termDataOutputPerType.values()) { - IOUtils.closeWhileHandlingException( - termDataOutput.metadataOutput(), termDataOutput.dataOutput()); - } - } + if (this.closed) { + return; } + this.closed = true; + IOUtils.close(openedOutputs); } } } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java index 36e861aaf6f8..c4a7aff819c7 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -101,9 +101,6 @@ public BytesRef getMax() throws IOException { @Override public TermsEnum iterator() throws IOException { - if (size() == 0) { - return TermsEnum.EMPTY; - } return new RandomAccessTermsEnum(); } From 5a8efd34de3a235ba02f85561247816fa2eaa6cf Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Wed, 22 Nov 2023 00:49:57 -0800 Subject: [PATCH 37/57] Reduce index cloning calls when looking up terms Only clone per-TermEnum. 
Note: I had to increase the threshold as this PostingsFormat has more index files per segment Before: java.lang.AssertionError: too many calls to IndexInput.clone during TermRangeQuery: 2878 After: 70 --- .../randomaccess/ByteSliceProvider.java | 26 +++++++ .../randomaccess/RandomAccessTermsDict.java | 5 +- .../lucene99/randomaccess/TermData.java | 55 +-------------- .../randomaccess/TermDataProvider.java | 67 +++++++++++++++++++ .../lucene99/randomaccess/TermDataReader.java | 42 +++++++++--- .../lucene99/randomaccess/TermsImpl.java | 11 ++- .../TestRandomAccessTermsDictWriter.java | 3 +- .../randomaccess/TestTermDataWriter.java | 31 +++++---- .../lucene/index/TestForTooMuchCloning.java | 2 +- 9 files changed, 158 insertions(+), 84 deletions(-) create mode 100644 lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSliceProvider.java create mode 100644 lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataProvider.java diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSliceProvider.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSliceProvider.java new file mode 100644 index 000000000000..7d18abc5e0a4 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSliceProvider.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; + +/** Factory of {@link ByteSlice} */ +@FunctionalInterface +interface ByteSliceProvider { + ByteSlice newByteSlice() throws IOException; +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java index 1d1c3e194f40..712c832d93c5 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java @@ -31,9 +31,10 @@ record RandomAccessTermsDict( TermDataReader termDataReader, IndexOptions indexOptions) { - IntBlockTermState getTermState(BytesRef term) throws IOException { + IntBlockTermState getTermState(BytesRef term, TermData[] termDataPerType) throws IOException { TermsIndex.TypeAndOrd typeAndOrd = termsIndex.getTerm(term); - return termDataReader.getTermState(typeAndOrd.termType(), typeAndOrd.ord(), indexOptions); + return termDataReader.getTermState( + typeAndOrd.termType(), typeAndOrd.ord(), indexOptions, termDataPerType); } static RandomAccessTermsDict deserialize( diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java index 3860ba1a3f4b..6eba1a0cd51c 100644 --- 
a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java @@ -20,20 +20,14 @@ import java.io.IOException; import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpackerImpl; -import org.apache.lucene.store.DataInput; -import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.BytesRef; /** * Holds the bit-packed {@link IntBlockTermState} for a given {@link * org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermType} */ -record TermData(ByteSliceProvider metadataProvider, ByteSliceProvider dataProvider) { - +record TermData(ByteSlice metadata, ByteSlice data) { IntBlockTermState getTermState(TermStateCodec codec, long ord) throws IOException { - var metadata = metadataProvider.newByteSlice(); - var data = dataProvider.newByteSlice(); - long blockId = ord / TermDataWriter.NUM_TERMS_PER_BLOCK; long metadataStartPos = blockId * (codec.getMetadataBytesLength() + 8); long dataStartPos = metadata.getLong(metadataStartPos); @@ -52,51 +46,4 @@ IntBlockTermState getTermState(TermStateCodec codec, long ord) throws IOExceptio return codec.decodeAt(metadataBytesRef, dataBytesRef, BitUnpackerImpl.INSTANCE, startBitIndex); } - - static TermData deserializeOnHeap( - DataInput metaInput, DataInput metadataInput, DataInput dataInput) throws IOException { - long metadataSize = metaInput.readVLong(); - long dataSize = metaInput.readVLong(); - - if (metadataSize > Integer.MAX_VALUE) { - throw new IllegalArgumentException( - "Metadata size it too large to store on heap. 
Must be less than " + Integer.MAX_VALUE); - } - if (dataSize > Integer.MAX_VALUE) { - throw new IllegalArgumentException( - "Data size it too large to store on heap.Must be less than " + Integer.MAX_VALUE); - } - - byte[] metadataBytes = new byte[(int) metadataSize]; - byte[] dataBytes = new byte[(int) dataSize]; - - metadataInput.readBytes(metadataBytes, 0, metadataBytes.length); - dataInput.readBytes(dataBytes, 0, dataBytes.length); - - return new TermData( - () -> new ByteArrayByteSlice(metadataBytes), () -> new ByteArrayByteSlice(dataBytes)); - } - - static TermData deserializeOffHeap( - DataInput metaInput, IndexInput metadataInput, IndexInput dataInput) throws IOException { - final long metadataSize = metaInput.readVLong(); - final long dataSize = metaInput.readVLong(); - - final long metadataStart = metadataInput.getFilePointer(); - final long dataStart = dataInput.getFilePointer(); - - metadataInput.skipBytes(metadataSize); - dataInput.skipBytes(dataSize); - - return new TermData( - () -> - new RandomAccessInputByteSlice( - metadataInput.randomAccessSlice(metadataStart, metadataSize)), - () -> new RandomAccessInputByteSlice(dataInput.randomAccessSlice(dataStart, dataSize))); - } - - @FunctionalInterface - interface ByteSliceProvider { - ByteSlice newByteSlice() throws IOException; - } } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataProvider.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataProvider.java new file mode 100644 index 000000000000..130094016c5d --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataProvider.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.IndexInput; + +/** Factory class to produce instances of TermData */ +record TermDataProvider(ByteSliceProvider metadataProvider, ByteSliceProvider dataProvider) { + static TermDataProvider deserializeOnHeap( + DataInput metaInput, DataInput metadataInput, DataInput dataInput) throws IOException { + long metadataSize = metaInput.readVLong(); + long dataSize = metaInput.readVLong(); + + if (metadataSize > Integer.MAX_VALUE) { + throw new IllegalArgumentException( + "Metadata size it too large to store on heap. 
Must be less than " + Integer.MAX_VALUE); + } + if (dataSize > Integer.MAX_VALUE) { + throw new IllegalArgumentException( + "Data size it too large to store on heap.Must be less than " + Integer.MAX_VALUE); + } + + byte[] metadataBytes = new byte[(int) metadataSize]; + byte[] dataBytes = new byte[(int) dataSize]; + + metadataInput.readBytes(metadataBytes, 0, metadataBytes.length); + dataInput.readBytes(dataBytes, 0, dataBytes.length); + + return new TermDataProvider( + () -> new ByteArrayByteSlice(metadataBytes), () -> new ByteArrayByteSlice(dataBytes)); + } + + static TermDataProvider deserializeOffHeap( + DataInput metaInput, IndexInput metadataInput, IndexInput dataInput) throws IOException { + final long metadataSize = metaInput.readVLong(); + final long dataSize = metaInput.readVLong(); + + final long metadataStart = metadataInput.getFilePointer(); + final long dataStart = dataInput.getFilePointer(); + + metadataInput.skipBytes(metadataSize); + dataInput.skipBytes(dataSize); + + return new TermDataProvider( + () -> + new RandomAccessInputByteSlice( + metadataInput.randomAccessSlice(metadataStart, metadataSize)), + () -> new RandomAccessInputByteSlice(dataInput.randomAccessSlice(dataStart, dataSize))); + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java index fd5a44fc76b1..7d9b701f9ab1 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java @@ -27,13 +27,17 @@ * Holds all {@link TermData} per {@link TermType} for a field. Also manages the proper codec needed * per TermType. 
*/ -record TermDataReader(TermDataAndCodec[] termDataAndCodecs) { +record TermDataReader(TermDataProviderAndCodec[] termDataProviderAndCodecs) { - IntBlockTermState getTermState(TermType termType, long ord, IndexOptions indexOptions) + IntBlockTermState getTermState( + TermType termType, long ord, IndexOptions indexOptions, TermData[] termDataPerType) throws IOException { - assert termDataAndCodecs[termType.getId()] != null; - var dataAndCodec = termDataAndCodecs[termType.getId()]; - IntBlockTermState termState = dataAndCodec.termData.getTermState(dataAndCodec.codec, ord); + assert termDataProviderAndCodecs[termType.getId()] != null; + assert termDataPerType.length == termDataProviderAndCodecs.length; + assert termDataPerType[termType.getId()] != null; + + var codec = termDataProviderAndCodecs[termType.getId()].codec; + IntBlockTermState termState = termDataPerType[termType.getId()].getTermState(codec, ord); // need to filling some default values for the term state // in order to meet the expectations of the postings reader @@ -61,10 +65,26 @@ IntBlockTermState getTermState(TermType termType, long ord, IndexOptions indexOp return termState; } + TermData[] newPerTypeTermDataReference() throws IOException { + TermData[] result = new TermData[termDataProviderAndCodecs.length]; + for (int i = 0; i < result.length; i++) { + if (termDataProviderAndCodecs[i] == null) { + continue; + } + TermDataProvider termDataProvider = termDataProviderAndCodecs[i].termDataProvider; + result[i] = + new TermData( + termDataProvider.metadataProvider().newByteSlice(), + termDataProvider.dataProvider().newByteSlice()); + } + return result; + } + static class Builder { final IndexOptions indexOptions; final boolean hasPayloads; - final TermDataAndCodec[] termDataAndCodecs = new TermDataAndCodec[TermType.NUM_TOTAL_TYPES]; + final TermDataProviderAndCodec[] termDataProviderAndCodecs = + new TermDataProviderAndCodec[TermType.NUM_TOTAL_TYPES]; Builder(IndexOptions indexOptions, boolean 
hasPayloads) { this.indexOptions = indexOptions; @@ -74,15 +94,17 @@ static class Builder { void readOne( TermType termType, DataInput metaIn, IndexInput termMetadataIn, IndexInput termDataIn) throws IOException { - TermData termData = TermData.deserializeOffHeap(metaIn, termMetadataIn, termDataIn); + TermDataProvider termDataProvider = + TermDataProvider.deserializeOffHeap(metaIn, termMetadataIn, termDataIn); TermStateCodec codec = TermStateCodecImpl.getCodec(termType, indexOptions, hasPayloads); - termDataAndCodecs[termType.getId()] = new TermDataAndCodec(termData, codec); + termDataProviderAndCodecs[termType.getId()] = + new TermDataProviderAndCodec(termDataProvider, codec); } TermDataReader build() { - return new TermDataReader(termDataAndCodecs); + return new TermDataReader(termDataProviderAndCodecs); } } - record TermDataAndCodec(TermData termData, TermStateCodec codec) {} + record TermDataProviderAndCodec(TermDataProvider termDataProvider, TermStateCodec codec) {} } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java index c4a7aff819c7..c521cba8a8ce 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -129,9 +129,12 @@ final class RandomAccessTermsEnum extends TermsEnum { // We need to re-seek in next() calls to catch up to that term. 
private boolean needReSeekInNext; - RandomAccessTermsEnum() { + private TermData[] perTypeTermData; + + RandomAccessTermsEnum() throws IOException { termState = (IntBlockTermState) lucene99PostingsReader.newTermState(); fstEnum = new BytesRefFSTEnum<>(termsDict.termsIndex().fst()); + perTypeTermData = termsDict.termDataReader().newPerTypeTermDataReference(); } void updateTermStateIfNeeded() throws IOException { @@ -140,7 +143,11 @@ void updateTermStateIfNeeded() throws IOException { termState = termsDict .termDataReader() - .getTermState(typeAndOrd.termType(), typeAndOrd.ord(), fieldInfo.getIndexOptions()); + .getTermState( + typeAndOrd.termType(), + typeAndOrd.ord(), + fieldInfo.getIndexOptions(), + perTypeTermData); isTermStateCurrent = true; } } diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java index 226a4700813c..b02d4de0cebf 100644 --- a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java @@ -138,9 +138,10 @@ public boolean hasPayloads(int fieldNumber) { result.expectedTermAndState()[result.expectedTermAndState().length - 1].term, deserialized.termsStats().maxTerm()); + TermData[] perTypeTermData = deserialized.termDataReader().newPerTypeTermDataReference(); for (var x : result.expectedTermAndState()) { IntBlockTermState expectedState = x.state; - IntBlockTermState actualState = deserialized.getTermState(x.term); + IntBlockTermState actualState = deserialized.getTermState(x.term, perTypeTermData); if (expectedState.singletonDocID != -1) { assertEquals(expectedState.singletonDocID, actualState.singletonDocID); } else { diff --git 
a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java index e0cd887c10c6..fc1b7b0f269b 100644 --- a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java @@ -73,24 +73,23 @@ public void testWriterAndDeserialize() throws IOException { } ByteSlice expectedDataSlice = new ByteArrayByteSlice(referenceBitPacker.getCompactBytes()); ByteSlice expectedMetadataSlice = new ByteArrayByteSlice(expectedMetadata); - TermData expected = new TermData(() -> expectedMetadataSlice, () -> expectedDataSlice); + TermData expected = new TermData(expectedMetadataSlice, expectedDataSlice); IndexInput metaIn = testDir.openInput("segment_meta", IOContext.DEFAULT); IndexInput metadataIn = testDir.openInput("term_meta_1", IOContext.DEFAULT); IndexInput dataIn = testDir.openInput("term_data_11", IOContext.DEFAULT); - TermData actual = - TermData.deserializeOnHeap(metaIn.clone(), metadataIn.clone(), dataIn.clone()); - assertByteSlice( - expected.metadataProvider().newByteSlice(), actual.metadataProvider().newByteSlice()); - assertByteSlice(expected.dataProvider().newByteSlice(), actual.dataProvider().newByteSlice()); - testDecodeTermState(testFixture, actual); + TermDataProvider actualProvider = + TermDataProvider.deserializeOnHeap(metaIn.clone(), metadataIn.clone(), dataIn.clone()); + assertByteSlice(expected.metadata(), actualProvider.metadataProvider().newByteSlice()); + assertByteSlice(expected.data(), actualProvider.dataProvider().newByteSlice()); + testDecodeTermState(testFixture, actualProvider); - actual = TermData.deserializeOffHeap(metaIn.clone(), metadataIn.clone(), dataIn.clone()); - assertByteSlice( - expected.metadataProvider().newByteSlice(), 
actual.metadataProvider().newByteSlice()); - assertByteSlice(expected.dataProvider().newByteSlice(), actual.dataProvider().newByteSlice()); - testDecodeTermState(testFixture, actual); + actualProvider = + TermDataProvider.deserializeOnHeap(metaIn.clone(), metadataIn.clone(), dataIn.clone()); + assertByteSlice(expected.metadata(), actualProvider.metadataProvider().newByteSlice()); + assertByteSlice(expected.data(), actualProvider.dataProvider().newByteSlice()); + testDecodeTermState(testFixture, actualProvider); metaIn.close(); metadataIn.close(); @@ -98,8 +97,12 @@ public void testWriterAndDeserialize() throws IOException { } } - private static void testDecodeTermState(TermStateTestFixture testFixture, TermData actual) - throws IOException { + private static void testDecodeTermState( + TermStateTestFixture testFixture, TermDataProvider actualProvider) throws IOException { + TermData actual = + new TermData( + actualProvider.metadataProvider().newByteSlice(), + actualProvider.dataProvider().newByteSlice()); for (int i = 0; i < testFixture.termStatesArray().length; i++) { IntBlockTermState expectedTermState = testFixture.termStatesArray()[i]; IntBlockTermState decoded = actual.getTermState(testFixture.codec(), i); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java b/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java index 7c72b3d2e76a..97454969be90 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java @@ -80,7 +80,7 @@ public void test() throws Exception { // System.out.println("query clone count=" + queryCloneCount); assertTrue( "too many calls to IndexInput.clone during TermRangeQuery: " + queryCloneCount, - queryCloneCount < 50); + queryCloneCount < 100); r.close(); dir.close(); } From 8ab91393fe7e20919ed5bb3c5536eecfe161c5ba Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Wed, 22 Nov 2023 10:19:17 -0800 
Subject: [PATCH 38/57] Add Lucene99RandomAccessTermDictCodec --- lucene/codecs/src/java/module-info.java | 3 +- .../Lucene99RandomAccessTermDictCodec.java | 42 +++++++++++++++++++ .../services/org.apache.lucene.codecs.Codec | 1 + 3 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermDictCodec.java diff --git a/lucene/codecs/src/java/module-info.java b/lucene/codecs/src/java/module-info.java index a128950ddb56..b7e75f5917c0 100644 --- a/lucene/codecs/src/java/module-info.java +++ b/lucene/codecs/src/java/module-info.java @@ -39,5 +39,6 @@ org.apache.lucene.sandbox.codecs.lucene99.randomaccess .Lucene99RandomAccessDictionaryPostingsFormat; provides org.apache.lucene.codecs.Codec with - org.apache.lucene.codecs.simpletext.SimpleTextCodec; + org.apache.lucene.codecs.simpletext.SimpleTextCodec, + org.apache.lucene.sandbox.codecs.lucene99.randomaccess.Lucene99RandomAccessTermDictCodec; } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermDictCodec.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermDictCodec.java new file mode 100644 index 000000000000..edb6265c974a --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermDictCodec.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99Codec; +import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; + +public class Lucene99RandomAccessTermDictCodec extends FilterCodec { + private final Lucene99RandomAccessDictionaryPostingsFormat lucene99RandomAccessPostingsFormat = + new Lucene99RandomAccessDictionaryPostingsFormat(); + + public Lucene99RandomAccessTermDictCodec() { + super("Lucene99RandomAccessTermDict", new Lucene99Codec()); + } + + @Override + public PostingsFormat postingsFormat() { + return new PerFieldPostingsFormat() { + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return lucene99RandomAccessPostingsFormat; + } + }; + } +} diff --git a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index fcd5ded3605c..bf0e25322963 100644 --- a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -14,3 +14,4 @@ # limitations under the License. 
org.apache.lucene.codecs.simpletext.SimpleTextCodec +org.apache.lucene.sandbox.codecs.lucene99.randomaccess.Lucene99RandomAccessTermDictCodec From 10d4181cd837afe3fcd8e837cc16bc90e70ae25b Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Wed, 22 Nov 2023 10:35:04 -0800 Subject: [PATCH 39/57] Fix build after merging from apache:main --- .../randomaccess/RandomAccessTermsDictWriter.java | 6 ++++-- .../sandbox/codecs/lucene99/randomaccess/TermsIndex.java | 8 ++++++-- .../codecs/lucene99/randomaccess/TermsIndexBuilder.java | 7 ++++--- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java index 6a8a4a6a5f74..5002f81c03ea 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java @@ -40,7 +40,7 @@ final class RandomAccessTermsDictWriter { private final TermDataOutput[] termDataOutputPerType = new TermDataOutput[TermType.NUM_TOTAL_TYPES]; - private final TermsIndexBuilder termsIndexBuilder = new TermsIndexBuilder(); + private final TermsIndexBuilder termsIndexBuilder; private final TermDataWriter[] termDataWriterPerType = new TermDataWriter[TermType.NUM_TOTAL_TYPES]; @@ -55,13 +55,15 @@ final class RandomAccessTermsDictWriter { boolean hasPayloads, DataOutput metaOutput, DataOutput indexOutput, - TermDataOutputProvider termDataOutputProvider) { + TermDataOutputProvider termDataOutputProvider) + throws IOException { this.indexOptions = indexOptions; this.hasPayloads = hasPayloads; this.metaOutput = metaOutput; this.indexOutput = indexOutput; this.termDataOutputProvider = termDataOutputProvider; this.termStatsTracker = new TermStatsTracker(filedNumber); + 
this.termsIndexBuilder = new TermsIndexBuilder(); } void add(BytesRef term, IntBlockTermState termState) throws IOException { diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java index 9474a82bef78..a802026f9cb2 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java @@ -52,10 +52,14 @@ static TermsIndex deserialize(DataInput metaIn, DataInput dataIn, boolean loadOf FST fst; if (loadOffHeap) { var fstStore = new OffHeapFSTStore(); - fst = new FST<>(metaIn, dataIn.clone(), PositiveIntOutputs.getSingleton(), fstStore); + fst = + new FST<>( + FST.readMetadata(metaIn, PositiveIntOutputs.getSingleton()), + dataIn.clone(), + fstStore); dataIn.skipBytes(fstStore.size()); } else { - fst = new FST<>(metaIn, dataIn, PositiveIntOutputs.getSingleton()); + fst = new FST<>(FST.readMetadata(metaIn, PositiveIntOutputs.getSingleton()), dataIn); } return new TermsIndex(fst); } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java index d142420d4470..35dd42e81cd5 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java @@ -34,10 +34,11 @@ final class TermsIndexBuilder { private static final long MAX_ORD = (1L << 60) - 1; private final long[] countPerType = new long[TermType.NUM_TOTAL_TYPES]; - private final FSTCompiler fstCompiler = - new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()).build(); + private final FSTCompiler fstCompiler; - 
TermsIndexBuilder() { + TermsIndexBuilder() throws IOException { + fstCompiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()).build(); Arrays.fill(countPerType, -1); } From ac1b77ff167bbb3e7967c5f78bbab3a8e1123917 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Wed, 22 Nov 2023 11:15:09 -0800 Subject: [PATCH 40/57] Add missing javadoc --- .../randomaccess/Lucene99RandomAccessTermDictCodec.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermDictCodec.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermDictCodec.java index edb6265c974a..255da4ed80cb 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermDictCodec.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermDictCodec.java @@ -22,6 +22,10 @@ import org.apache.lucene.codecs.lucene99.Lucene99Codec; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; +/** + * A Codec that uses {@link Lucene99RandomAccessDictionaryPostingsFormat} on top of {@link + * Lucene99Codec} + */ public class Lucene99RandomAccessTermDictCodec extends FilterCodec { private final Lucene99RandomAccessDictionaryPostingsFormat lucene99RandomAccessPostingsFormat = new Lucene99RandomAccessDictionaryPostingsFormat(); From aa0074d9139114179e6d2e271d5382eb3d856d20 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Fri, 24 Nov 2023 20:17:32 -0800 Subject: [PATCH 41/57] Optimize for less allocation --- .../randomaccess/ByteArrayByteSlice.java | 8 ++ .../lucene99/randomaccess/ByteSlice.java | 2 + .../RandomAccessInputByteSlice.java | 8 ++ .../randomaccess/RandomAccessTermsDict.java | 14 +- .../lucene99/randomaccess/TermData.java | 22 +++ .../lucene99/randomaccess/TermDataReader.java | 110 --------------- 
.../randomaccess/TermDataReaderProvider.java | 125 ++++++++++++++++++ .../lucene99/randomaccess/TermStateCodec.java | 3 + .../randomaccess/TermStateCodecImpl.java | 20 ++- .../lucene99/randomaccess/TermsImpl.java | 13 +- .../TestRandomAccessTermsDictWriter.java | 3 +- 11 files changed, 200 insertions(+), 128 deletions(-) delete mode 100644 lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java create mode 100644 lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java index 55139ebf3a32..269d1e4753ec 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java @@ -52,4 +52,12 @@ public byte[] getBytes(long pos, int length) { System.arraycopy(bytes, (int) pos, result, 0, length); return result; } + + @Override + public void readBytesTo(byte[] destination, long pos, int length) { + if (length == 0) { + return; + } + System.arraycopy(bytes, (int) pos, destination, 0, length); + } } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java index 937e915e3325..1a3a8a8f0f96 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java @@ -29,4 +29,6 @@ interface ByteSlice { long getLong(long pos) throws IOException; byte[] getBytes(long pos, int length) throws IOException; + + void readBytesTo(byte[] destination, long pos, int 
length) throws IOException; } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java index 3d80e50dd383..845b0f22aed4 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java @@ -55,4 +55,12 @@ public byte[] getBytes(long pos, int length) throws IOException { randomAccessInput.readBytes(pos, result, 0, length); return result; } + + @Override + public void readBytesTo(byte[] destination, long pos, int length) throws IOException { + if (length == 0) { + return; + } + randomAccessInput.readBytes(pos, destination, 0, length); + } } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java index 712c832d93c5..f767c2d4ed99 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java @@ -28,13 +28,15 @@ record RandomAccessTermsDict( TermsStats termsStats, TermsIndex termsIndex, - TermDataReader termDataReader, + TermDataReaderProvider termDataReaderProvider, IndexOptions indexOptions) { - IntBlockTermState getTermState(BytesRef term, TermData[] termDataPerType) throws IOException { + /** test only * */ + IntBlockTermState getTermState(BytesRef term) throws IOException { TermsIndex.TypeAndOrd typeAndOrd = termsIndex.getTerm(term); - return termDataReader.getTermState( - typeAndOrd.termType(), typeAndOrd.ord(), indexOptions, termDataPerType); + return termDataReaderProvider + .newReader() + 
.getTermState(typeAndOrd.termType(), typeAndOrd.ord(), indexOptions); } static RandomAccessTermsDict deserialize( @@ -60,8 +62,8 @@ static RandomAccessTermsDict deserialize( int numTermTypes = metaInput.readByte(); // (3.2) read per TermType - TermDataReader.Builder termDataReaderBuilder = - new TermDataReader.Builder(indexOptions, hasPayloads); + TermDataReaderProvider.Builder termDataReaderBuilder = + new TermDataReaderProvider.Builder(indexOptions, hasPayloads); for (int i = 0; i < numTermTypes; i++) { TermType termType = TermType.fromId(metaInput.readByte()); TermDataInput termDataInput = termDataInputProvider.getTermDataInputForType(termType); diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java index 6eba1a0cd51c..c72bef50451e 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java @@ -46,4 +46,26 @@ IntBlockTermState getTermState(TermStateCodec codec, long ord) throws IOExceptio return codec.decodeAt(metadataBytesRef, dataBytesRef, BitUnpackerImpl.INSTANCE, startBitIndex); } + + IntBlockTermState getTermStateWithBuffer( + TermStateCodec codec, long ord, byte[] metaDataBuffer, byte[] dataBuffer) throws IOException { + long blockId = ord / TermDataWriter.NUM_TERMS_PER_BLOCK; + long metadataStartPos = blockId * (codec.getMetadataBytesLength() + 8); + long dataStartPos = metadata.getLong(metadataStartPos); + + metadata.readBytesTo(metaDataBuffer, metadataStartPos + 8, codec.getMetadataBytesLength()); + BytesRef metadataBytesRef = new BytesRef(metaDataBuffer); + + int numBitsPerRecord = codec.getNumBitsPerRecord(metadataBytesRef); + int dataBitIndex = numBitsPerRecord * ((int) (ord % TermDataWriter.NUM_TERMS_PER_BLOCK)); + int startBitIndex = dataBitIndex % 8; + int 
numBytesToRead = (startBitIndex + numBitsPerRecord) / 8; + if ((startBitIndex + numBitsPerRecord) % 8 > 0) { + numBytesToRead += 1; + } + data.readBytesTo(dataBuffer, dataStartPos + dataBitIndex / 8, numBytesToRead); + BytesRef dataBytesRef = new BytesRef(dataBuffer, 0, numBytesToRead); + + return codec.decodeAt(metadataBytesRef, dataBytesRef, BitUnpackerImpl.INSTANCE, startBitIndex); + } } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java deleted file mode 100644 index 7d9b701f9ab1..000000000000 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; - -import java.io.IOException; -import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; -import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.store.DataInput; -import org.apache.lucene.store.IndexInput; - -/** - * Holds all {@link TermData} per {@link TermType} for a field. 
Also manages the proper codec needed - * per TermType. - */ -record TermDataReader(TermDataProviderAndCodec[] termDataProviderAndCodecs) { - - IntBlockTermState getTermState( - TermType termType, long ord, IndexOptions indexOptions, TermData[] termDataPerType) - throws IOException { - assert termDataProviderAndCodecs[termType.getId()] != null; - assert termDataPerType.length == termDataProviderAndCodecs.length; - assert termDataPerType[termType.getId()] != null; - - var codec = termDataProviderAndCodecs[termType.getId()].codec; - IntBlockTermState termState = termDataPerType[termType.getId()].getTermState(codec, ord); - - // need to filling some default values for the term state - // in order to meet the expectations of the postings reader - if (termType.hasSingletonDoc()) { - termState.docFreq = 1; - } - if (termType.hasSkipData() == false) { - termState.skipOffset = -1; - } - if (termType.hasLastPositionBlockOffset() == false) { - termState.lastPosBlockOffset = -1; - } - - /* There is interesting conventions to follow... - *

-     *     org.apache.lucene.index.CheckIndex$CheckIndexException:
-     *     field "id" hasFreqs is false, but TermsEnum.totalTermFreq()=0 (should be 1)
-     * 
- */ - // for field that do not have freq enabled, as if each posting only has one occurrence. - if (indexOptions.ordinal() < IndexOptions.DOCS_AND_FREQS.ordinal()) { - termState.totalTermFreq = termState.docFreq; - } - - return termState; - } - - TermData[] newPerTypeTermDataReference() throws IOException { - TermData[] result = new TermData[termDataProviderAndCodecs.length]; - for (int i = 0; i < result.length; i++) { - if (termDataProviderAndCodecs[i] == null) { - continue; - } - TermDataProvider termDataProvider = termDataProviderAndCodecs[i].termDataProvider; - result[i] = - new TermData( - termDataProvider.metadataProvider().newByteSlice(), - termDataProvider.dataProvider().newByteSlice()); - } - return result; - } - - static class Builder { - final IndexOptions indexOptions; - final boolean hasPayloads; - final TermDataProviderAndCodec[] termDataProviderAndCodecs = - new TermDataProviderAndCodec[TermType.NUM_TOTAL_TYPES]; - - Builder(IndexOptions indexOptions, boolean hasPayloads) { - this.indexOptions = indexOptions; - this.hasPayloads = hasPayloads; - } - - void readOne( - TermType termType, DataInput metaIn, IndexInput termMetadataIn, IndexInput termDataIn) - throws IOException { - TermDataProvider termDataProvider = - TermDataProvider.deserializeOffHeap(metaIn, termMetadataIn, termDataIn); - TermStateCodec codec = TermStateCodecImpl.getCodec(termType, indexOptions, hasPayloads); - termDataProviderAndCodecs[termType.getId()] = - new TermDataProviderAndCodec(termDataProvider, codec); - } - - TermDataReader build() { - return new TermDataReader(termDataProviderAndCodecs); - } - } - - record TermDataProviderAndCodec(TermDataProvider termDataProvider, TermStateCodec codec) {} -} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java new file mode 100644 index 000000000000..3572cc90773e --- 
/dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.IndexInput; + +/** Factory class for {@link TermDataReader} which supports term lookup */ +record TermDataReaderProvider(TermDataProviderAndCodec[] termDataProviderAndCodecs) { + + TermDataReader newReader() throws IOException { + return new TermDataReader(); + } + + static class Builder { + final IndexOptions indexOptions; + final boolean hasPayloads; + final TermDataProviderAndCodec[] termDataProviderAndCodecs = + new TermDataProviderAndCodec[TermType.NUM_TOTAL_TYPES]; + + Builder(IndexOptions indexOptions, boolean hasPayloads) { + this.indexOptions = indexOptions; + this.hasPayloads = hasPayloads; + } + + void readOne( + TermType termType, DataInput metaIn, IndexInput termMetadataIn, IndexInput termDataIn) + throws IOException { + 
TermDataProvider termDataProvider = + TermDataProvider.deserializeOffHeap(metaIn, termMetadataIn, termDataIn); + TermStateCodec codec = TermStateCodecImpl.getCodec(termType, indexOptions, hasPayloads); + termDataProviderAndCodecs[termType.getId()] = + new TermDataProviderAndCodec(termDataProvider, codec); + } + + TermDataReaderProvider build() { + return new TermDataReaderProvider(termDataProviderAndCodecs); + } + } + + record TermDataProviderAndCodec(TermDataProvider termDataProvider, TermStateCodec codec) {} + + public class TermDataReader { + private final TermData[] termDataPerType; + + private final byte[][] metaDataBufferPerType; + + private final byte[][] dataBufferPerType; + + TermDataReader() throws IOException { + termDataPerType = new TermData[termDataProviderAndCodecs.length]; + metaDataBufferPerType = new byte[termDataProviderAndCodecs.length][]; + dataBufferPerType = new byte[termDataProviderAndCodecs.length][]; + + for (int i = 0; i < termDataProviderAndCodecs.length; i++) { + if (termDataProviderAndCodecs[i] == null) { + continue; + } + var codec = termDataProviderAndCodecs[i].codec; + TermDataProvider termDataProvider = termDataProviderAndCodecs[i].termDataProvider; + termDataPerType[i] = + new TermData( + termDataProvider.metadataProvider().newByteSlice(), + termDataProvider.dataProvider().newByteSlice()); + metaDataBufferPerType[i] = new byte[codec.getMetadataBytesLength()]; + dataBufferPerType[i] = new byte[codec.getMaximumRecordSizeInBytes()]; + } + } + + IntBlockTermState getTermState(TermType termType, long ord, IndexOptions indexOptions) + throws IOException { + assert termDataProviderAndCodecs[termType.getId()] != null; + assert termDataPerType[termType.getId()] != null; + + int typeId = termType.getId(); + var codec = termDataProviderAndCodecs[termType.getId()].codec; + IntBlockTermState termState = + termDataPerType[typeId].getTermStateWithBuffer( + codec, ord, metaDataBufferPerType[typeId], dataBufferPerType[typeId]); + + // need to 
filling some default values for the term state + // in order to meet the expectations of the postings reader + if (termType.hasSingletonDoc()) { + termState.docFreq = 1; + } + if (termType.hasSkipData() == false) { + termState.skipOffset = -1; + } + if (termType.hasLastPositionBlockOffset() == false) { + termState.lastPosBlockOffset = -1; + } + + /* There is interesting conventions to follow... + *
+       *     org.apache.lucene.index.CheckIndex$CheckIndexException:
+       *     field "id" hasFreqs is false, but TermsEnum.totalTermFreq()=0 (should be 1)
+       * 
+ */ + // for field that do not have freq enabled, as if each posting only has one occurrence. + if (indexOptions.ordinal() < IndexOptions.DOCS_AND_FREQS.ordinal()) { + termState.totalTermFreq = termState.docFreq; + } + + return termState; + } + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java index 283512c7ae6a..1ef79ab7f158 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java @@ -28,6 +28,9 @@ interface TermStateCodec { /** Get the number of bytes that the metadata per block needs. */ int getMetadataBytesLength(); + /** Get the maximum span of a record in terms of bytes */ + int getMaximumRecordSizeInBytes(); + /** Get the number of bits per data record within the block, based on the provided metadata. */ int getNumBitsPerRecord(BytesRef metadataBytes); diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java index d1a8392a37a9..319c14faafba 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java @@ -51,6 +51,13 @@ public TermStateCodecImpl(TermStateCodecComponent[] components) { this.metadataBytesLength = metadataBytesLength; } + @Override + public int getMaximumRecordSizeInBytes() { + // worst case: no compression at all, so each component taks 8 byte. + // two extra bytes when the record takes partial byte at the start and end. 
+ return components.length * 8 + 2; + } + @Override public int getMetadataBytesLength() { return metadataBytesLength; @@ -58,7 +65,18 @@ public int getMetadataBytesLength() { @Override public int getNumBitsPerRecord(BytesRef metadataBytes) { - return deserializedMetadata(metadataBytes).totalBitsPerTermState; + int upto = metadataBytes.offset; + int totalBitsPerTermState = 0; + + for (var component : components) { + byte bitWidth = metadataBytes.bytes[upto++]; + if (component.isMonotonicallyIncreasing()) { + upto += 8; + } + totalBitsPerTermState += bitWidth; + } + + return totalBitsPerTermState; } private static int getMetadataLength(TermStateCodecComponent component) { diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java index c521cba8a8ce..d3977e4d5252 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -129,25 +129,20 @@ final class RandomAccessTermsEnum extends TermsEnum { // We need to re-seek in next() calls to catch up to that term. 
private boolean needReSeekInNext; - private TermData[] perTypeTermData; + private final TermDataReaderProvider.TermDataReader termDataReader; RandomAccessTermsEnum() throws IOException { termState = (IntBlockTermState) lucene99PostingsReader.newTermState(); fstEnum = new BytesRefFSTEnum<>(termsDict.termsIndex().fst()); - perTypeTermData = termsDict.termDataReader().newPerTypeTermDataReference(); + termDataReader = termsDict.termDataReaderProvider().newReader(); } void updateTermStateIfNeeded() throws IOException { if (!isTermStateCurrent && !needReSeekInNext) { TermsIndex.TypeAndOrd typeAndOrd = TermsIndex.decodeLong(fstSeekState.output); termState = - termsDict - .termDataReader() - .getTermState( - typeAndOrd.termType(), - typeAndOrd.ord(), - fieldInfo.getIndexOptions(), - perTypeTermData); + termDataReader.getTermState( + typeAndOrd.termType(), typeAndOrd.ord(), fieldInfo.getIndexOptions()); isTermStateCurrent = true; } } diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java index b02d4de0cebf..226a4700813c 100644 --- a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java @@ -138,10 +138,9 @@ public boolean hasPayloads(int fieldNumber) { result.expectedTermAndState()[result.expectedTermAndState().length - 1].term, deserialized.termsStats().maxTerm()); - TermData[] perTypeTermData = deserialized.termDataReader().newPerTypeTermDataReference(); for (var x : result.expectedTermAndState()) { IntBlockTermState expectedState = x.state; - IntBlockTermState actualState = deserialized.getTermState(x.term, perTypeTermData); + IntBlockTermState actualState = deserialized.getTermState(x.term); if 
(expectedState.singletonDocID != -1) { assertEquals(expectedState.singletonDocID, actualState.singletonDocID); } else { From 46b46e64588b1e3a095db51bcc804816876f3c36 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Sat, 25 Nov 2023 10:39:50 -0800 Subject: [PATCH 42/57] Make decode TermState allocation-free --- .../randomaccess/TermStateCodecImpl.java | 52 ++++--------------- 1 file changed, 11 insertions(+), 41 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java index 319c14faafba..adef80cba696 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java @@ -32,8 +32,8 @@ import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.TotalTermFreq; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPacker; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpacker; -import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.util.BitUtil; import org.apache.lucene.util.BytesRef; final class TermStateCodecImpl implements TermStateCodec { @@ -205,10 +205,8 @@ public IntBlockTermState decodeWithinBlock( BytesRef metadataBytes, BytesRef dataBytes, BitUnpacker bitUnpacker, int index) { assert metadataBytes.length == this.metadataBytesLength; - var metadata = deserializedMetadata(metadataBytes); - - int startBitIndex = index * metadata.totalBitsPerTermState; - return extract(dataBytes, bitUnpacker, startBitIndex, metadata.metadataPerComponent); + int startBitIndex = index * getNumBitsPerRecord(metadataBytes); + return decodeAt(metadataBytes, dataBytes, bitUnpacker, startBitIndex); } @Override @@ 
-216,51 +214,23 @@ public IntBlockTermState decodeAt( BytesRef metadataBytes, BytesRef dataBytes, BitUnpacker bitUnpacker, int startBitIndex) { assert metadataBytes.length == this.metadataBytesLength; - var metadata = deserializedMetadata(metadataBytes); - return extract(dataBytes, bitUnpacker, startBitIndex, metadata.metadataPerComponent); - } + int upto = metadataBytes.offset; + IntBlockTermState decoded = new IntBlockTermState(); - private MetadataAndTotalBitsPerTermState deserializedMetadata(BytesRef metadataBytes) { - Metadata[] metadataPerComponent = new Metadata[components.length]; - ByteArrayDataInput byteArrayDataInput = - new ByteArrayDataInput(metadataBytes.bytes, metadataBytes.offset, metadataBytes.length); - int totalBitsPerTermState = 0; for (int i = 0; i < components.length; i++) { var component = components[i]; - byte bitWidth = byteArrayDataInput.readByte(); - long referenceValue = -1; + int bitWidth = metadataBytes.bytes[upto++]; + long val = bitUnpacker.unpack(dataBytes, startBitIndex, bitWidth); if (component.isMonotonicallyIncreasing()) { - referenceValue = byteArrayDataInput.readLong(); - } - metadataPerComponent[i] = new Metadata(bitWidth, referenceValue); - - totalBitsPerTermState += bitWidth; - } - - return new MetadataAndTotalBitsPerTermState(metadataPerComponent, totalBitsPerTermState); - } - - private IntBlockTermState extract( - BytesRef dataBytes, - BitUnpacker bitUnpacker, - int startBitIndex, - Metadata[] metadataPerComponent) { - IntBlockTermState decoded = new IntBlockTermState(); - for (int i = 0; i < components.length; i++) { - var component = components[i]; - var metadata = metadataPerComponent[i]; - long val = bitUnpacker.unpack(dataBytes, startBitIndex, metadata.bitWidth); - if (metadata.referenceValue > 0) { - val += metadata.referenceValue; + val += (long) BitUtil.VH_LE_LONG.get(metadataBytes.bytes, upto); + upto += 8; } component.setTargetValue(decoded, val); - startBitIndex += metadata.bitWidth; + startBitIndex += 
bitWidth; } + return decoded; } private record Metadata(byte bitWidth, long referenceValue) {} - - private record MetadataAndTotalBitsPerTermState( - Metadata[] metadataPerComponent, int totalBitsPerTermState) {} } From 2c875e7710df1c5a2db1d9da3f8e9f90266ac429 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Sat, 25 Nov 2023 17:23:13 -0800 Subject: [PATCH 43/57] Use ThreadLocal to reuse TermDataReader data objects --- .../randomaccess/TermDataReaderProvider.java | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java index 3572cc90773e..a65e9b1304c5 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java @@ -24,10 +24,25 @@ import org.apache.lucene.store.IndexInput; /** Factory class for {@link TermDataReader} which supports term lookup */ -record TermDataReaderProvider(TermDataProviderAndCodec[] termDataProviderAndCodecs) { +final class TermDataReaderProvider { + private final TermDataProviderAndCodec[] termDataProviderAndCodecs; + + /** TermDataReader can be reused by the same thread */ + private final ThreadLocal termDataReaderReuse; + + TermDataReaderProvider(TermDataProviderAndCodec[] termDataProviderAndCodecs) { + this.termDataProviderAndCodecs = termDataProviderAndCodecs; + termDataReaderReuse = new ThreadLocal<>(); + } TermDataReader newReader() throws IOException { - return new TermDataReader(); + var existingReader = termDataReaderReuse.get(); + if (existingReader != null) { + return existingReader; + } + var newReader = new TermDataReader(); + termDataReaderReuse.set(newReader); + return newReader; } static class Builder { From 
6a71a8124ad8aa8709467c134dbb12bf79343b76 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Sun, 26 Nov 2023 10:37:08 -0800 Subject: [PATCH 44/57] Forked FST.java to work with primtive long Try it with TermsIndexPrimitive and verify basic functionality --- .../randomaccess/TermsIndexPrimitive.java | 56 + .../randomaccess/TestTermsIndexBuilder.java | 16 +- .../lucene/util/fst/PrimitiveLongFST.java | 1329 +++++++++++++++++ 3 files changed, 1400 insertions(+), 1 deletion(-) create mode 100644 lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexPrimitive.java create mode 100644 lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFST.java diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexPrimitive.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexPrimitive.java new file mode 100644 index 000000000000..95e307d786d1 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexPrimitive.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import static org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermsIndex.decodeLong; + +import java.io.IOException; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.fst.OffHeapFSTStore; +import org.apache.lucene.util.fst.PrimitiveLongFST; + +record TermsIndexPrimitive(PrimitiveLongFST primitiveLongFST) { + + TermsIndex.TypeAndOrd getTerm(BytesRef term) throws IOException { + long encoded = PrimitiveLongFST.get(primitiveLongFST, term); + return decodeLong(encoded); + } + + static TermsIndexPrimitive deserialize(DataInput metaIn, DataInput dataIn, boolean loadOffHeap) + throws IOException { + PrimitiveLongFST fst; + if (loadOffHeap) { + var fstStore = new OffHeapFSTStore(); + fst = + new PrimitiveLongFST( + PrimitiveLongFST.readMetadata( + metaIn, PrimitiveLongFST.PrimitiveLongFSTOutputs.getSingleton()), + dataIn.clone(), + fstStore); + dataIn.skipBytes(fstStore.size()); + } else { + fst = + new PrimitiveLongFST( + PrimitiveLongFST.readMetadata( + metaIn, PrimitiveLongFST.PrimitiveLongFSTOutputs.getSingleton()), + dataIn); + } + return new TermsIndexPrimitive(fst); + } +} diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java index 7179c23d1d7e..9528dcd69b0d 100644 --- a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java @@ -20,6 +20,9 @@ import java.io.IOException; import java.util.HashMap; import java.util.Map; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.DataOutput; import 
org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; @@ -49,9 +52,20 @@ public void testBasics() throws IOException { } TermsIndex termsIndex = builder.build(); + byte[] metaBytes = new byte[4096]; + byte[] dataBytes = new byte[4096]; + DataOutput metaOut = new ByteArrayDataOutput(metaBytes); + DataOutput dataOutput = new ByteArrayDataOutput(dataBytes); + + termsIndex.serialize(metaOut, dataOutput); + + TermsIndexPrimitive termsIndexPrimitive = + TermsIndexPrimitive.deserialize( + new ByteArrayDataInput(metaBytes), new ByteArrayDataInput(dataBytes), false); + for (String term : test_terms) { BytesRef termBytes = new BytesRef(term); - TermsIndex.TypeAndOrd typeAndOrd = termsIndex.getTerm(termBytes); + TermsIndex.TypeAndOrd typeAndOrd = termsIndexPrimitive.getTerm(termBytes); assertEquals(termsToType.get(term).intValue(), typeAndOrd.termType().getId()); assertEquals((long) termsToOrd.get(term), typeAndOrd.ord()); diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFST.java b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFST.java new file mode 100644 index 000000000000..c4a188fc58e6 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFST.java @@ -0,0 +1,1329 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util.fst; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.store.ByteBuffersDataOutput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.InputStreamDataInput; +import org.apache.lucene.store.OutputStreamDataOutput; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.Constants; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.fst.FST.BytesReader; +import org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongArc.BitTable; + +/** + * HACK! + * + *

A copy of {@link FST} but remove generics to work with primitive types and avoid + * boxing-unboxing. + * + * @lucene.experimental + */ +public final class PrimitiveLongFST implements Accountable { + + final PrimitiveLongFSTMetadata metadata; + + /** Specifies allowed range of each int input label for this FST. */ + public enum INPUT_TYPE { + BYTE1, + BYTE2, + BYTE4 + } + + private static final long BASE_RAM_BYTES_USED = + RamUsageEstimator.shallowSizeOfInstance(PrimitiveLongFST.class); + + static final int BIT_FINAL_ARC = 1 << 0; + static final int BIT_LAST_ARC = 1 << 1; + static final int BIT_TARGET_NEXT = 1 << 2; + + // TODO: we can free up a bit if we can nuke this: + static final int BIT_STOP_NODE = 1 << 3; + + /** This flag is set if the arc has an output. */ + public static final int BIT_ARC_HAS_OUTPUT = 1 << 4; + + static final int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5; + + /** + * Value of the arc flags to declare a node with fixed length (sparse) arcs designed for binary + * search. + */ + // We use this as a marker because this one flag is illegal by itself. + public static final byte ARCS_FOR_BINARY_SEARCH = BIT_ARC_HAS_FINAL_OUTPUT; + + /** + * Value of the arc flags to declare a node with fixed length dense arcs and bit table designed + * for direct addressing. + */ + static final byte ARCS_FOR_DIRECT_ADDRESSING = 1 << 6; + + /** + * Value of the arc flags to declare a node with continuous arcs designed for pos the arc directly + * with labelToPos - firstLabel. like {@link #ARCS_FOR_BINARY_SEARCH} we use flag combinations + * that will not occur at the same time. 
+ */ + static final byte ARCS_FOR_CONTINUOUS = ARCS_FOR_DIRECT_ADDRESSING + ARCS_FOR_BINARY_SEARCH; + + // Increment version to change it + private static final String FILE_FORMAT_NAME = "FST"; + private static final int VERSION_START = 6; + private static final int VERSION_LITTLE_ENDIAN = 8; + private static final int VERSION_CONTINUOUS_ARCS = 9; + static final int VERSION_CURRENT = VERSION_CONTINUOUS_ARCS; + + // Never serialized; just used to represent the virtual + // final node w/ no arcs: + static final long FINAL_END_NODE = -1; + + // Never serialized; just used to represent the virtual + // non-final node w/ no arcs: + static final long NON_FINAL_END_NODE = 0; + + /** If arc has this label then that arc is final/accepted */ + public static final int END_LABEL = -1; + + /** + * A {@link BytesStore}, used during building, or during reading when the FST is very large (more + * than 1 GB). If the FST is less than 1 GB then bytesArray is set instead. + */ + private final FSTReader fstReader; + + public final PrimitiveLongFSTOutputs outputs; + + /** Represents a single arc. */ + public static final class PrimitiveLongArc { + + // *** Arc fields. + + private int label; + + private long output; + + private long target; + + private byte flags; + + private long nextFinalOutput; + + private long nextArc; + + private byte nodeFlags; + + // *** Fields for arcs belonging to a node with fixed length arcs. + // So only valid when bytesPerArc != 0. + // nodeFlags == ARCS_FOR_BINARY_SEARCH || nodeFlags == ARCS_FOR_DIRECT_ADDRESSING. + + private int bytesPerArc; + + private long posArcsStart; + + private int arcIdx; + + private int numArcs; + + // *** Fields for a direct addressing node. nodeFlags == ARCS_FOR_DIRECT_ADDRESSING. + + /** + * Start position in the {@link BytesReader} of the presence bits for a direct addressing node, + * aka the bit-table + */ + private long bitTableStart; + + /** First label of a direct addressing node. 
*/ + private int firstLabel; + + /** + * Index of the current label of a direct addressing node. While {@link #arcIdx} is the current + * index in the label range, {@link #presenceIndex} is its corresponding index in the list of + * actually present labels. It is equal to the number of bits set before the bit at {@link + * #arcIdx} in the bit-table. This field is a cache to avoid to count bits set repeatedly when + * iterating the next arcs. + */ + private int presenceIndex; + + /** Returns this */ + public PrimitiveLongArc copyFrom(PrimitiveLongArc other) { + label = other.label(); + target = other.target(); + flags = other.flags(); + output = other.output(); + nextFinalOutput = other.nextFinalOutput(); + nextArc = other.nextArc(); + nodeFlags = other.nodeFlags(); + bytesPerArc = other.bytesPerArc(); + + // Fields for arcs belonging to a node with fixed length arcs. + // We could avoid copying them if bytesPerArc() == 0 (this was the case with previous code, + // and the current code + // still supports that), but it may actually help external uses of FST to have consistent arc + // state, and debugging + // is easier. 
+ posArcsStart = other.posArcsStart(); + arcIdx = other.arcIdx(); + numArcs = other.numArcs(); + bitTableStart = other.bitTableStart; + firstLabel = other.firstLabel(); + presenceIndex = other.presenceIndex; + + return this; + } + + boolean flag(int flag) { + return PrimitiveLongFST.flag(flags, flag); + } + + public boolean isLast() { + return flag(BIT_LAST_ARC); + } + + public boolean isFinal() { + return flag(BIT_FINAL_ARC); + } + + @Override + public String toString() { + StringBuilder b = new StringBuilder(); + b.append(" target=").append(target()); + b.append(" label=0x").append(Integer.toHexString(label())); + if (flag(BIT_FINAL_ARC)) { + b.append(" final"); + } + if (flag(BIT_LAST_ARC)) { + b.append(" last"); + } + if (flag(BIT_TARGET_NEXT)) { + b.append(" targetNext"); + } + if (flag(BIT_STOP_NODE)) { + b.append(" stop"); + } + if (flag(BIT_ARC_HAS_OUTPUT)) { + b.append(" output=").append(output()); + } + if (flag(BIT_ARC_HAS_FINAL_OUTPUT)) { + b.append(" nextFinalOutput=").append(nextFinalOutput()); + } + if (bytesPerArc() != 0) { + b.append(" arcArray(idx=") + .append(arcIdx()) + .append(" of ") + .append(numArcs()) + .append(")") + .append("(") + .append( + nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING + ? "da" + : nodeFlags() == ARCS_FOR_CONTINUOUS ? "cs" : "bs") + .append(")"); + } + return b.toString(); + } + + public int label() { + return label; + } + + public long output() { + return output; + } + + /** Ord/address to target node. */ + public long target() { + return target; + } + + public byte flags() { + return flags; + } + + public long nextFinalOutput() { + return nextFinalOutput; + } + + /** + * Address (into the byte[]) of the next arc - only for list of variable length arc. Or + * ord/address to the next node if label == {@link #END_LABEL}. + */ + long nextArc() { + return nextArc; + } + + /** Where we are in the array; only valid if bytesPerArc != 0. */ + public int arcIdx() { + return arcIdx; + } + + /** + * Node header flags. 
Only meaningful to check if the value is either {@link + * #ARCS_FOR_BINARY_SEARCH} or {@link #ARCS_FOR_DIRECT_ADDRESSING} or {@link + * #ARCS_FOR_CONTINUOUS} (other value when bytesPerArc == 0). + */ + public byte nodeFlags() { + return nodeFlags; + } + + /** Where the first arc in the array starts; only valid if bytesPerArc != 0 */ + public long posArcsStart() { + return posArcsStart; + } + + /** + * Non-zero if this arc is part of a node with fixed length arcs, which means all arcs for the + * node are encoded with a fixed number of bytes so that we binary search or direct address. We + * do when there are enough arcs leaving one node. It wastes some bytes but gives faster + * lookups. + */ + public int bytesPerArc() { + return bytesPerArc; + } + + /** + * How many arcs; only valid if bytesPerArc != 0 (fixed length arcs). For a node designed for + * binary search this is the array size. For a node designed for direct addressing, this is the + * label range. + */ + public int numArcs() { + return numArcs; + } + + /** + * First label of a direct addressing node. Only valid if nodeFlags == {@link + * #ARCS_FOR_DIRECT_ADDRESSING} or {@link #ARCS_FOR_CONTINUOUS}. + */ + int firstLabel() { + return firstLabel; + } + + /** + * Helper methods to read the bit-table of a direct addressing node. Only valid for {@link + * PrimitiveLongArc} with {@link PrimitiveLongArc#nodeFlags()} == {@code + * ARCS_FOR_DIRECT_ADDRESSING}. + */ + static class BitTable { + + /** See {@link BitTableUtil#isBitSet(int, BytesReader)}. */ + static boolean isBitSet(int bitIndex, PrimitiveLongArc arc, BytesReader in) + throws IOException { + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + in.setPosition(arc.bitTableStart); + return BitTableUtil.isBitSet(bitIndex, in); + } + + /** + * See {@link BitTableUtil#countBits(int, BytesReader)}. The count of bit set is the number of + * arcs of a direct addressing node. 
+ */ + static int countBits(PrimitiveLongArc arc, BytesReader in) throws IOException { + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + in.setPosition(arc.bitTableStart); + return BitTableUtil.countBits(getNumPresenceBytes(arc.numArcs()), in); + } + + /** See {@link BitTableUtil#countBitsUpTo(int, BytesReader)}. */ + static int countBitsUpTo(int bitIndex, PrimitiveLongArc arc, BytesReader in) + throws IOException { + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + in.setPosition(arc.bitTableStart); + return BitTableUtil.countBitsUpTo(bitIndex, in); + } + + /** See {@link BitTableUtil#nextBitSet(int, int, BytesReader)}. */ + static int nextBitSet(int bitIndex, PrimitiveLongArc arc, BytesReader in) throws IOException { + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + in.setPosition(arc.bitTableStart); + return BitTableUtil.nextBitSet(bitIndex, getNumPresenceBytes(arc.numArcs()), in); + } + + /** See {@link BitTableUtil#previousBitSet(int, BytesReader)}. */ + static int previousBitSet(int bitIndex, PrimitiveLongArc arc, BytesReader in) + throws IOException { + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + in.setPosition(arc.bitTableStart); + return BitTableUtil.previousBitSet(bitIndex, in); + } + + /** Asserts the bit-table of the provided {@link PrimitiveLongArc} is valid. */ + static boolean assertIsValid(PrimitiveLongArc arc, BytesReader in) throws IOException { + assert arc.bytesPerArc() > 0; + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + // First bit must be set. + assert isBitSet(0, arc, in); + // Last bit must be set. + assert isBitSet(arc.numArcs() - 1, arc, in); + // No bit set after the last arc. + assert nextBitSet(arc.numArcs() - 1, arc, in) == -1; + return true; + } + } + } + + private static boolean flag(int flags, int bit) { + return (flags & bit) != 0; + } + + private static final int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 
30 : 28; + + /** + * Load a previously saved FST with a DataInput for metdata using an {@link OnHeapFSTStore} with + * maxBlockBits set to {@link #DEFAULT_MAX_BLOCK_BITS} + */ + public PrimitiveLongFST(PrimitiveLongFSTMetadata metadata, DataInput in) throws IOException { + this(metadata, in, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS)); + } + + /** + * Load a previously saved FST with a metdata object and a FSTStore. If using {@link + * OnHeapFSTStore}, setting maxBlockBits allows you to control the size of the byte[] pages used + * to hold the FST bytes. + */ + public PrimitiveLongFST(PrimitiveLongFSTMetadata metadata, DataInput in, FSTStore fstStore) + throws IOException { + this(metadata, fstStore.init(in, metadata.numBytes)); + } + + /** Create the FST with a metadata object and a FSTReader. */ + PrimitiveLongFST(PrimitiveLongFSTMetadata metadata, FSTReader fstReader) { + this.metadata = metadata; + this.outputs = metadata.outputs; + this.fstReader = fstReader; + } + + /** + * Read the FST metadata from DataInput + * + * @param metaIn the DataInput of the metadata + * @param outputs the FST outputs + * @return the FST metadata + * @throws IOException if exception occurred during parsing + */ + public static PrimitiveLongFSTMetadata readMetadata( + DataInput metaIn, PrimitiveLongFSTOutputs outputs) throws IOException { + // NOTE: only reads formats VERSION_START up to VERSION_CURRENT; we don't have + // back-compat promise for FSTs (they are experimental), but we are sometimes able to offer it + int version = CodecUtil.checkHeader(metaIn, FILE_FORMAT_NAME, VERSION_START, VERSION_CURRENT); + Long emptyOutput; + if (metaIn.readByte() == 1) { + // accepts empty string + // 1 KB blocks: + BytesStore emptyBytes = new BytesStore(10); + int numBytes = metaIn.readVInt(); + emptyBytes.copyBytes(metaIn, numBytes); + + // De-serialize empty-string output: + BytesReader reader = emptyBytes.getReverseBytesReader(); + // NoOutputs uses 0 bytes when writing its output, + // so 
we have to check here else BytesStore gets + // angry: + if (numBytes > 0) { + reader.setPosition(numBytes - 1); + } + emptyOutput = outputs.readFinalOutput(reader); + } else { + emptyOutput = null; + } + INPUT_TYPE inputType; + final byte t = metaIn.readByte(); + switch (t) { + case 0: + inputType = INPUT_TYPE.BYTE1; + break; + case 1: + inputType = INPUT_TYPE.BYTE2; + break; + case 2: + inputType = INPUT_TYPE.BYTE4; + break; + default: + throw new CorruptIndexException("invalid input type " + t, metaIn); + } + long startNode = metaIn.readVLong(); + long numBytes = metaIn.readVLong(); + return new PrimitiveLongFSTMetadata( + inputType, outputs, emptyOutput, startNode, version, numBytes); + } + + @Override + public long ramBytesUsed() { + return BASE_RAM_BYTES_USED + fstReader.ramBytesUsed(); + } + + @Override + public String toString() { + return getClass().getSimpleName() + "(input=" + metadata.inputType + ",output=" + outputs; + } + + public long numBytes() { + return metadata.numBytes; + } + + public long getEmptyOutput() { + return metadata.emptyOutput.longValue(); + } + + public PrimitiveLongFSTMetadata getMetadata() { + return metadata; + } + + public void save(DataOutput metaOut, DataOutput out) throws IOException { + saveMetadata(metaOut); + fstReader.writeTo(out); + } + + /** + * Save the metadata to a DataOutput + * + * @param metaOut the DataOutput to save + */ + public void saveMetadata(DataOutput metaOut) throws IOException { + CodecUtil.writeHeader(metaOut, FILE_FORMAT_NAME, VERSION_CURRENT); + + // Accepts empty string + metaOut.writeByte((byte) 1); + + if (metadata.emptyOutput != null) { + // Serialize empty-string output: + ByteBuffersDataOutput ros = new ByteBuffersDataOutput(); + outputs.writeFinalOutput(metadata.emptyOutput.longValue(), ros); + byte[] emptyOutputBytes = ros.toArrayCopy(); + int emptyLen = emptyOutputBytes.length; + + // reverse + final int stopAt = emptyLen / 2; + int upto = 0; + while (upto < stopAt) { + final byte b = 
emptyOutputBytes[upto]; + emptyOutputBytes[upto] = emptyOutputBytes[emptyLen - upto - 1]; + emptyOutputBytes[emptyLen - upto - 1] = b; + upto++; + } + metaOut.writeVInt(emptyLen); + metaOut.writeBytes(emptyOutputBytes, 0, emptyLen); + } else { + metaOut.writeByte((byte) 0); + } + + final byte t; + if (metadata.inputType == INPUT_TYPE.BYTE1) { + t = 0; + } else if (metadata.inputType == INPUT_TYPE.BYTE2) { + t = 1; + } else { + t = 2; + } + metaOut.writeByte(t); + metaOut.writeVLong(metadata.startNode); + metaOut.writeVLong(numBytes()); + } + + /** Writes an automaton to a file. */ + public void save(final Path path) throws IOException { + try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(path))) { + DataOutput out = new OutputStreamDataOutput(os); + save(out, out); + } + } + + /** Reads an automaton from a file. */ + public static PrimitiveLongFST read(Path path, PrimitiveLongFSTOutputs outputs) + throws IOException { + try (InputStream is = Files.newInputStream(path)) { + DataInput in = new InputStreamDataInput(new BufferedInputStream(is)); + return new PrimitiveLongFST(readMetadata(in, outputs), in); + } + } + + /** Reads one BYTE1/2/4 label from the provided {@link DataInput}. */ + public int readLabel(DataInput in) throws IOException { + final int v; + if (metadata.inputType == INPUT_TYPE.BYTE1) { + // Unsigned byte: + v = in.readByte() & 0xFF; + } else if (metadata.inputType == INPUT_TYPE.BYTE2) { + // Unsigned short: + if (metadata.version < VERSION_LITTLE_ENDIAN) { + v = Short.reverseBytes(in.readShort()) & 0xFFFF; + } else { + v = in.readShort() & 0xFFFF; + } + } else { + v = in.readVInt(); + } + return v; + } + + /** returns true if the node at this address has any outgoing arcs */ + public static boolean targetHasArcs(PrimitiveLongArc arc) { + return arc.target() > 0; + } + + /** + * Gets the number of bytes required to flag the presence of each arc in the given label range, + * one bit per arc. 
+ */ + static int getNumPresenceBytes(int labelRange) { + assert labelRange >= 0; + return (labelRange + 7) >> 3; + } + + /** + * Reads the presence bits of a direct-addressing node. Actually we don't read them here, we just + * keep the pointer to the bit-table start and we skip them. + */ + private void readPresenceBytes(PrimitiveLongArc arc, BytesReader in) throws IOException { + assert arc.bytesPerArc() > 0; + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + arc.bitTableStart = in.getPosition(); + in.skipBytes(getNumPresenceBytes(arc.numArcs())); + } + + /** Fills virtual 'start' arc, ie, an empty incoming arc to the FST's start node */ + public PrimitiveLongArc getFirstArc(PrimitiveLongArc arc) { + long NO_OUTPUT = outputs.getNoOutput(); + + arc.flags = BIT_FINAL_ARC | BIT_LAST_ARC; + if (metadata.emptyOutput != null) { + arc.nextFinalOutput = metadata.emptyOutput.longValue(); + } + if (metadata.emptyOutput != null && metadata.emptyOutput.longValue() != NO_OUTPUT) { + arc.flags = (byte) (arc.flags() | BIT_ARC_HAS_FINAL_OUTPUT); + } + + arc.output = NO_OUTPUT; + + // If there are no nodes, ie, the FST only accepts the + // empty string, then startNode is 0 + arc.target = metadata.startNode; + return arc; + } + + /** + * Follows the follow arc and reads the last arc of its target; this changes the + * provided arc (2nd arg) in-place and returns it. + * + * @return Returns the second argument (arc). 
+ */ + PrimitiveLongArc readLastTargetArc(PrimitiveLongArc follow, PrimitiveLongArc arc, BytesReader in) + throws IOException { + // System.out.println("readLast"); + if (!targetHasArcs(follow)) { + // System.out.println(" end node"); + assert follow.isFinal(); + arc.label = END_LABEL; + arc.target = FINAL_END_NODE; + arc.output = follow.nextFinalOutput(); + arc.flags = BIT_LAST_ARC; + arc.nodeFlags = arc.flags; + return arc; + } else { + in.setPosition(follow.target()); + byte flags = arc.nodeFlags = in.readByte(); + if (flags == ARCS_FOR_BINARY_SEARCH + || flags == ARCS_FOR_DIRECT_ADDRESSING + || flags == ARCS_FOR_CONTINUOUS) { + // Special arc which is actually a node header for fixed length arcs. + // Jump straight to end to find the last arc. + arc.numArcs = in.readVInt(); + arc.bytesPerArc = in.readVInt(); + // System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc); + if (flags == ARCS_FOR_DIRECT_ADDRESSING) { + readPresenceBytes(arc, in); + arc.firstLabel = readLabel(in); + arc.posArcsStart = in.getPosition(); + readLastArcByDirectAddressing(arc, in); + } else if (flags == ARCS_FOR_BINARY_SEARCH) { + arc.arcIdx = arc.numArcs() - 2; + arc.posArcsStart = in.getPosition(); + readNextRealArc(arc, in); + } else { + arc.firstLabel = readLabel(in); + arc.posArcsStart = in.getPosition(); + readLastArcByContinuous(arc, in); + } + } else { + arc.flags = flags; + // non-array: linear scan + arc.bytesPerArc = 0; + // System.out.println(" scan"); + while (!arc.isLast()) { + // skip this arc: + readLabel(in); + if (arc.flag(BIT_ARC_HAS_OUTPUT)) { + outputs.skipOutput(in); + } + if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { + outputs.skipFinalOutput(in); + } + if (arc.flag(BIT_STOP_NODE)) { + } else if (arc.flag(BIT_TARGET_NEXT)) { + } else { + readUnpackedNodeTarget(in); + } + arc.flags = in.readByte(); + } + // Undo the byte flags we read: + in.skipBytes(-1); + arc.nextArc = in.getPosition(); + readNextRealArc(arc, in); + } + assert arc.isLast(); + 
return arc; + } + } + + private long readUnpackedNodeTarget(BytesReader in) throws IOException { + return in.readVLong(); + } + + /** + * Follow the follow arc and read the first arc of its target; this changes the + * provided arc (2nd arg) in-place and returns it. + * + * @return Returns the second argument (arc). + */ + public PrimitiveLongArc readFirstTargetArc( + PrimitiveLongArc follow, PrimitiveLongArc arc, BytesReader in) throws IOException { + // int pos = address; + // System.out.println(" readFirstTarget follow.target=" + follow.target + " isFinal=" + + // follow.isFinal()); + if (follow.isFinal()) { + // Insert "fake" final first arc: + arc.label = END_LABEL; + arc.output = follow.nextFinalOutput(); + arc.flags = BIT_FINAL_ARC; + if (follow.target() <= 0) { + arc.flags |= BIT_LAST_ARC; + } else { + // NOTE: nextArc is a node (not an address!) in this case: + arc.nextArc = follow.target(); + } + arc.target = FINAL_END_NODE; + arc.nodeFlags = arc.flags; + // System.out.println(" insert isFinal; nextArc=" + follow.target + " isLast=" + + // arc.isLast() + " output=" + outputs.outputToString(arc.output)); + return arc; + } else { + return readFirstRealTargetArc(follow.target(), arc, in); + } + } + + private void readFirstArcInfo(long nodeAddress, PrimitiveLongArc arc, final BytesReader in) + throws IOException { + in.setPosition(nodeAddress); + + byte flags = arc.nodeFlags = in.readByte(); + if (flags == ARCS_FOR_BINARY_SEARCH + || flags == ARCS_FOR_DIRECT_ADDRESSING + || flags == ARCS_FOR_CONTINUOUS) { + // Special arc which is actually a node header for fixed length arcs. 
+ arc.numArcs = in.readVInt(); + arc.bytesPerArc = in.readVInt(); + arc.arcIdx = -1; + if (flags == ARCS_FOR_DIRECT_ADDRESSING) { + readPresenceBytes(arc, in); + arc.firstLabel = readLabel(in); + arc.presenceIndex = -1; + } else if (flags == ARCS_FOR_CONTINUOUS) { + arc.firstLabel = readLabel(in); + } + arc.posArcsStart = in.getPosition(); + } else { + arc.nextArc = nodeAddress; + arc.bytesPerArc = 0; + } + } + + public PrimitiveLongArc readFirstRealTargetArc( + long nodeAddress, PrimitiveLongArc arc, final BytesReader in) throws IOException { + readFirstArcInfo(nodeAddress, arc, in); + return readNextRealArc(arc, in); + } + + /** + * Returns whether arc's target points to a node in expanded format (fixed length + * arcs). + */ + boolean isExpandedTarget(PrimitiveLongArc follow, BytesReader in) throws IOException { + if (!targetHasArcs(follow)) { + return false; + } else { + in.setPosition(follow.target()); + byte flags = in.readByte(); + return flags == ARCS_FOR_BINARY_SEARCH + || flags == ARCS_FOR_DIRECT_ADDRESSING + || flags == ARCS_FOR_CONTINUOUS; + } + } + + /** In-place read; returns the arc. */ + public PrimitiveLongArc readNextArc(PrimitiveLongArc arc, BytesReader in) throws IOException { + if (arc.label() == END_LABEL) { + // This was a fake inserted "final" arc + if (arc.nextArc() <= 0) { + throw new IllegalArgumentException("cannot readNextArc when arc.isLast()=true"); + } + return readFirstRealTargetArc(arc.nextArc(), arc, in); + } else { + return readNextRealArc(arc, in); + } + } + + /** Peeks at next arc's label; does not alter arc. Do not call this if arc.isLast()! */ + int readNextArcLabel(PrimitiveLongArc arc, BytesReader in) throws IOException { + assert !arc.isLast(); + + if (arc.label() == END_LABEL) { + // System.out.println(" nextArc fake " + arc.nextArc); + // Next arc is the first arc of a node. + // Position to read the first arc label. 
+ + in.setPosition(arc.nextArc()); + byte flags = in.readByte(); + if (flags == ARCS_FOR_BINARY_SEARCH + || flags == ARCS_FOR_DIRECT_ADDRESSING + || flags == ARCS_FOR_CONTINUOUS) { + // System.out.println(" nextArc fixed length arc"); + // Special arc which is actually a node header for fixed length arcs. + int numArcs = in.readVInt(); + in.readVInt(); // Skip bytesPerArc. + if (flags == ARCS_FOR_BINARY_SEARCH) { + in.readByte(); // Skip arc flags. + } else if (flags == ARCS_FOR_DIRECT_ADDRESSING) { + in.skipBytes(getNumPresenceBytes(numArcs)); + } // Nothing to do for ARCS_FOR_CONTINUOUS + } + } else { + switch (arc.nodeFlags()) { + case ARCS_FOR_BINARY_SEARCH: + // Point to next arc, -1 to skip arc flags. + in.setPosition(arc.posArcsStart() - (1 + arc.arcIdx()) * (long) arc.bytesPerArc() - 1); + break; + case ARCS_FOR_DIRECT_ADDRESSING: + // Direct addressing node. The label is not stored but rather inferred + // based on first label and arc index in the range. + assert BitTable.assertIsValid(arc, in); + assert BitTable.isBitSet(arc.arcIdx(), arc, in); + int nextIndex = BitTable.nextBitSet(arc.arcIdx(), arc, in); + assert nextIndex != -1; + return arc.firstLabel() + nextIndex; + case ARCS_FOR_CONTINUOUS: + return arc.firstLabel() + arc.arcIdx() + 1; + default: + // Variable length arcs - linear search. + assert arc.bytesPerArc() == 0; + // Arcs have variable length. + // System.out.println(" nextArc real list"); + // Position to next arc, -1 to skip flags. 
+ in.setPosition(arc.nextArc() - 1); + break; + } + } + return readLabel(in); + } + + public PrimitiveLongArc readArcByIndex(PrimitiveLongArc arc, final BytesReader in, int idx) + throws IOException { + assert arc.bytesPerArc() > 0; + assert arc.nodeFlags() == ARCS_FOR_BINARY_SEARCH; + assert idx >= 0 && idx < arc.numArcs(); + in.setPosition(arc.posArcsStart() - idx * (long) arc.bytesPerArc()); + arc.arcIdx = idx; + arc.flags = in.readByte(); + return readArc(arc, in); + } + + /** + * Reads a Continuous node arc, with the provided index in the label range. + * + * @param rangeIndex The index of the arc in the label range. It must be within the label range. + */ + public PrimitiveLongArc readArcByContinuous( + PrimitiveLongArc arc, final BytesReader in, int rangeIndex) throws IOException { + assert rangeIndex >= 0 && rangeIndex < arc.numArcs(); + in.setPosition(arc.posArcsStart() - rangeIndex * (long) arc.bytesPerArc()); + arc.arcIdx = rangeIndex; + arc.flags = in.readByte(); + return readArc(arc, in); + } + + /** + * Reads a present direct addressing node arc, with the provided index in the label range. + * + * @param rangeIndex The index of the arc in the label range. It must be present. The real arc + * offset is computed based on the presence bits of the direct addressing node. + */ + public PrimitiveLongArc readArcByDirectAddressing( + PrimitiveLongArc arc, final BytesReader in, int rangeIndex) throws IOException { + assert BitTable.assertIsValid(arc, in); + assert rangeIndex >= 0 && rangeIndex < arc.numArcs(); + assert BitTable.isBitSet(rangeIndex, arc, in); + int presenceIndex = BitTable.countBitsUpTo(rangeIndex, arc, in); + return readArcByDirectAddressing(arc, in, rangeIndex, presenceIndex); + } + + /** + * Reads a present direct addressing node arc, with the provided index in the label range and its + * corresponding presence index (which is the count of presence bits before it). 
+ */ + private PrimitiveLongArc readArcByDirectAddressing( + PrimitiveLongArc arc, final BytesReader in, int rangeIndex, int presenceIndex) + throws IOException { + in.setPosition(arc.posArcsStart() - presenceIndex * (long) arc.bytesPerArc()); + arc.arcIdx = rangeIndex; + arc.presenceIndex = presenceIndex; + arc.flags = in.readByte(); + return readArc(arc, in); + } + + /** + * Reads the last arc of a direct addressing node. This method is equivalent to call {@link + * #readArcByDirectAddressing(PrimitiveLongArc, BytesReader, int)} with {@code rangeIndex} equal + * to {@code arc.numArcs() - 1}, but it is faster. + */ + public PrimitiveLongArc readLastArcByDirectAddressing(PrimitiveLongArc arc, final BytesReader in) + throws IOException { + assert BitTable.assertIsValid(arc, in); + int presenceIndex = BitTable.countBits(arc, in) - 1; + return readArcByDirectAddressing(arc, in, arc.numArcs() - 1, presenceIndex); + } + + /** Reads the last arc of a continuous node. */ + public PrimitiveLongArc readLastArcByContinuous(PrimitiveLongArc arc, final BytesReader in) + throws IOException { + return readArcByContinuous(arc, in, arc.numArcs() - 1); + } + + /** Never returns null, but you should never call this if arc.isLast() is true. 
*/ + public PrimitiveLongArc readNextRealArc(PrimitiveLongArc arc, final BytesReader in) + throws IOException { + + // TODO: can't assert this because we call from readFirstArc + // assert !flag(arc.flags, BIT_LAST_ARC); + + switch (arc.nodeFlags()) { + case ARCS_FOR_BINARY_SEARCH: + case ARCS_FOR_CONTINUOUS: + assert arc.bytesPerArc() > 0; + arc.arcIdx++; + assert arc.arcIdx() >= 0 && arc.arcIdx() < arc.numArcs(); + in.setPosition(arc.posArcsStart() - arc.arcIdx() * (long) arc.bytesPerArc()); + arc.flags = in.readByte(); + break; + + case ARCS_FOR_DIRECT_ADDRESSING: + assert BitTable.assertIsValid(arc, in); + assert arc.arcIdx() == -1 || BitTable.isBitSet(arc.arcIdx(), arc, in); + int nextIndex = BitTable.nextBitSet(arc.arcIdx(), arc, in); + return readArcByDirectAddressing(arc, in, nextIndex, arc.presenceIndex + 1); + + default: + // Variable length arcs - linear search. + assert arc.bytesPerArc() == 0; + in.setPosition(arc.nextArc()); + arc.flags = in.readByte(); + } + return readArc(arc, in); + } + + /** + * Reads an arc.
+ * Precondition: The arc flags byte has already been read and set; the given BytesReader is + * positioned just after the arc flags byte. + */ + private PrimitiveLongArc readArc(PrimitiveLongArc arc, BytesReader in) throws IOException { + if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING || arc.nodeFlags() == ARCS_FOR_CONTINUOUS) { + arc.label = arc.firstLabel() + arc.arcIdx(); + } else { + arc.label = readLabel(in); + } + + if (arc.flag(BIT_ARC_HAS_OUTPUT)) { + arc.output = outputs.read(in); + } else { + arc.output = outputs.getNoOutput(); + } + + if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { + arc.nextFinalOutput = outputs.readFinalOutput(in); + } else { + arc.nextFinalOutput = outputs.getNoOutput(); + } + + if (arc.flag(BIT_STOP_NODE)) { + if (arc.flag(BIT_FINAL_ARC)) { + arc.target = FINAL_END_NODE; + } else { + arc.target = NON_FINAL_END_NODE; + } + arc.nextArc = in.getPosition(); // Only useful for list. + } else if (arc.flag(BIT_TARGET_NEXT)) { + arc.nextArc = in.getPosition(); // Only useful for list. + // TODO: would be nice to make this lazy -- maybe + // caller doesn't need the target and is scanning arcs... + if (!arc.flag(BIT_LAST_ARC)) { + if (arc.bytesPerArc() == 0) { + // must scan + seekToNextNode(in); + } else { + int numArcs = + arc.nodeFlags == ARCS_FOR_DIRECT_ADDRESSING + ? BitTable.countBits(arc, in) + : arc.numArcs(); + in.setPosition(arc.posArcsStart() - arc.bytesPerArc() * (long) numArcs); + } + } + arc.target = in.getPosition(); + } else { + arc.target = readUnpackedNodeTarget(in); + arc.nextArc = in.getPosition(); // Only useful for list. + } + return arc; + } + + static PrimitiveLongArc readEndArc(PrimitiveLongArc follow, PrimitiveLongArc arc) { + if (follow.isFinal()) { + if (follow.target() <= 0) { + arc.flags = PrimitiveLongFST.BIT_LAST_ARC; + } else { + arc.flags = 0; + // NOTE: nextArc is a node (not an address!) 
in this case: + arc.nextArc = follow.target(); + } + arc.output = follow.nextFinalOutput(); + arc.label = PrimitiveLongFST.END_LABEL; + return arc; + } else { + return null; + } + } + + // TODO: could we somehow [partially] tableize arc lookups + // like automaton? + + /** + * Finds an arc leaving the incoming arc, replacing the arc in place. This returns null if the arc + * was not found, else the incoming arc. + */ + public PrimitiveLongArc findTargetArc( + int labelToMatch, PrimitiveLongArc follow, PrimitiveLongArc arc, BytesReader in) + throws IOException { + + if (labelToMatch == END_LABEL) { + if (follow.isFinal()) { + if (follow.target() <= 0) { + arc.flags = BIT_LAST_ARC; + } else { + arc.flags = 0; + // NOTE: nextArc is a node (not an address!) in this case: + arc.nextArc = follow.target(); + } + arc.output = follow.nextFinalOutput(); + arc.label = END_LABEL; + arc.nodeFlags = arc.flags; + return arc; + } else { + return null; + } + } + + if (!targetHasArcs(follow)) { + return null; + } + + in.setPosition(follow.target()); + + // System.out.println("fta label=" + (char) labelToMatch); + + byte flags = arc.nodeFlags = in.readByte(); + if (flags == ARCS_FOR_DIRECT_ADDRESSING) { + arc.numArcs = in.readVInt(); // This is in fact the label range. + arc.bytesPerArc = in.readVInt(); + readPresenceBytes(arc, in); + arc.firstLabel = readLabel(in); + arc.posArcsStart = in.getPosition(); + + int arcIndex = labelToMatch - arc.firstLabel(); + if (arcIndex < 0 || arcIndex >= arc.numArcs()) { + return null; // Before or after label range. + } else if (!BitTable.isBitSet(arcIndex, arc, in)) { + return null; // Arc missing in the range. 
+ } + return readArcByDirectAddressing(arc, in, arcIndex); + } else if (flags == ARCS_FOR_BINARY_SEARCH) { + arc.numArcs = in.readVInt(); + arc.bytesPerArc = in.readVInt(); + arc.posArcsStart = in.getPosition(); + + // Array is sparse; do binary search: + int low = 0; + int high = arc.numArcs() - 1; + while (low <= high) { + // System.out.println(" cycle"); + int mid = (low + high) >>> 1; + // +1 to skip over flags + in.setPosition(arc.posArcsStart() - (arc.bytesPerArc() * mid + 1)); + int midLabel = readLabel(in); + final int cmp = midLabel - labelToMatch; + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + arc.arcIdx = mid - 1; + // System.out.println(" found!"); + return readNextRealArc(arc, in); + } + } + return null; + } else if (flags == ARCS_FOR_CONTINUOUS) { + arc.numArcs = in.readVInt(); + arc.bytesPerArc = in.readVInt(); + arc.firstLabel = readLabel(in); + arc.posArcsStart = in.getPosition(); + int arcIndex = labelToMatch - arc.firstLabel(); + if (arcIndex < 0 || arcIndex >= arc.numArcs()) { + return null; // Before or after label range. 
+ } + arc.arcIdx = arcIndex - 1; + return readNextRealArc(arc, in); + } + + // Linear scan + readFirstArcInfo(follow.target(), arc, in); + in.setPosition(arc.nextArc()); + while (true) { + assert arc.bytesPerArc() == 0; + flags = arc.flags = in.readByte(); + long pos = in.getPosition(); + int label = readLabel(in); + if (label == labelToMatch) { + in.setPosition(pos); + return readArc(arc, in); + } else if (label > labelToMatch) { + return null; + } else if (arc.isLast()) { + return null; + } else { + if (flag(flags, BIT_ARC_HAS_OUTPUT)) { + outputs.skipOutput(in); + } + if (flag(flags, BIT_ARC_HAS_FINAL_OUTPUT)) { + outputs.skipFinalOutput(in); + } + if (flag(flags, BIT_STOP_NODE) == false && flag(flags, BIT_TARGET_NEXT) == false) { + readUnpackedNodeTarget(in); + } + } + } + } + + private void seekToNextNode(BytesReader in) throws IOException { + + while (true) { + + final int flags = in.readByte(); + readLabel(in); + + if (flag(flags, BIT_ARC_HAS_OUTPUT)) { + outputs.skipOutput(in); + } + + if (flag(flags, BIT_ARC_HAS_FINAL_OUTPUT)) { + outputs.skipFinalOutput(in); + } + + if (flag(flags, BIT_STOP_NODE) == false && flag(flags, BIT_TARGET_NEXT) == false) { + readUnpackedNodeTarget(in); + } + + if (flag(flags, BIT_LAST_ARC)) { + return; + } + } + } + + /** Returns a {@link BytesReader} for this FST, positioned at position 0. 
*/ + public BytesReader getBytesReader() { + return fstReader.getReverseBytesReader(); + } + + /** Represent the FST metadata */ + public static final class PrimitiveLongFSTMetadata { + final INPUT_TYPE inputType; + final PrimitiveLongFSTOutputs outputs; + final int version; + // if non-null, this FST accepts the empty string and + // produces this output + Long emptyOutput; + long startNode; + long numBytes; + + public PrimitiveLongFSTMetadata( + INPUT_TYPE inputType, + PrimitiveLongFSTOutputs outputs, + Long emptyOutput, + long startNode, + int version, + long numBytes) { + this.inputType = inputType; + this.outputs = outputs; + this.emptyOutput = emptyOutput; + this.startNode = startNode; + this.version = version; + this.numBytes = numBytes; + } + } + + public static class PrimitiveLongFSTOutputs { + + private static final long NO_OUTPUT = 0L; + + private static final PrimitiveLongFSTOutputs singleton = new PrimitiveLongFSTOutputs(); + + private PrimitiveLongFSTOutputs() {} + + public static PrimitiveLongFSTOutputs getSingleton() { + return singleton; + } + + public long common(long output1, long output2) { + assert valid(output1); + assert valid(output2); + if (output1 == NO_OUTPUT || output2 == NO_OUTPUT) { + return NO_OUTPUT; + } else { + assert output1 > 0; + assert output2 > 0; + return Math.min(output1, output2); + } + } + + public long subtract(long output, long inc) { + assert valid(output); + assert valid(inc); + assert output >= inc; + + if (inc == NO_OUTPUT) { + return output; + } else if (output == inc) { + return NO_OUTPUT; + } else { + return output - inc; + } + } + + public long add(long prefix, long output) { + assert valid(prefix); + assert valid(output); + if (prefix == NO_OUTPUT) { + return output; + } else if (output == NO_OUTPUT) { + return prefix; + } else { + return prefix + output; + } + } + + public void write(long output, DataOutput out) throws IOException { + assert valid(output); + out.writeVLong(output); + } + + public long 
read(DataInput in) throws IOException { + long v = in.readVLong(); + if (v == 0) { + return NO_OUTPUT; + } else { + return v; + } + } + + private boolean valid(long o) { + assert o == NO_OUTPUT || o > 0 : "o=" + o; + return true; + } + + public long getNoOutput() { + return NO_OUTPUT; + } + + public String outputToString(long output) { + return Long.toString(output); + } + + public String toString() { + return "PrimitiveLongFSTOutputs"; + } + + public long ramBytesUsed(Long output) { + return RamUsageEstimator.sizeOf(output); + } + + public void skipOutput(BytesReader in) throws IOException { + read(in); + } + + public void skipFinalOutput(BytesReader in) throws IOException { + read(in); + } + + public long readFinalOutput(BytesReader in) throws IOException { + return read(in); + } + + public void writeFinalOutput(long output, DataOutput out) throws IOException { + write(output, out); + } + } + + public static long get(PrimitiveLongFST primitiveLongFST, BytesRef input) throws IOException { + assert primitiveLongFST.metadata.inputType == PrimitiveLongFST.INPUT_TYPE.BYTE1; + + final BytesReader fstReader = primitiveLongFST.getBytesReader(); + + // TODO: would be nice not to alloc this on every lookup + final PrimitiveLongArc arc = primitiveLongFST.getFirstArc(new PrimitiveLongArc()); + + // Accumulate output as we go + long output = primitiveLongFST.outputs.getNoOutput(); + for (int i = 0; i < input.length; i++) { + if (primitiveLongFST.findTargetArc(input.bytes[i + input.offset] & 0xFF, arc, arc, fstReader) + == null) { + return -1; + } + output = primitiveLongFST.outputs.add(output, arc.output()); + } + + if (arc.isFinal()) { + return primitiveLongFST.outputs.add(output, arc.nextFinalOutput()); + } else { + return -1; + } + } +} From e06665ece54e28e199d0d39dba9b1638a52da30e Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Sun, 26 Nov 2023 10:42:04 -0800 Subject: [PATCH 45/57] Allocate only one set of buffers in TermDataReader instead of one set per type --- 
.../codecs/lucene99/randomaccess/TermData.java | 5 +++-- .../randomaccess/TermDataReaderProvider.java | 17 +++++++++-------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java index c72bef50451e..1b9a8c7406d8 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java @@ -53,8 +53,9 @@ IntBlockTermState getTermStateWithBuffer( long metadataStartPos = blockId * (codec.getMetadataBytesLength() + 8); long dataStartPos = metadata.getLong(metadataStartPos); - metadata.readBytesTo(metaDataBuffer, metadataStartPos + 8, codec.getMetadataBytesLength()); - BytesRef metadataBytesRef = new BytesRef(metaDataBuffer); + int metadataLength = codec.getMetadataBytesLength(); + metadata.readBytesTo(metaDataBuffer, metadataStartPos + 8, metadataLength); + BytesRef metadataBytesRef = new BytesRef(metaDataBuffer, 0, metadataLength); int numBitsPerRecord = codec.getNumBitsPerRecord(metadataBytesRef); int dataBitIndex = numBitsPerRecord * ((int) (ord % TermDataWriter.NUM_TERMS_PER_BLOCK)); diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java index a65e9b1304c5..45ba2b00b7c4 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java @@ -76,14 +76,14 @@ record TermDataProviderAndCodec(TermDataProvider termDataProvider, TermStateCode public class TermDataReader { private final TermData[] termDataPerType; - private 
final byte[][] metaDataBufferPerType; + private final byte[] metaDataBuffer; - private final byte[][] dataBufferPerType; + private final byte[] dataBuffer; TermDataReader() throws IOException { termDataPerType = new TermData[termDataProviderAndCodecs.length]; - metaDataBufferPerType = new byte[termDataProviderAndCodecs.length][]; - dataBufferPerType = new byte[termDataProviderAndCodecs.length][]; + int maxMetadataLengthSeen = 0; + int maxDataLengthSeen = 0; for (int i = 0; i < termDataProviderAndCodecs.length; i++) { if (termDataProviderAndCodecs[i] == null) { @@ -95,9 +95,11 @@ public class TermDataReader { new TermData( termDataProvider.metadataProvider().newByteSlice(), termDataProvider.dataProvider().newByteSlice()); - metaDataBufferPerType[i] = new byte[codec.getMetadataBytesLength()]; - dataBufferPerType[i] = new byte[codec.getMaximumRecordSizeInBytes()]; + maxMetadataLengthSeen = Math.max(maxDataLengthSeen, codec.getMetadataBytesLength()); + maxDataLengthSeen = Math.max(maxMetadataLengthSeen, codec.getMaximumRecordSizeInBytes()); } + metaDataBuffer = new byte[maxMetadataLengthSeen]; + dataBuffer = new byte[maxDataLengthSeen]; } IntBlockTermState getTermState(TermType termType, long ord, IndexOptions indexOptions) @@ -108,8 +110,7 @@ IntBlockTermState getTermState(TermType termType, long ord, IndexOptions indexOp int typeId = termType.getId(); var codec = termDataProviderAndCodecs[termType.getId()].codec; IntBlockTermState termState = - termDataPerType[typeId].getTermStateWithBuffer( - codec, ord, metaDataBufferPerType[typeId], dataBufferPerType[typeId]); + termDataPerType[typeId].getTermStateWithBuffer(codec, ord, metaDataBuffer, dataBuffer); // need to filling some default values for the term state // in order to meet the expectations of the postings reader From 35af1d2cc969f95879966a9c2c5d37a4d276cbfa Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Sun, 26 Nov 2023 11:00:32 -0800 Subject: [PATCH 46/57] Make TermDataReader lazily init its buffer and clone 
 IndexInput

---
 .../randomaccess/TermDataReaderProvider.java | 59 +++++++++++--------
 1 file changed, 35 insertions(+), 24 deletions(-)

diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java
index 45ba2b00b7c4..633c44cef09f 100644
--- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java
+++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java
@@ -74,43 +74,54 @@ TermDataReaderProvider build() {
   record TermDataProviderAndCodec(TermDataProvider termDataProvider, TermStateCodec codec) {}
 
   public class TermDataReader {
-    private final TermData[] termDataPerType;
-
-    private final byte[] metaDataBuffer;
-
-    private final byte[] dataBuffer;
-
-    TermDataReader() throws IOException {
-      termDataPerType = new TermData[termDataProviderAndCodecs.length];
-      int maxMetadataLengthSeen = 0;
-      int maxDataLengthSeen = 0;
-
-      for (int i = 0; i < termDataProviderAndCodecs.length; i++) {
-        if (termDataProviderAndCodecs[i] == null) {
-          continue;
+    private TermData[] termDataPerType;
+
+    private byte[] metaDataBuffer;
+
+    private byte[] dataBuffer;
+
+    void maybeInitBuffer() {
+      if (metaDataBuffer == null || dataBuffer == null) {
+        int maxMetadataLengthSeen = 0;
+        int maxDataLengthSeen = 0;
+        for (int i = 0; i < termDataProviderAndCodecs.length; i++) {
+          if (termDataProviderAndCodecs[i] == null) {
+            continue;
+          }
+          var codec = termDataProviderAndCodecs[i].codec;
+          maxMetadataLengthSeen = Math.max(maxMetadataLengthSeen, codec.getMetadataBytesLength());
+          maxDataLengthSeen = Math.max(maxDataLengthSeen, codec.getMaximumRecordSizeInBytes());
         }
-        var codec = termDataProviderAndCodecs[i].codec;
-        TermDataProvider termDataProvider = termDataProviderAndCodecs[i].termDataProvider;
-        termDataPerType[i] =
+        metaDataBuffer = 
new byte[maxMetadataLengthSeen]; + dataBuffer = new byte[maxDataLengthSeen]; + } + } + + TermData getTermData(int typeId) throws IOException { + if (termDataPerType == null) { + termDataPerType = new TermData[termDataProviderAndCodecs.length]; + } + if (termDataPerType[typeId] == null) { + TermDataProvider termDataProvider = termDataProviderAndCodecs[typeId].termDataProvider; + termDataPerType[typeId] = new TermData( termDataProvider.metadataProvider().newByteSlice(), termDataProvider.dataProvider().newByteSlice()); - maxMetadataLengthSeen = Math.max(maxDataLengthSeen, codec.getMetadataBytesLength()); - maxDataLengthSeen = Math.max(maxMetadataLengthSeen, codec.getMaximumRecordSizeInBytes()); } - metaDataBuffer = new byte[maxMetadataLengthSeen]; - dataBuffer = new byte[maxDataLengthSeen]; + return termDataPerType[typeId]; } IntBlockTermState getTermState(TermType termType, long ord, IndexOptions indexOptions) throws IOException { assert termDataProviderAndCodecs[termType.getId()] != null; - assert termDataPerType[termType.getId()] != null; + + maybeInitBuffer(); int typeId = termType.getId(); - var codec = termDataProviderAndCodecs[termType.getId()].codec; + var codec = termDataProviderAndCodecs[typeId].codec; + var termData = getTermData(typeId); IntBlockTermState termState = - termDataPerType[typeId].getTermStateWithBuffer(codec, ord, metaDataBuffer, dataBuffer); + termData.getTermStateWithBuffer(codec, ord, metaDataBuffer, dataBuffer); // need to filling some default values for the term state // in order to meet the expectations of the postings reader From 79c0fb32fede7f5932055c217f504bc950092f7e Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Sun, 26 Nov 2023 22:33:47 -0800 Subject: [PATCH 47/57] Implement BytesRefPrimitiveLongFSTEnum that works with a primitive long FST --- .../fst/BytesRefPrimitiveLongFSTEnum.java | 125 +++ .../lucene/util/fst/PrimitiveLongFSTEnum.java | 758 ++++++++++++++++++ .../java/org/apache/lucene/util/fst/Util.java | 29 + 3 files changed, 
912 insertions(+) create mode 100644 lucene/core/src/java/org/apache/lucene/util/fst/BytesRefPrimitiveLongFSTEnum.java create mode 100644 lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTEnum.java diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/BytesRefPrimitiveLongFSTEnum.java b/lucene/core/src/java/org/apache/lucene/util/fst/BytesRefPrimitiveLongFSTEnum.java new file mode 100644 index 000000000000..af34576b35b1 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/fst/BytesRefPrimitiveLongFSTEnum.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util.fst; + +import java.io.IOException; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; + +/** + * Enumerates all input (BytesRef) + output pairs in a {@link PrimitiveLongFST}. + * + * @lucene.experimental + */ +public final class BytesRefPrimitiveLongFSTEnum extends PrimitiveLongFSTEnum { + private final BytesRef current = new BytesRef(10); + private final InputOutput result = new InputOutput(); + private BytesRef target; + + /** Holds a single input (BytesRef) + output pair. 
*/ + public static class InputOutput { + public BytesRef input; + public long output; + } + + /** + * doFloor controls the behavior of advance: if it's true doFloor is true, advance positions to + * the biggest term before target. + */ + public BytesRefPrimitiveLongFSTEnum(PrimitiveLongFST fst) { + super(fst); + result.input = current; + current.offset = 1; + } + + public InputOutput current() { + return result; + } + + public InputOutput next() throws IOException { + // System.out.println(" enum.next"); + doNext(); + return setResult(); + } + + /** Seeks to smallest term that's >= target. */ + public InputOutput seekCeil(BytesRef target) throws IOException { + this.target = target; + targetLength = target.length; + super.doSeekCeil(); + return setResult(); + } + + /** Seeks to biggest term that's <= target. */ + public InputOutput seekFloor(BytesRef target) throws IOException { + this.target = target; + targetLength = target.length; + super.doSeekFloor(); + return setResult(); + } + + /** + * Seeks to exactly this term, returning null if the term doesn't exist. This is faster than using + * {@link #seekFloor} or {@link #seekCeil} because it short-circuits as soon the match is not + * found. 
+ */ + public InputOutput seekExact(BytesRef target) throws IOException { + this.target = target; + targetLength = target.length; + if (doSeekExact()) { + assert upto == 1 + target.length; + return setResult(); + } else { + return null; + } + } + + @Override + protected int getTargetLabel() { + if (upto - 1 == target.length) { + return FST.END_LABEL; + } else { + return target.bytes[target.offset + upto - 1] & 0xFF; + } + } + + @Override + protected int getCurrentLabel() { + // current.offset fixed at 1 + return current.bytes[upto] & 0xFF; + } + + @Override + protected void setCurrentLabel(int label) { + current.bytes[upto] = (byte) label; + } + + @Override + protected void grow() { + current.bytes = ArrayUtil.grow(current.bytes, upto + 1); + } + + private InputOutput setResult() { + if (upto == 0) { + result.output = -1; + } else { + current.length = upto - 1; + result.output = output[upto]; + } + return result; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTEnum.java b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTEnum.java new file mode 100644 index 000000000000..85c0815f964a --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTEnum.java @@ -0,0 +1,758 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util.fst; + +import static org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongArc.BitTable; + +import java.io.IOException; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongArc; + +/** + * Can next() and advance() through the terms in an {@link PrimitiveLongFST} + * + * @lucene.experimental + */ +abstract class PrimitiveLongFSTEnum { + protected final PrimitiveLongFST fst; + + protected PrimitiveLongArc[] arcs = new PrimitiveLongArc[10]; + + protected long[] output = new long[10]; + + protected final long NO_OUTPUT; + protected final FST.BytesReader fstReader; + + protected int upto; + int targetLength; + + /** + * doFloor controls the behavior of advance: if it's true doFloor is true, advance positions to + * the biggest term before target. 
+ */ + PrimitiveLongFSTEnum(PrimitiveLongFST fst) { + this.fst = fst; + fstReader = fst.getBytesReader(); + NO_OUTPUT = fst.outputs.getNoOutput(); + fst.getFirstArc(getArc(0)); + output[0] = NO_OUTPUT; + } + + protected abstract int getTargetLabel(); + + protected abstract int getCurrentLabel(); + + protected abstract void setCurrentLabel(int label); + + protected abstract void grow(); + + /** Rewinds enum state to match the shared prefix between current term and target term */ + private void rewindPrefix() throws IOException { + if (upto == 0) { + // System.out.println(" init"); + upto = 1; + fst.readFirstTargetArc(getArc(0), getArc(1), fstReader); + return; + } + // System.out.println(" rewind upto=" + upto + " vs targetLength=" + targetLength); + + final int currentLimit = upto; + upto = 1; + while (upto < currentLimit && upto <= targetLength + 1) { + final int cmp = getCurrentLabel() - getTargetLabel(); + if (cmp < 0) { + // seek forward + // System.out.println(" seek fwd"); + break; + } else if (cmp > 0) { + // seek backwards -- reset this arc to the first arc + final PrimitiveLongArc arc = getArc(upto); + fst.readFirstTargetArc(getArc(upto - 1), arc, fstReader); + // System.out.println(" seek first arc"); + break; + } + upto++; + } + // System.out.println(" fall through upto=" + upto); + } + + protected void doNext() throws IOException { + // System.out.println("FE: next upto=" + upto); + if (upto == 0) { + // System.out.println(" init"); + upto = 1; + fst.readFirstTargetArc(getArc(0), getArc(1), fstReader); + } else { + // pop + // System.out.println(" check pop curArc target=" + arcs[upto].target + " label=" + + // arcs[upto].label + " isLast?=" + arcs[upto].isLast()); + while (arcs[upto].isLast()) { + upto--; + if (upto == 0) { + // System.out.println(" eof"); + return; + } + } + fst.readNextArc(arcs[upto], fstReader); + } + + pushFirst(); + } + + // TODO: should we return a status here (SEEK_FOUND / SEEK_NOT_FOUND / + // SEEK_END)? 
saves the eq check above? + + /** Seeks to smallest term that's >= target. */ + protected void doSeekCeil() throws IOException { + + // System.out.println(" advance len=" + target.length + " curlen=" + current.length); + + // TODO: possibly caller could/should provide common + // prefix length? ie this work may be redundant if + // caller is in fact intersecting against its own + // automaton + + // System.out.println("FE.seekCeil upto=" + upto); + + // Save time by starting at the end of the shared prefix + // b/w our current term & the target: + rewindPrefix(); + // System.out.println(" after rewind upto=" + upto); + + PrimitiveLongArc arc = getArc(upto); + // System.out.println(" init targetLabel=" + targetLabel); + + // Now scan forward, matching the new suffix of the target + while (arc != null) { + int targetLabel = getTargetLabel(); + // System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) + // arc.label + ") vs targetLabel=" + targetLabel); + if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) { + // Arcs are in an array + final FST.BytesReader in = fst.getBytesReader(); + if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) { + arc = doSeekCeilArrayDirectAddressing(arc, targetLabel, in); + } else if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) { + arc = doSeekCeilArrayPacked(arc, targetLabel, in); + } else { + assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS; + arc = doSeekCeilArrayContinuous(arc, targetLabel, in); + } + } else { + arc = doSeekCeilList(arc, targetLabel); + } + } + } + + private PrimitiveLongArc doSeekCeilArrayContinuous( + final PrimitiveLongArc arc, final int targetLabel, final FST.BytesReader in) + throws IOException { + int targetIndex = targetLabel - arc.firstLabel(); + if (targetIndex >= arc.numArcs()) { + rollbackToLastForkThenPush(); + return null; + } else { + if (targetIndex < 0) { + fst.readArcByContinuous(arc, in, 0); + assert arc.label() > targetLabel; + pushFirst(); + return null; 
+ } else { + fst.readArcByContinuous(arc, in, targetIndex); + assert arc.label() == targetLabel; + // found -- copy pasta from below + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } + } + } + + private PrimitiveLongArc doSeekCeilArrayDirectAddressing( + final PrimitiveLongArc arc, final int targetLabel, final FST.BytesReader in) + throws IOException { + // The array is addressed directly by label, with presence bits to compute the actual arc + // offset. + + int targetIndex = targetLabel - arc.firstLabel(); + if (targetIndex >= arc.numArcs()) { + rollbackToLastForkThenPush(); + return null; + } else { + if (targetIndex < 0) { + targetIndex = -1; + } else if (BitTable.isBitSet(targetIndex, arc, in)) { + fst.readArcByDirectAddressing(arc, in, targetIndex); + assert arc.label() == targetLabel; + // found -- copy pasta from below + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } + // Not found, return the next arc (ceil). + int ceilIndex = BitTable.nextBitSet(targetIndex, arc, in); + assert ceilIndex != -1; + fst.readArcByDirectAddressing(arc, in, ceilIndex); + assert arc.label() > targetLabel; + pushFirst(); + return null; + } + } + + private PrimitiveLongArc doSeekCeilArrayPacked( + final PrimitiveLongArc arc, final int targetLabel, final FST.BytesReader in) + throws IOException { + // The array is packed -- use binary search to find the target. 
+ int idx = Util.binarySearch(fst, arc, targetLabel); + if (idx >= 0) { + // Match + fst.readArcByIndex(arc, in, idx); + assert arc.arcIdx() == idx; + assert arc.label() == targetLabel + : "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel + " mid=" + idx; + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } + idx = -1 - idx; + if (idx == arc.numArcs()) { + // Dead end + fst.readArcByIndex(arc, in, idx - 1); + assert arc.isLast(); + // Dead end (target is after the last arc); + // rollback to last fork then push + upto--; + while (true) { + if (upto == 0) { + return null; + } + final PrimitiveLongArc prevArc = getArc(upto); + // System.out.println(" rollback upto=" + upto + " arc.label=" + prevArc.label + " + // isLast?=" + prevArc.isLast()); + if (!prevArc.isLast()) { + fst.readNextArc(prevArc, fstReader); + pushFirst(); + return null; + } + upto--; + } + } else { + // Ceiling - arc with least higher label + fst.readArcByIndex(arc, in, idx); + assert arc.label() > targetLabel; + pushFirst(); + return null; + } + } + + private PrimitiveLongArc doSeekCeilList(final PrimitiveLongArc arc, final int targetLabel) + throws IOException { + // Arcs are not array'd -- must do linear scan: + if (arc.label() == targetLabel) { + // recurse + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } else if (arc.label() > targetLabel) { + pushFirst(); + return null; + } else if (arc.isLast()) { + // Dead end (target is after the last arc); + // rollback to last fork then push + upto--; + while (true) { + if (upto == 0) { + return null; + } + final PrimitiveLongArc prevArc = getArc(upto); + // System.out.println(" rollback 
upto=" + upto + " arc.label=" + prevArc.label + " + // isLast?=" + prevArc.isLast()); + if (!prevArc.isLast()) { + fst.readNextArc(prevArc, fstReader); + pushFirst(); + return null; + } + upto--; + } + } else { + // keep scanning + // System.out.println(" next scan"); + fst.readNextArc(arc, fstReader); + } + return arc; + } + + // Todo: should we return a status here (SEEK_FOUND / SEEK_NOT_FOUND / + // SEEK_END)? saves the eq check above? + /** Seeks to largest term that's <= target. */ + void doSeekFloor() throws IOException { + + // TODO: possibly caller could/should provide common + // prefix length? ie this work may be redundant if + // caller is in fact intersecting against its own + // automaton + // System.out.println("FE: seek floor upto=" + upto); + + // Save CPU by starting at the end of the shared prefix + // b/w our current term & the target: + rewindPrefix(); + + // System.out.println("FE: after rewind upto=" + upto); + + PrimitiveLongArc arc = getArc(upto); + + // System.out.println("FE: init targetLabel=" + targetLabel); + + // Now scan forward, matching the new suffix of the target + while (arc != null) { + // System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) + // arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast() + " bba=" + + // arc.bytesPerArc); + int targetLabel = getTargetLabel(); + + if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) { + // Arcs are in an array + final FST.BytesReader in = fst.getBytesReader(); + if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) { + arc = doSeekFloorArrayDirectAddressing(arc, targetLabel, in); + } else if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) { + arc = doSeekFloorArrayPacked(arc, targetLabel, in); + } else { + assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS; + arc = doSeekFloorContinuous(arc, targetLabel, in); + } + } else { + arc = doSeekFloorList(arc, targetLabel); + } + } + } + + private PrimitiveLongArc doSeekFloorContinuous( 
+ PrimitiveLongArc arc, int targetLabel, FST.BytesReader in) throws IOException { + int targetIndex = targetLabel - arc.firstLabel(); + if (targetIndex < 0) { + // Before first arc. + return backtrackToFloorArc(arc, targetLabel, in); + } else if (targetIndex >= arc.numArcs()) { + // After last arc. + fst.readLastArcByContinuous(arc, in); + assert arc.label() < targetLabel; + assert arc.isLast(); + pushLast(); + return null; + } else { + // Within label range. + fst.readArcByContinuous(arc, in, targetIndex); + assert arc.label() == targetLabel; + // found -- copy pasta from below + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } + } + + private PrimitiveLongArc doSeekFloorArrayDirectAddressing( + PrimitiveLongArc arc, int targetLabel, FST.BytesReader in) throws IOException { + // The array is addressed directly by label, with presence bits to compute the actual arc + // offset. + + int targetIndex = targetLabel - arc.firstLabel(); + if (targetIndex < 0) { + // Before first arc. + return backtrackToFloorArc(arc, targetLabel, in); + } else if (targetIndex >= arc.numArcs()) { + // After last arc. + fst.readLastArcByDirectAddressing(arc, in); + assert arc.label() < targetLabel; + assert arc.isLast(); + pushLast(); + return null; + } else { + // Within label range. + if (BitTable.isBitSet(targetIndex, arc, in)) { + fst.readArcByDirectAddressing(arc, in, targetIndex); + assert arc.label() == targetLabel; + // found -- copy pasta from below + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } + // Scan backwards to find a floor arc. 
+ int floorIndex = BitTable.previousBitSet(targetIndex, arc, in); + assert floorIndex != -1; + fst.readArcByDirectAddressing(arc, in, floorIndex); + assert arc.label() < targetLabel; + assert arc.isLast() || fst.readNextArcLabel(arc, in) > targetLabel; + pushLast(); + return null; + } + } + + /** + * Target is beyond the last arc, out of label range. Dead end (target is after the last arc); + * rollback to last fork then push + */ + private void rollbackToLastForkThenPush() throws IOException { + upto--; + while (true) { + if (upto == 0) { + return; + } + final PrimitiveLongArc prevArc = getArc(upto); + // System.out.println(" rollback upto=" + upto + " arc.label=" + prevArc.label + " + // isLast?=" + prevArc.isLast()); + if (!prevArc.isLast()) { + fst.readNextArc(prevArc, fstReader); + pushFirst(); + return; + } + upto--; + } + } + + /** + * Backtracks until it finds a node which first arc is before our target label.` Then on the node, + * finds the arc just before the targetLabel. + * + * @return null to continue the seek floor recursion loop. + */ + private PrimitiveLongArc backtrackToFloorArc( + PrimitiveLongArc arc, int targetLabel, final FST.BytesReader in) throws IOException { + while (true) { + // First, walk backwards until we find a node which first arc is before our target label. + fst.readFirstTargetArc(getArc(upto - 1), arc, fstReader); + if (arc.label() < targetLabel) { + // Then on this node, find the arc just before the targetLabel. 
+ if (!arc.isLast()) { + if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) { + if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) { + findNextFloorArcBinarySearch(arc, targetLabel, in); + } else if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) { + findNextFloorArcDirectAddressing(arc, targetLabel, in); + } else { + assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS; + findNextFloorArcContinuous(arc, targetLabel, in); + } + } else { + while (!arc.isLast() && fst.readNextArcLabel(arc, in) < targetLabel) { + fst.readNextArc(arc, fstReader); + } + } + } + assert arc.label() < targetLabel; + assert arc.isLast() || fst.readNextArcLabel(arc, in) >= targetLabel; + pushLast(); + return null; + } + upto--; + if (upto == 0) { + return null; + } + targetLabel = getTargetLabel(); + arc = getArc(upto); + } + } + + /** + * Finds and reads an arc on the current node which label is strictly less than the given label. + * Skips the first arc, finds next floor arc; or none if the floor arc is the first arc itself (in + * this case it has already been read). + * + *
<p>
Precondition: the given arc is the first arc of the node. + */ + private void findNextFloorArcDirectAddressing( + PrimitiveLongArc arc, int targetLabel, final FST.BytesReader in) throws IOException { + assert arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING; + assert arc.label() != FST.END_LABEL; + assert arc.label() == arc.firstLabel(); + if (arc.numArcs() > 1) { + int targetIndex = targetLabel - arc.firstLabel(); + assert targetIndex >= 0; + if (targetIndex >= arc.numArcs()) { + // Beyond last arc. Take last arc. + fst.readLastArcByDirectAddressing(arc, in); + } else { + // Take the preceding arc, even if the target is present. + int floorIndex = BitTable.previousBitSet(targetIndex, arc, in); + if (floorIndex > 0) { + fst.readArcByDirectAddressing(arc, in, floorIndex); + } + } + } + } + + /** Same as {@link #findNextFloorArcDirectAddressing} for continuous node. */ + private void findNextFloorArcContinuous( + PrimitiveLongArc arc, int targetLabel, final FST.BytesReader in) throws IOException { + assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS; + assert arc.label() != FST.END_LABEL; + assert arc.label() == arc.firstLabel(); + if (arc.numArcs() > 1) { + int targetIndex = targetLabel - arc.firstLabel(); + assert targetIndex >= 0; + if (targetIndex >= arc.numArcs()) { + // Beyond last arc. Take last arc. + fst.readLastArcByContinuous(arc, in); + } else { + fst.readArcByContinuous(arc, in, targetIndex - 1); + } + } + } + + /** Same as {@link #findNextFloorArcDirectAddressing} for binary search node. 
*/ + private void findNextFloorArcBinarySearch( + PrimitiveLongArc arc, int targetLabel, FST.BytesReader in) throws IOException { + assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH; + assert arc.label() != FST.END_LABEL; + assert arc.arcIdx() == 0; + if (arc.numArcs() > 1) { + int idx = Util.binarySearch(fst, arc, targetLabel); + assert idx != -1; + if (idx > 1) { + fst.readArcByIndex(arc, in, idx - 1); + } else if (idx < -2) { + fst.readArcByIndex(arc, in, -2 - idx); + } + } + } + + private PrimitiveLongArc doSeekFloorArrayPacked( + PrimitiveLongArc arc, int targetLabel, final FST.BytesReader in) throws IOException { + // Arcs are fixed array -- use binary search to find the target. + int idx = Util.binarySearch(fst, arc, targetLabel); + + if (idx >= 0) { + // Match -- recurse + // System.out.println(" match! arcIdx=" + idx); + fst.readArcByIndex(arc, in, idx); + assert arc.arcIdx() == idx; + assert arc.label() == targetLabel + : "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel + " mid=" + idx; + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } else if (idx == -1) { + // Before first arc. + return backtrackToFloorArc(arc, targetLabel, in); + } else { + // There is a floor arc; idx will be (-1 - (floor + 1)). 
+ fst.readArcByIndex(arc, in, -2 - idx); + assert arc.isLast() || fst.readNextArcLabel(arc, in) > targetLabel; + assert arc.label() < targetLabel + : "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel; + pushLast(); + return null; + } + } + + private PrimitiveLongArc doSeekFloorList(PrimitiveLongArc arc, int targetLabel) + throws IOException { + if (arc.label() == targetLabel) { + // Match -- recurse + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } else if (arc.label() > targetLabel) { + // TODO: if each arc could somehow read the arc just + // before, we can save this re-scan. The ceil case + // doesn't need this because it reads the next arc + // instead: + while (true) { + // First, walk backwards until we find a first arc + // that's before our target label: + fst.readFirstTargetArc(getArc(upto - 1), arc, fstReader); + if (arc.label() < targetLabel) { + // Then, scan forwards to the arc just before + // the targetLabel: + while (!arc.isLast() && fst.readNextArcLabel(arc, fstReader) < targetLabel) { + fst.readNextArc(arc, fstReader); + } + pushLast(); + return null; + } + upto--; + if (upto == 0) { + return null; + } + targetLabel = getTargetLabel(); + arc = getArc(upto); + } + } else if (!arc.isLast()) { + // System.out.println(" check next label=" + fst.readNextArcLabel(arc) + " (" + (char) + // fst.readNextArcLabel(arc) + ")"); + if (fst.readNextArcLabel(arc, fstReader) > targetLabel) { + pushLast(); + return null; + } else { + // keep scanning + return fst.readNextArc(arc, fstReader); + } + } else { + pushLast(); + return null; + } + } + + /** Seeks to exactly target term. */ + boolean doSeekExact() throws IOException { + + // TODO: possibly caller could/should provide common + // prefix length? 
ie this work may be redundant if + // caller is in fact intersecting against its own + // automaton + + // System.out.println("FE: seek exact upto=" + upto); + + // Save time by starting at the end of the shared prefix + // b/w our current term & the target: + rewindPrefix(); + + // System.out.println("FE: after rewind upto=" + upto); + PrimitiveLongArc arc = getArc(upto - 1); + int targetLabel = getTargetLabel(); + + final FST.BytesReader fstReader = fst.getBytesReader(); + + while (true) { + // System.out.println(" cycle target=" + (targetLabel == -1 ? "-1" : (char) targetLabel)); + final PrimitiveLongArc nextArc = fst.findTargetArc(targetLabel, arc, getArc(upto), fstReader); + if (nextArc == null) { + // short circuit + // upto--; + // upto = 0; + fst.readFirstTargetArc(arc, getArc(upto), fstReader); + // System.out.println(" no match upto=" + upto); + return false; + } + // Match -- recurse: + output[upto] = fst.outputs.add(output[upto - 1], nextArc.output()); + if (targetLabel == FST.END_LABEL) { + // System.out.println(" return found; upto=" + upto + " output=" + output[upto] + " + // nextArc=" + nextArc.isLast()); + return true; + } + setCurrentLabel(targetLabel); + incr(); + targetLabel = getTargetLabel(); + arc = nextArc; + } + } + + private void incr() { + upto++; + grow(); + if (arcs.length <= upto) { + @SuppressWarnings({"rawtypes", "unchecked"}) + final PrimitiveLongArc[] newArcs = + new PrimitiveLongArc + [ArrayUtil.oversize(1 + upto, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(arcs, 0, newArcs, 0, arcs.length); + arcs = newArcs; + } + if (output.length <= upto) { + @SuppressWarnings({"rawtypes", "unchecked"}) + final long[] newOutput = + new long[ArrayUtil.oversize(1 + upto, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(output, 0, newOutput, 0, output.length); + output = newOutput; + } + } + + // Appends current arc, and then recurses from its target, + // appending first arc all the way to the final node + private 
void pushFirst() throws IOException { + + PrimitiveLongArc arc = arcs[upto]; + assert arc != null; + + while (true) { + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (arc.label() == FST.END_LABEL) { + // Final node + break; + } + // System.out.println(" pushFirst label=" + (char) arc.label + " upto=" + upto + " output=" + + // fst.outputs.outputToString(output[upto])); + setCurrentLabel(arc.label()); + incr(); + + final PrimitiveLongArc nextArc = getArc(upto); + fst.readFirstTargetArc(arc, nextArc, fstReader); + arc = nextArc; + } + } + + // Recurses from current arc, appending last arc all the + // way to the first final node + private void pushLast() throws IOException { + + PrimitiveLongArc arc = arcs[upto]; + assert arc != null; + + while (true) { + setCurrentLabel(arc.label()); + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (arc.label() == FST.END_LABEL) { + // Final node + break; + } + incr(); + + arc = fst.readLastTargetArc(arc, getArc(upto), fstReader); + } + } + + private PrimitiveLongArc getArc(int idx) { + if (arcs[idx] == null) { + arcs[idx] = new PrimitiveLongArc(); + } + return arcs[idx]; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java index 9fdc460d0583..740460679668 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java @@ -934,4 +934,33 @@ static int binarySearch(FST fst, FST.Arc arc, int targetLabel) throws } return -1 - low; } + + /** Same as {@link Util#binarySearch(FST, Arc, int)} but for {@link PrimitiveLongFST} */ + static int binarySearch( + PrimitiveLongFST fst, PrimitiveLongFST.PrimitiveLongArc arc, int targetLabel) + throws IOException { + assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH + : "Arc is not encoded as packed array for binary search (nodeFlags=" + + arc.nodeFlags() + + ")"; + BytesReader in = 
fst.getBytesReader(); + int low = arc.arcIdx(); + int mid; + int high = arc.numArcs() - 1; + while (low <= high) { + mid = (low + high) >>> 1; + in.setPosition(arc.posArcsStart()); + in.skipBytes((long) arc.bytesPerArc() * mid + 1); + final int midLabel = fst.readLabel(in); + final int cmp = midLabel - targetLabel; + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + return mid; + } + } + return -1 - low; + } } From b74a05dedef9c13b576b68621d69e83848b5f644 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Sun, 26 Nov 2023 23:35:16 -0800 Subject: [PATCH 48/57] Fix getFirstArc() bug in PrimitiveLongFST. --- .../lucene/util/fst/PrimitiveLongFST.java | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFST.java b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFST.java index c4a188fc58e6..900675090f97 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFST.java @@ -503,11 +503,10 @@ public void save(DataOutput metaOut, DataOutput out) throws IOException { */ public void saveMetadata(DataOutput metaOut) throws IOException { CodecUtil.writeHeader(metaOut, FILE_FORMAT_NAME, VERSION_CURRENT); - - // Accepts empty string - metaOut.writeByte((byte) 1); - if (metadata.emptyOutput != null) { + // Accepts empty string + metaOut.writeByte((byte) 1); + // Serialize empty-string output: ByteBuffersDataOutput ros = new ByteBuffersDataOutput(); outputs.writeFinalOutput(metadata.emptyOutput.longValue(), ros); @@ -607,14 +606,16 @@ private void readPresenceBytes(PrimitiveLongArc arc, BytesReader in) throws IOEx public PrimitiveLongArc getFirstArc(PrimitiveLongArc arc) { long NO_OUTPUT = outputs.getNoOutput(); - arc.flags = BIT_FINAL_ARC | BIT_LAST_ARC; if (metadata.emptyOutput != null) { + arc.flags = BIT_FINAL_ARC | BIT_LAST_ARC; arc.nextFinalOutput = 
metadata.emptyOutput.longValue(); + if (metadata.emptyOutput.longValue() != NO_OUTPUT) { + arc.flags = (byte) (arc.flags() | BIT_ARC_HAS_FINAL_OUTPUT); + } + } else { + arc.flags = BIT_LAST_ARC; + arc.nextFinalOutput = NO_OUTPUT; } - if (metadata.emptyOutput != null && metadata.emptyOutput.longValue() != NO_OUTPUT) { - arc.flags = (byte) (arc.flags() | BIT_ARC_HAS_FINAL_OUTPUT); - } - arc.output = NO_OUTPUT; // If there are no nodes, ie, the FST only accepts the From f328e9f11e1eabee2db52068d45bc05cb6249209 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 27 Nov 2023 14:33:37 -0800 Subject: [PATCH 49/57] Reuse single IntBlockTermState in TermDataReader --- .../lucene99/randomaccess/TermData.java | 13 +++++++--- .../randomaccess/TermDataReaderProvider.java | 4 +++- .../lucene99/randomaccess/TermStateCodec.java | 11 +++++++++ .../randomaccess/TermStateCodecImpl.java | 24 +++++++++++++++---- 4 files changed, 43 insertions(+), 9 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java index 1b9a8c7406d8..06cf69da9aa1 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java @@ -47,8 +47,13 @@ IntBlockTermState getTermState(TermStateCodec codec, long ord) throws IOExceptio return codec.decodeAt(metadataBytesRef, dataBytesRef, BitUnpackerImpl.INSTANCE, startBitIndex); } - IntBlockTermState getTermStateWithBuffer( - TermStateCodec codec, long ord, byte[] metaDataBuffer, byte[] dataBuffer) throws IOException { + IntBlockTermState getTermStateWithBufferAndReuse( + TermStateCodec codec, + long ord, + byte[] metaDataBuffer, + byte[] dataBuffer, + IntBlockTermState reuse) + throws IOException { long blockId = ord / TermDataWriter.NUM_TERMS_PER_BLOCK; long metadataStartPos = blockId 
* (codec.getMetadataBytesLength() + 8); long dataStartPos = metadata.getLong(metadataStartPos); @@ -67,6 +72,8 @@ IntBlockTermState getTermStateWithBuffer( data.readBytesTo(dataBuffer, dataStartPos + dataBitIndex / 8, numBytesToRead); BytesRef dataBytesRef = new BytesRef(dataBuffer, 0, numBytesToRead); - return codec.decodeAt(metadataBytesRef, dataBytesRef, BitUnpackerImpl.INSTANCE, startBitIndex); + codec.decodeAtWithReuse( + metadataBytesRef, dataBytesRef, BitUnpackerImpl.INSTANCE, startBitIndex, reuse); + return reuse; } } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java index 633c44cef09f..7d66fcd6abc6 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java @@ -80,6 +80,8 @@ public class TermDataReader { private byte[] dataBuffer; + private IntBlockTermState reuse = new IntBlockTermState(); + void maybeInitBuffer() { if (metaDataBuffer == null || dataBuffer == null) { int maxMetadataLengthSeen = 0; @@ -121,7 +123,7 @@ IntBlockTermState getTermState(TermType termType, long ord, IndexOptions indexOp var codec = termDataProviderAndCodecs[typeId].codec; var termData = getTermData(typeId); IntBlockTermState termState = - termData.getTermStateWithBuffer(codec, ord, metaDataBuffer, dataBuffer); + termData.getTermStateWithBufferAndReuse(codec, ord, metaDataBuffer, dataBuffer, reuse); // need to filling some default values for the term state // in order to meet the expectations of the postings reader diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java index 
1ef79ab7f158..081b5917b3c4 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java @@ -85,4 +85,15 @@ IntBlockTermState decodeWithinBlock( */ IntBlockTermState decodeAt( BytesRef metadataBytes, BytesRef dataBytes, BitUnpacker bitUnpacker, int startBitIndex); + + /** + * Like {@link TermStateCodec#decodeAt(BytesRef, BytesRef, BitUnpacker, int)} but with a caller + * provided `IntBlockTermState` instead of returning a allocated one. + */ + void decodeAtWithReuse( + BytesRef metadataBytes, + BytesRef dataBytes, + BitUnpacker bitUnpacker, + int startBitIndex, + IntBlockTermState reuse); } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java index adef80cba696..15fa3cbd9dde 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java @@ -212,11 +212,27 @@ public IntBlockTermState decodeWithinBlock( @Override public IntBlockTermState decodeAt( BytesRef metadataBytes, BytesRef dataBytes, BitUnpacker bitUnpacker, int startBitIndex) { - assert metadataBytes.length == this.metadataBytesLength; - int upto = metadataBytes.offset; IntBlockTermState decoded = new IntBlockTermState(); + decodeAtWithReuse(metadataBytes, dataBytes, bitUnpacker, startBitIndex, decoded); + + return decoded; + } + + @Override + public void decodeAtWithReuse( + BytesRef metadataBytes, + BytesRef dataBytes, + BitUnpacker bitUnpacker, + int startBitIndex, + IntBlockTermState reuse) { + assert metadataBytes.length == this.metadataBytesLength; + + reuse.lastPosBlockOffset = -1; + reuse.skipOffset = -1; + reuse.singletonDocID = 
-1; + int upto = metadataBytes.offset; for (int i = 0; i < components.length; i++) { var component = components[i]; int bitWidth = metadataBytes.bytes[upto++]; @@ -225,11 +241,9 @@ public IntBlockTermState decodeAt( val += (long) BitUtil.VH_LE_LONG.get(metadataBytes.bytes, upto); upto += 8; } - component.setTargetValue(decoded, val); + component.setTargetValue(reuse, val); startBitIndex += bitWidth; } - - return decoded; } private record Metadata(byte bitWidth, long referenceValue) {} From 0aadef5d7aa0ad462df38c75655c9f725f3873cc Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Sun, 26 Nov 2023 20:00:58 -0800 Subject: [PATCH 50/57] Don't create slice description when requesting random-access input slice Profiling show lots of allocation to build a name for such slice --- lucene/core/src/java/org/apache/lucene/store/IndexInput.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/store/IndexInput.java b/lucene/core/src/java/org/apache/lucene/store/IndexInput.java index 3f703bc54b26..4307376cffbf 100644 --- a/lucene/core/src/java/org/apache/lucene/store/IndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/IndexInput.java @@ -141,7 +141,7 @@ protected String getFullSliceDescription(String sliceDescription) { * implements absolute reads as seek+read. 
*/ public RandomAccessInput randomAccessSlice(long offset, long length) throws IOException { - final IndexInput slice = slice("randomaccess", offset, length); + final IndexInput slice = slice(null, offset, length); if (slice instanceof RandomAccessInput) { // slice() already supports random access return (RandomAccessInput) slice; From e70e712707ac342b788dc2a86b61c2e092c7320a Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Sun, 26 Nov 2023 22:40:18 -0800 Subject: [PATCH 51/57] Use primitive long FST for term lookup to avoid allocation from boxing-unboxing --- .../lucene99/randomaccess/RandomAccessTermsDict.java | 7 ++++--- .../sandbox/codecs/lucene99/randomaccess/TermsImpl.java | 8 ++++---- .../lucene/util/fst/BytesRefPrimitiveLongFSTEnum.java | 4 ++-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java index f767c2d4ed99..da48eb1f57e1 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java @@ -27,7 +27,7 @@ /** A term dictionary that offer random-access to read a specific term */ record RandomAccessTermsDict( TermsStats termsStats, - TermsIndex termsIndex, + TermsIndexPrimitive termsIndex, TermDataReaderProvider termDataReaderProvider, IndexOptions indexOptions) { @@ -52,9 +52,10 @@ static RandomAccessTermsDict deserialize( boolean hasPayloads = indexOptionsProvider.hasPayloads(termsStats.fieldNumber()); // (2) deserialize terms index - TermsIndex termsIndex = null; + TermsIndexPrimitive termsIndex = null; if (termsStats.size() > 0) { - termsIndex = TermsIndex.deserialize(metaInput, termIndexInput, /* load off heap */ true); + termsIndex = + 
TermsIndexPrimitive.deserialize(metaInput, termIndexInput, /* load off heap */ true); } // (3) deserialize all the term data by each TermType diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java index d3977e4d5252..36387d47d32c 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -29,7 +29,7 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.fst.BytesRefFSTEnum; +import org.apache.lucene.util.fst.BytesRefPrimitiveLongFSTEnum; final class TermsImpl extends Terms { private final FieldInfo fieldInfo; @@ -120,9 +120,9 @@ final class RandomAccessTermsEnum extends TermsEnum { private IntBlockTermState termState; - private final BytesRefFSTEnum fstEnum; + private final BytesRefPrimitiveLongFSTEnum fstEnum; - private BytesRefFSTEnum.InputOutput fstSeekState; + private BytesRefPrimitiveLongFSTEnum.InputOutput fstSeekState; // Only set when seekExact(term, state) is called, because that will update // the termState but leave the fstSeekState out of sync. 
@@ -133,7 +133,7 @@ final class RandomAccessTermsEnum extends TermsEnum { RandomAccessTermsEnum() throws IOException { termState = (IntBlockTermState) lucene99PostingsReader.newTermState(); - fstEnum = new BytesRefFSTEnum<>(termsDict.termsIndex().fst()); + fstEnum = new BytesRefPrimitiveLongFSTEnum(termsDict.termsIndex().primitiveLongFST()); termDataReader = termsDict.termDataReaderProvider().newReader(); } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/BytesRefPrimitiveLongFSTEnum.java b/lucene/core/src/java/org/apache/lucene/util/fst/BytesRefPrimitiveLongFSTEnum.java index af34576b35b1..1aa5b03e7bb5 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/BytesRefPrimitiveLongFSTEnum.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/BytesRefPrimitiveLongFSTEnum.java @@ -115,11 +115,11 @@ protected void grow() { private InputOutput setResult() { if (upto == 0) { - result.output = -1; + return null; } else { current.length = upto - 1; result.output = output[upto]; + return result; } - return result; } } From 3d21b1a01c79fb8b17861403f2633bca96cf1047 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 27 Nov 2023 13:58:11 -0800 Subject: [PATCH 52/57] Make RAFDirectory resilient to `null` description when slicing --- .../src/java/org/apache/lucene/misc/store/RAFDirectory.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lucene/misc/src/java/org/apache/lucene/misc/store/RAFDirectory.java b/lucene/misc/src/java/org/apache/lucene/misc/store/RAFDirectory.java index 420d6d40d6de..21ba55fd08ab 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/store/RAFDirectory.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/store/RAFDirectory.java @@ -140,7 +140,8 @@ public IndexInput slice(String sliceDescription, long offset, long length) throw throw new IllegalArgumentException( "slice() " + sliceDescription + " out of bounds: " + this); } - return new RAFIndexInput(sliceDescription, file, off + offset, length, 
getBufferSize()); + String description = sliceDescription == null ? toString() : sliceDescription; + return new RAFIndexInput(description, file, off + offset, length, getBufferSize()); } @Override From cd60a4f4338a314f86a327d243f4b634a0c65d67 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Tue, 28 Nov 2023 00:09:49 -0800 Subject: [PATCH 53/57] Implement interesect --- .../lucene99/randomaccess/TermsImpl.java | 388 +++++++++++++++++- .../java/org/apache/lucene/util/fst/Util.java | 86 +++- 2 files changed, 466 insertions(+), 8 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java index 36387d47d32c..9ddb4a9c8d77 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -27,9 +27,19 @@ import org.apache.lucene.index.TermState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.automaton.ByteRunnable; +import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.fst.BytesRefPrimitiveLongFSTEnum; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.PrimitiveLongFST; +import org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongArc; +import org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongFSTOutputs; +import org.apache.lucene.util.fst.Util; final class TermsImpl extends Terms { private final FieldInfo fieldInfo; @@ -104,12 +114,13 @@ public TermsEnum iterator() throws IOException { return new RandomAccessTermsEnum(); } - // 
TODO: implement a more efficient version via FST - // @Override - // public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException - // { - // return null; - // } + @Override + public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { + throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); + } + return new RandomAccessIntersectTermsEnum(compiled, startTerm); + } final class RandomAccessTermsEnum extends TermsEnum { private AttributeSource attrs; @@ -245,4 +256,369 @@ public void seekExact(long ord) throws IOException { throw new UnsupportedOperationException("By ord lookup not supported."); } } + + final class RandomAccessIntersectTermsEnum extends TermsEnum { + private AttributeSource attrs; + + private BytesRefBuilder term; + + private boolean isTermStateCurrent; + + private IntBlockTermState termState; + + private final PrimitiveLongFST fst; + + private final FST.BytesReader fstReader; + + private final ByteRunnable fsa; + + private PrimitiveLongFSTOutputs fstOutputs = PrimitiveLongFSTOutputs.getSingleton(); + + private final TermDataReaderProvider.TermDataReader termDataReader; + + private Frame[] stack; + + private int level; + + private boolean pending; + + private final class Frame { + /* fst stats */ + PrimitiveLongArc fstArc; + long output; + /* automaton stats */ + int fsaState; + + Frame() { + this.fstArc = new PrimitiveLongArc(); + this.fsaState = -1; + } + + @Override + public String toString() { + return "arc=" + fstArc + " state=" + fsaState; + } + } + + /** + * Inspired by {@link org.apache.lucene.codecs.memory.FSTTermsReader}'s IntersectTermsEnum + */ + RandomAccessIntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) + throws IOException { + termState = (IntBlockTermState) lucene99PostingsReader.newTermState(); + fst = termsDict.termsIndex().primitiveLongFST(); + 
fstReader = fst.getBytesReader(); + fsa = compiled.getByteRunnable(); + termDataReader = termsDict.termDataReaderProvider().newReader(); + + stack = new Frame[16]; + for (int i = 0; i < stack.length; i++) { + this.stack[i] = new Frame(); + } + loadVirtualFrame(newFrame()); + level = 0; + + pushFrame(loadFirstFrame(newFrame())); + if (startTerm == null) { + pending = isAccept(topFrame()); + } else { + doSeekCeil(startTerm); + pending = + (term == null || !startTerm.equals(term.get())) + && isValid(topFrame()) + && isAccept(topFrame()); + } + } + + @Override + public BytesRef next() throws IOException { + if (pending) { + pending = false; + updateTermStateIfNeeded(); + return term(); + } + isTermStateCurrent = false; + DFS: + while (level > 0) { + Frame frame = newFrame(); + if (loadExpandFrame(topFrame(), frame) != null) { // has valid target + pushFrame(frame); + if (isAccept(frame)) { // gotcha + break; + } + continue; // check next target + } + frame = popFrame(); + while (level > 0) { + if (loadNextFrame(topFrame(), frame) != null) { // has valid sibling + pushFrame(frame); + if (isAccept(frame)) { // gotcha + break DFS; + } + continue DFS; // check next target + } + frame = popFrame(); + } + return null; + } + if (term != null) { + updateTermStateIfNeeded(); + } + return term(); + } + + private long accumulateOutput() { + long output = 0; + int upto = 0; + Frame last, next; + last = stack[1]; + while (upto != level) { + upto++; + next = stack[upto]; + output = fstOutputs.add(next.output, output); + last = next; + } + if (last.fstArc.isFinal()) { + output = fstOutputs.add(output, last.fstArc.nextFinalOutput()); + } + return output; + } + + private BytesRef doSeekCeil(BytesRef target) throws IOException { + Frame frame = null; + int label, upto = 0, limit = target.length; + while (upto < limit) { // to target prefix, or ceil label (rewind prefix) + frame = newFrame(); + label = target.bytes[target.offset + upto] & 0xff; + frame = loadCeilFrame(label, topFrame(), 
frame); + if (frame == null || frame.fstArc.label() != label) { + break; + } + assert isValid(frame); // target must be fetched from automaton + pushFrame(frame); + upto++; + } + if (upto == limit) { // got target + return term(); + } + if (frame != null) { // got larger term('s prefix) + pushFrame(frame); + return isAccept(frame) ? term() : next(); + } + while (level > 0) { // got target's prefix, advance to larger term + frame = popFrame(); + while (level > 0 && !canRewind(frame)) { + frame = popFrame(); + } + if (loadNextFrame(topFrame(), frame) != null) { + pushFrame(frame); + return isAccept(frame) ? term() : next(); + } + } + return null; + } + + /** Load frame for target arc(node) on fst */ + Frame loadExpandFrame(Frame top, Frame frame) throws IOException { + if (!canGrow(top)) { + return null; + } + frame.fstArc = fst.readFirstRealTargetArc(top.fstArc.target(), frame.fstArc, fstReader); + frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label()); + // if (TEST) System.out.println(" loadExpand frame="+frame); + if (frame.fsaState == -1) { + return loadNextFrame(top, frame); + } + frame.output = frame.fstArc.output(); + return frame; + } + + Frame loadCeilFrame(int label, Frame top, Frame frame) throws IOException { + PrimitiveLongArc arc = frame.fstArc; + arc = Util.readCeilArc(label, fst, top.fstArc, arc, fstReader); + if (arc == null) { + return null; + } + frame.fsaState = fsa.step(top.fsaState, arc.label()); + if (frame.fsaState == -1) { + return loadNextFrame(top, frame); + } + frame.output = frame.fstArc.output(); + return frame; + } + + /** Load frame for sibling arc(node) on fst */ + Frame loadNextFrame(Frame top, Frame frame) throws IOException { + if (!canRewind(frame)) { + return null; + } + while (!frame.fstArc.isLast()) { + frame.fstArc = fst.readNextRealArc(frame.fstArc, fstReader); + frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label()); + if (frame.fsaState != -1) { + break; + } + } + if (frame.fsaState == -1) { + return null; + 
} + frame.output = frame.fstArc.output(); + return frame; + } + + void updateTermStateIfNeeded() throws IOException { + if (!isTermStateCurrent) { + long fstOutput = accumulateOutput(); + TermsIndex.TypeAndOrd typeAndOrd = TermsIndex.decodeLong(fstOutput); + termState = + termDataReader.getTermState( + typeAndOrd.termType(), typeAndOrd.ord(), fieldInfo.getIndexOptions()); + isTermStateCurrent = true; + } + } + + @Override + public AttributeSource attributes() { + if (attrs == null) { + attrs = new AttributeSource(); + } + return attrs; + } + + @Override + public boolean seekExact(BytesRef text) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public BytesRef term() throws IOException { + return term == null ? null : term.get(); + } + + @Override + public int docFreq() throws IOException { + updateTermStateIfNeeded(); + return termState.docFreq; + } + + @Override + public long totalTermFreq() throws IOException { + updateTermStateIfNeeded(); + return termState.totalTermFreq; + } + + @Override + public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { + updateTermStateIfNeeded(); + return lucene99PostingsReader.postings(fieldInfo, termState, reuse, flags); + } + + @Override + public ImpactsEnum impacts(int flags) throws IOException { + updateTermStateIfNeeded(); + return lucene99PostingsReader.impacts(fieldInfo, termState, flags); + } + + @Override + public TermState termState() throws IOException { + updateTermStateIfNeeded(); + return termState.clone(); + } + + /** Virtual frame, never pop */ + Frame loadVirtualFrame(Frame frame) { + frame.output = fstOutputs.getNoOutput(); + frame.fsaState = -1; + return frame; + } + + Frame newFrame() { + if (level + 1 == stack.length) { + final Frame[] temp = + new Frame[ArrayUtil.oversize(level + 2, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(stack, 0, temp, 0, stack.length); + for (int i = stack.length; i < temp.length; i++) { + temp[i] = new 
Frame(); + } + stack = temp; + } + return stack[level + 1]; + } + + Frame topFrame() { + return stack[level]; + } + + boolean isAccept(Frame frame) { // reach a term both fst&fsa accepts + return fsa.isAccept(frame.fsaState) && frame.fstArc.isFinal(); + } + + boolean isValid(Frame frame) { // reach a prefix both fst&fsa won't reject + return /*frame != null &&*/ frame.fsaState != -1; + } + + boolean canGrow(Frame frame) { // can walk forward on both fst&fsa + return frame.fsaState != -1 && PrimitiveLongFST.targetHasArcs(frame.fstArc); + } + + boolean canRewind(Frame frame) { // can jump to sibling + return !frame.fstArc.isLast(); + } + + void pushFrame(Frame frame) { + term = grow(frame.fstArc.label()); + level++; + } + + Frame popFrame() { + term = shrink(); + level--; + return stack[level + 1]; + } + + Frame loadFirstFrame(Frame frame) { + frame.fstArc = fst.getFirstArc(frame.fstArc); + frame.output = frame.fstArc.output(); + frame.fsaState = 0; + return frame; + } + + BytesRefBuilder grow(int label) { + if (term == null) { + term = new BytesRefBuilder(); + } else { + term.append((byte) label); + } + return term; + } + + BytesRefBuilder shrink() { + if (term.length() == 0) { + term = null; + } else { + term.setLength(term.length() - 1); + } + return term; + } + + @Override + public long ord() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void seekExact(long ord) throws IOException { + throw new UnsupportedOperationException("By ord lookup not supported."); + } + + @Override + public SeekStatus seekCeil(BytesRef text) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void seekExact(BytesRef target, TermState state) throws IOException { + throw new UnsupportedOperationException(); + } + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java index 740460679668..31c267234e69 100644 --- 
a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java @@ -32,6 +32,7 @@ import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.FST.Arc; import org.apache.lucene.util.fst.FST.BytesReader; +import org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongArc; /** * Static helper methods. @@ -896,6 +897,88 @@ public static Arc readCeilArc( } } + /** + * TODO: can we work around this??? + * + *

Same as {@link Util#readCeilArc(int, FST, Arc, Arc, BytesReader)} but for {@link + * PrimitiveLongFST} + */ + public static PrimitiveLongArc readCeilArc( + int label, + PrimitiveLongFST fst, + PrimitiveLongArc follow, + PrimitiveLongArc arc, + BytesReader in) + throws IOException { + if (label == PrimitiveLongFST.END_LABEL) { + return PrimitiveLongFST.readEndArc(follow, arc); + } + if (!PrimitiveLongFST.targetHasArcs(follow)) { + return null; + } + fst.readFirstTargetArc(follow, arc, in); + if (arc.bytesPerArc() != 0 && arc.label() != PrimitiveLongFST.END_LABEL) { + if (arc.nodeFlags() == PrimitiveLongFST.ARCS_FOR_DIRECT_ADDRESSING) { + // Fixed length arcs in a direct addressing node. + int targetIndex = label - arc.label(); + if (targetIndex >= arc.numArcs()) { + return null; + } else if (targetIndex < 0) { + return arc; + } else { + if (PrimitiveLongArc.BitTable.isBitSet(targetIndex, arc, in)) { + fst.readArcByDirectAddressing(arc, in, targetIndex); + assert arc.label() == label; + } else { + int ceilIndex = PrimitiveLongArc.BitTable.nextBitSet(targetIndex, arc, in); + assert ceilIndex != -1; + fst.readArcByDirectAddressing(arc, in, ceilIndex); + assert arc.label() > label; + } + return arc; + } + } else if (arc.nodeFlags() == PrimitiveLongFST.ARCS_FOR_CONTINUOUS) { + int targetIndex = label - arc.label(); + if (targetIndex >= arc.numArcs()) { + return null; + } else if (targetIndex < 0) { + return arc; + } else { + fst.readArcByContinuous(arc, in, targetIndex); + assert arc.label() == label; + return arc; + } + } + // Fixed length arcs in a binary search node. + int idx = binarySearch(fst, arc, label); + if (idx >= 0) { + return fst.readArcByIndex(arc, in, idx); + } + idx = -1 - idx; + if (idx == arc.numArcs()) { + // DEAD END! + return null; + } + return fst.readArcByIndex(arc, in, idx); + } + + // Variable length arcs in a linear scan list, + // or special arc with label == FST.END_LABEL. 
+ fst.readFirstRealTargetArc(follow.target(), arc, in); + + while (true) { + // System.out.println(" non-bs cycle"); + if (arc.label() >= label) { + // System.out.println(" found!"); + return arc; + } else if (arc.isLast()) { + return null; + } else { + fst.readNextRealArc(arc, in); + } + } + } + /** * Perform a binary search of Arcs encoded as a packed array * @@ -936,8 +1019,7 @@ static int binarySearch(FST fst, FST.Arc arc, int targetLabel) throws } /** Same as {@link Util#binarySearch(FST, Arc, int)} but for {@link PrimitiveLongFST} */ - static int binarySearch( - PrimitiveLongFST fst, PrimitiveLongFST.PrimitiveLongArc arc, int targetLabel) + static int binarySearch(PrimitiveLongFST fst, PrimitiveLongArc arc, int targetLabel) throws IOException { assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH : "Arc is not encoded as packed array for binary search (nodeFlags=" From 92392471aebd0ea14318f5ce23cde996da47e533 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Tue, 28 Nov 2023 22:15:00 -0800 Subject: [PATCH 54/57] Lazy decode termstate in IntersectEnum --- .../sandbox/codecs/lucene99/randomaccess/TermsImpl.java | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java index 9ddb4a9c8d77..29567d83c8af 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -334,7 +334,6 @@ && isValid(topFrame()) public BytesRef next() throws IOException { if (pending) { pending = false; - updateTermStateIfNeeded(); return term(); } isTermStateCurrent = false; @@ -361,9 +360,6 @@ public BytesRef next() throws IOException { } return null; } - if (term != null) { - updateTermStateIfNeeded(); - } return term(); } From 05743d910ef7437b06de5bf03b85f9b8f2859c5b 
Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Fri, 8 Dec 2023 23:58:17 -0800 Subject: [PATCH 55/57] Minor non-functionarly change for TermsIndexBuilder --- .../codecs/lucene99/randomaccess/TermsIndexBuilder.java | 2 +- .../lucene99/randomaccess/TestTermsIndexBuilder.java | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java index 35dd42e81cd5..68bf66a3cbec 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java @@ -52,7 +52,7 @@ public TermsIndex build() throws IOException { return new TermsIndex(fstCompiler.compile()); } - private long encode(long ord, TermType termType) { + static long encode(long ord, TermType termType) { // use a single long to encode `ord` and `termType` // also consider the special value of `PositiveIntOutputs.NO_OUTPUT == 0` // so it looks like this |... ord ...| termType| ... 
hasOutput ...| diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java index 9528dcd69b0d..1dad8688fc41 100644 --- a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java @@ -29,7 +29,7 @@ public class TestTermsIndexBuilder extends LuceneTestCase { public void testBasics() throws IOException { - String[] test_terms = { + String[] termTerms = { "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", }; @@ -37,7 +37,7 @@ public void testBasics() throws IOException { Map termsToOrd = new HashMap<>(); Map typeCounters = new HashMap<>(); - for (String term : test_terms) { + for (String term : termTerms) { int termType = random().nextInt(TermType.NUM_TOTAL_TYPES); termsToType.put(term, termType); int ord = typeCounters.getOrDefault(termType, -1) + 1; @@ -46,7 +46,7 @@ public void testBasics() throws IOException { } TermsIndexBuilder builder = new TermsIndexBuilder(); - for (String term : test_terms) { + for (String term : termTerms) { BytesRef termBytes = new BytesRef(term); builder.addTerm(termBytes, TermType.fromId(termsToType.get(term))); } @@ -63,7 +63,7 @@ public void testBasics() throws IOException { TermsIndexPrimitive.deserialize( new ByteArrayDataInput(metaBytes), new ByteArrayDataInput(dataBytes), false); - for (String term : test_terms) { + for (String term : termTerms) { BytesRef termBytes = new BytesRef(term); TermsIndex.TypeAndOrd typeAndOrd = termsIndexPrimitive.getTerm(termBytes); From 93ed998638f2097076a36ee8ef58f08aca5c15b9 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Sat, 9 Dec 2023 00:00:37 -0800 Subject: [PATCH 56/57] implement FST + FSA intersection that leverages fast addressing of arc/transitions FST nodes have differetn 
variant. For non-variable length encoded node we can more efficiently lookup for a target label. Similarly, for FSAs the TransitionAccessor allows access to a list of [min, max] ranges in order, on which we can perform binary-search to advance to applicable transitions for a given target --- .../lucene99/randomaccess/TermsImpl.java | 283 +------------ .../util/automaton/NFARunAutomaton.java | 1 + .../lucene/util/fst/PrimitiveLongFSTEnum.java | 2 - .../fst/PrimitiveLongFSTIntersectEnum.java | 374 ++++++++++++++++++ .../TestPrimitiveLongFSTIntersectEnum.java | 309 +++++++++++++++ 5 files changed, 700 insertions(+), 269 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTIntersectEnum.java create mode 100644 lucene/core/src/test/org/apache/lucene/util/fst/TestPrimitiveLongFSTIntersectEnum.java diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java index 29567d83c8af..aebcea20856c 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -27,19 +27,13 @@ import org.apache.lucene.index.TermState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefBuilder; -import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.automaton.ByteRunnable; import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.TransitionAccessor; import org.apache.lucene.util.fst.BytesRefPrimitiveLongFSTEnum; -import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PrimitiveLongFST; -import 
org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongArc; -import org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongFSTOutputs; -import org.apache.lucene.util.fst.Util; +import org.apache.lucene.util.fst.PrimitiveLongFSTIntersectEnum; final class TermsImpl extends Terms { private final FieldInfo fieldInfo; @@ -260,211 +254,42 @@ public void seekExact(long ord) throws IOException { final class RandomAccessIntersectTermsEnum extends TermsEnum { private AttributeSource attrs; - private BytesRefBuilder term; - private boolean isTermStateCurrent; private IntBlockTermState termState; - private final PrimitiveLongFST fst; - - private final FST.BytesReader fstReader; - - private final ByteRunnable fsa; + private BytesRef term; - private PrimitiveLongFSTOutputs fstOutputs = PrimitiveLongFSTOutputs.getSingleton(); + private final PrimitiveLongFST fst; private final TermDataReaderProvider.TermDataReader termDataReader; - private Frame[] stack; - - private int level; - - private boolean pending; - - private final class Frame { - /* fst stats */ - PrimitiveLongArc fstArc; - long output; - /* automaton stats */ - int fsaState; - - Frame() { - this.fstArc = new PrimitiveLongArc(); - this.fsaState = -1; - } + private final PrimitiveLongFSTIntersectEnum fstFsaIntersectEnum; - @Override - public String toString() { - return "arc=" + fstArc + " state=" + fsaState; - } - } - - /** - * Inspired by {@link org.apache.lucene.codecs.memory.FSTTermsReader}'s IntersectTermsEnum - */ RandomAccessIntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + TransitionAccessor transitionAccessor = compiled.getTransitionAccessor(); + // assert transitionAccessor.getNumTransitions(0) == 1; termState = (IntBlockTermState) lucene99PostingsReader.newTermState(); fst = termsDict.termsIndex().primitiveLongFST(); - fstReader = fst.getBytesReader(); - fsa = compiled.getByteRunnable(); termDataReader = termsDict.termDataReaderProvider().newReader(); - - stack = new 
Frame[16]; - for (int i = 0; i < stack.length; i++) { - this.stack[i] = new Frame(); - } - loadVirtualFrame(newFrame()); - level = 0; - - pushFrame(loadFirstFrame(newFrame())); - if (startTerm == null) { - pending = isAccept(topFrame()); - } else { - doSeekCeil(startTerm); - pending = - (term == null || !startTerm.equals(term.get())) - && isValid(topFrame()) - && isAccept(topFrame()); - } + fstFsaIntersectEnum = new PrimitiveLongFSTIntersectEnum(fst, compiled, startTerm); } @Override public BytesRef next() throws IOException { - if (pending) { - pending = false; - return term(); - } - isTermStateCurrent = false; - DFS: - while (level > 0) { - Frame frame = newFrame(); - if (loadExpandFrame(topFrame(), frame) != null) { // has valid target - pushFrame(frame); - if (isAccept(frame)) { // gotcha - break; - } - continue; // check next target - } - frame = popFrame(); - while (level > 0) { - if (loadNextFrame(topFrame(), frame) != null) { // has valid sibling - pushFrame(frame); - if (isAccept(frame)) { // gotcha - break DFS; - } - continue DFS; // check next target - } - frame = popFrame(); - } - return null; - } - return term(); - } - - private long accumulateOutput() { - long output = 0; - int upto = 0; - Frame last, next; - last = stack[1]; - while (upto != level) { - upto++; - next = stack[upto]; - output = fstOutputs.add(next.output, output); - last = next; - } - if (last.fstArc.isFinal()) { - output = fstOutputs.add(output, last.fstArc.nextFinalOutput()); - } - return output; - } - - private BytesRef doSeekCeil(BytesRef target) throws IOException { - Frame frame = null; - int label, upto = 0, limit = target.length; - while (upto < limit) { // to target prefix, or ceil label (rewind prefix) - frame = newFrame(); - label = target.bytes[target.offset + upto] & 0xff; - frame = loadCeilFrame(label, topFrame(), frame); - if (frame == null || frame.fstArc.label() != label) { - break; - } - assert isValid(frame); // target must be fetched from automaton - 
pushFrame(frame); - upto++; - } - if (upto == limit) { // got target - return term(); - } - if (frame != null) { // got larger term('s prefix) - pushFrame(frame); - return isAccept(frame) ? term() : next(); - } - while (level > 0) { // got target's prefix, advance to larger term - frame = popFrame(); - while (level > 0 && !canRewind(frame)) { - frame = popFrame(); - } - if (loadNextFrame(topFrame(), frame) != null) { - pushFrame(frame); - return isAccept(frame) ? term() : next(); - } - } - return null; - } - - /** Load frame for target arc(node) on fst */ - Frame loadExpandFrame(Frame top, Frame frame) throws IOException { - if (!canGrow(top)) { - return null; - } - frame.fstArc = fst.readFirstRealTargetArc(top.fstArc.target(), frame.fstArc, fstReader); - frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label()); - // if (TEST) System.out.println(" loadExpand frame="+frame); - if (frame.fsaState == -1) { - return loadNextFrame(top, frame); - } - frame.output = frame.fstArc.output(); - return frame; - } - - Frame loadCeilFrame(int label, Frame top, Frame frame) throws IOException { - PrimitiveLongArc arc = frame.fstArc; - arc = Util.readCeilArc(label, fst, top.fstArc, arc, fstReader); - if (arc == null) { - return null; - } - frame.fsaState = fsa.step(top.fsaState, arc.label()); - if (frame.fsaState == -1) { - return loadNextFrame(top, frame); - } - frame.output = frame.fstArc.output(); - return frame; - } - - /** Load frame for sibling arc(node) on fst */ - Frame loadNextFrame(Frame top, Frame frame) throws IOException { - if (!canRewind(frame)) { - return null; - } - while (!frame.fstArc.isLast()) { - frame.fstArc = fst.readNextRealArc(frame.fstArc, fstReader); - frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label()); - if (frame.fsaState != -1) { - break; - } - } - if (frame.fsaState == -1) { - return null; + if (fstFsaIntersectEnum.next()) { + term = fstFsaIntersectEnum.getTerm(); + isTermStateCurrent = false; + } else { + term = null; } - 
frame.output = frame.fstArc.output(); - return frame; + return term; } void updateTermStateIfNeeded() throws IOException { if (!isTermStateCurrent) { - long fstOutput = accumulateOutput(); + long fstOutput = fstFsaIntersectEnum.getFSTOutput(); TermsIndex.TypeAndOrd typeAndOrd = TermsIndex.decodeLong(fstOutput); termState = termDataReader.getTermState( @@ -488,7 +313,7 @@ public boolean seekExact(BytesRef text) throws IOException { @Override public BytesRef term() throws IOException { - return term == null ? null : term.get(); + return term; } @Override @@ -521,82 +346,6 @@ public TermState termState() throws IOException { return termState.clone(); } - /** Virtual frame, never pop */ - Frame loadVirtualFrame(Frame frame) { - frame.output = fstOutputs.getNoOutput(); - frame.fsaState = -1; - return frame; - } - - Frame newFrame() { - if (level + 1 == stack.length) { - final Frame[] temp = - new Frame[ArrayUtil.oversize(level + 2, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; - System.arraycopy(stack, 0, temp, 0, stack.length); - for (int i = stack.length; i < temp.length; i++) { - temp[i] = new Frame(); - } - stack = temp; - } - return stack[level + 1]; - } - - Frame topFrame() { - return stack[level]; - } - - boolean isAccept(Frame frame) { // reach a term both fst&fsa accepts - return fsa.isAccept(frame.fsaState) && frame.fstArc.isFinal(); - } - - boolean isValid(Frame frame) { // reach a prefix both fst&fsa won't reject - return /*frame != null &&*/ frame.fsaState != -1; - } - - boolean canGrow(Frame frame) { // can walk forward on both fst&fsa - return frame.fsaState != -1 && PrimitiveLongFST.targetHasArcs(frame.fstArc); - } - - boolean canRewind(Frame frame) { // can jump to sibling - return !frame.fstArc.isLast(); - } - - void pushFrame(Frame frame) { - term = grow(frame.fstArc.label()); - level++; - } - - Frame popFrame() { - term = shrink(); - level--; - return stack[level + 1]; - } - - Frame loadFirstFrame(Frame frame) { - frame.fstArc = 
fst.getFirstArc(frame.fstArc); - frame.output = frame.fstArc.output(); - frame.fsaState = 0; - return frame; - } - - BytesRefBuilder grow(int label) { - if (term == null) { - term = new BytesRefBuilder(); - } else { - term.append((byte) label); - } - return term; - } - - BytesRefBuilder shrink() { - if (term.length() == 0) { - term = null; - } else { - term.setLength(term.length() - 1); - } - return term; - } - @Override public long ord() throws IOException { throw new UnsupportedOperationException(); diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java index 6ff52baebbc5..761cf9b77035 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java @@ -228,6 +228,7 @@ public void getTransition(int state, int index, Transition t) { } else { t.max = points[t.transitionUpto + 1] - 1; } + t.dest = dStates[t.source].transitions[t.transitionUpto]; } private class DState { diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTEnum.java b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTEnum.java index 85c0815f964a..b2fa07b23617 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTEnum.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTEnum.java @@ -689,7 +689,6 @@ private void incr() { upto++; grow(); if (arcs.length <= upto) { - @SuppressWarnings({"rawtypes", "unchecked"}) final PrimitiveLongArc[] newArcs = new PrimitiveLongArc [ArrayUtil.oversize(1 + upto, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; @@ -697,7 +696,6 @@ private void incr() { arcs = newArcs; } if (output.length <= upto) { - @SuppressWarnings({"rawtypes", "unchecked"}) final long[] newOutput = new long[ArrayUtil.oversize(1 + upto, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; System.arraycopy(output, 0, newOutput, 
0, output.length); diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTIntersectEnum.java b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTIntersectEnum.java new file mode 100644 index 000000000000..fb4bf16775fd --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTIntersectEnum.java @@ -0,0 +1,374 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util.fst; + +import java.io.IOException; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.automaton.ByteRunnable; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.Transition; +import org.apache.lucene.util.automaton.TransitionAccessor; +import org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongArc; + +/** + * Can next() through the terms defined by the intersection of a {@link PrimitiveLongFST} + * + *

and {@link org.apache.lucene.util.automaton.CompiledAutomaton}. + * + *

Note: this can only seek forward. + * + * @lucene.experimental + */ +public final class PrimitiveLongFSTIntersectEnum { + + private final PrimitiveLongFST fst; + + private final FST.BytesReader fstBytesReader; + + private final ByteRunnable byteRunnable; + + private final TransitionAccessor transitionAccessor; + + /** DFS traversal states */ + private int currentLevel; + + private Frame[] stack; + + private BytesRefBuilder term = new BytesRefBuilder(); + + private long fstOutput; + + boolean pending; + + boolean isEmptyValidOutput; + + public PrimitiveLongFSTIntersectEnum( + PrimitiveLongFST fst, CompiledAutomaton automaton, BytesRef startTerm) throws IOException { + this.fst = fst; + this.fstBytesReader = fst.getBytesReader(); + this.byteRunnable = automaton.getByteRunnable(); + this.transitionAccessor = automaton.getTransitionAccessor(); + this.stack = new Frame[16]; + + var firstFrame = new Frame(); + firstFrame.fstNode = new PrimitiveLongArc(); + fst.getFirstArc(firstFrame.fstNode); + firstFrame.fsaState = 0; + stack[0] = firstFrame; + + if (startTerm != null) { + seekToStartTerm(startTerm); + } else { + isEmptyValidOutput = isAccept(firstFrame.fstNode, firstFrame.fsaState); + } + } + + public boolean next() throws IOException { + if (isEmptyValidOutput) { + fstOutput = fst.getEmptyOutput(); + isEmptyValidOutput = false; + return true; + } + while (currentLevel >= 0) { + Frame currentFrame = stack[currentLevel]; + + if (hasDescendants(currentFrame.fstNode, currentFrame.fsaState) + || currentFrame.fstCandidateNode != null) { + // current frame has candidates + if (findNextIntersection(currentFrame)) { + term.grow(currentLevel + 1); + term.setByteAt(currentLevel, (byte) currentFrame.fstCandidateNode.label()); + term.setLength(currentLevel + 1); + // early prune - only push a new frame when the candidate has descendants + if (hasDescendants(currentFrame.fstCandidateNode, currentFrame.fsaTransition.dest)) { + Frame nextFrame = new Frame(); + nextFrame.fstNode = 
currentFrame.fstCandidateNode; + nextFrame.fsaState = currentFrame.fsaTransition.dest; + nextFrame.output = currentFrame.output + currentFrame.fstNode.output(); + ensureStackCapacity(); + stack[++currentLevel] = nextFrame; + } + // setup output + if (isAccept(currentFrame.fstCandidateNode, currentFrame.fsaTransition.dest)) { + fstOutput = + currentFrame.output // output before this node + + currentFrame.fstNode.output() // output of this node + // then output of the candidate + + currentFrame.fstCandidateNode.output() + + currentFrame.fstCandidateNode.nextFinalOutput(); + return true; + } + } else { + // no more intersection at this frame, pop frame + popFrame(); + } + } else { + // pop frame as the frame has no candidates + popFrame(); + } + } + return false; + } + + private void ensureStackCapacity() { + stack = ArrayUtil.grow(stack, currentLevel + 2); + } + + private void seekToStartTerm(BytesRef startTerm) throws IOException { + int length = startTerm.length; + + while (currentLevel < length) { + Frame currentFrame = stack[currentLevel]; + int target = startTerm.bytes[startTerm.offset + currentLevel] & 0xff; + + if (hasDescendants(currentFrame.fstNode, currentFrame.fsaState)) { + initArcAndTransition(currentFrame, false); + fstAdvanceCeil(target, currentFrame.fstCandidateNode); + fsaAdvanceCeil(currentFrame, target); + + if (currentFrame.fstCandidateNode.label() == target + && (currentFrame.fsaTransition.min <= target + && target <= currentFrame.fsaTransition.max)) { + term.append((byte) target); + Frame nextFrame = new Frame(); + nextFrame.fstNode = currentFrame.fstCandidateNode; + nextFrame.fsaState = currentFrame.fsaTransition.dest; + nextFrame.output = currentFrame.output + currentFrame.fstNode.output(); + ensureStackCapacity(); + stack[++currentLevel] = nextFrame; + continue; + } + + if (currentFrame.fstCandidateNode.label() > target + || currentFrame.fsaTransition.min > target) { + pending = true; + } + break; + } else { + // all prefix upto this level is 
match, but the term to seek is longer + break; + } + } + } + + private void popFrame() { + currentLevel--; + term.setLength(currentLevel); + } + + private boolean isAccept(PrimitiveLongArc fstNode, int fsaState) { + return byteRunnable.isAccept(fsaState) && fstNode.isFinal(); + } + + private boolean hasDescendants(PrimitiveLongArc fstNode, int fsaState) { + return transitionAccessor.getNumTransitions(fsaState) > 0 + && PrimitiveLongFST.targetHasArcs(fstNode); + } + + private void initArcAndTransition(Frame frame, boolean advanceToFirstTransition) + throws IOException { + frame.fstCandidateNode = new PrimitiveLongArc(); + fst.readFirstRealTargetArc(frame.fstNode.target(), frame.fstCandidateNode, fstBytesReader); + + frame.fsaTransition = new Transition(); + frame.numTransitions = transitionAccessor.initTransition(frame.fsaState, frame.fsaTransition); + if (advanceToFirstTransition) { + transitionAccessor.getNextTransition(frame.fsaTransition); + frame.transitionUpto++; + } + } + + private boolean findNextIntersection(Frame frame) throws IOException { + if (frame.fstCandidateNode == null) { + // when called first time, init first FST arc and the FSA transition + initArcAndTransition(frame, true); + } else if (pending) { + pending = false; + } else { + // subsequent call, which implies we previously found an intersection. + // we need to advance the FST to avoid returning the same state. 
+ // Advance FST not the FSA because FST arc has a single label, + // where FSA transition may accept a range of lables + if (frame.fstCandidateNode.isLast()) { + return false; + } + frame.fstCandidateNode = fst.readNextRealArc(frame.fstCandidateNode, fstBytesReader); + } + + while (true) { + if (frame.fstCandidateNode.label() < frame.fsaTransition.min) { + // advance FST + if (frame.fstCandidateNode.isLast()) { + // no more eligible FST arc at this level + return false; + } + // TODO: advance to first arc that has label >= fsaTransition.min + // frame.fstCandidateNode = + // fst.readNextRealArc(frame.fstCandidateNode, fstBytesReader); + if (fstAdvanceCeil(frame.fsaTransition.min, frame.fstCandidateNode) == false) { + return false; + } + } else if (frame.fstCandidateNode.label() > frame.fsaTransition.max) { + // advance FSA + if (frame.transitionUpto == frame.numTransitions) { + // no more eligible FSA transitions at this level + return false; + } + // TODO: advance FSA with binary search to fstNode.label() + // transitionAccessor.getNextTransition(frame.fsaTransition); + // frame.transitionUpto++; + fsaAdvanceCeil(frame, frame.fstCandidateNode.label()); + } else { + // can go deeper + return true; + } + } + } + + public BytesRef getTerm() { + return term.get(); + } + + public long getFSTOutput() { + return fstOutput; + } + + /** + * Advance to the arc whose label is greater or equal to the provided target. + * + * @return true, if found. 
+ */ + private boolean fstAdvanceCeil(int target, PrimitiveLongArc /* mutates */ arc) + throws IOException { + if (arc.bytesPerArc() != 0 && arc.label() != PrimitiveLongFST.END_LABEL) { + if (arc.nodeFlags() == PrimitiveLongFST.ARCS_FOR_CONTINUOUS) { + int targetIndex = target - arc.label() + arc.arcIdx(); + if (targetIndex < 0) { + return false; + } else if (targetIndex >= arc.numArcs()) { + fst.readArcByContinuous(arc, fstBytesReader, arc.numArcs() - 1); + return false; + } else { + fst.readArcByContinuous(arc, fstBytesReader, targetIndex); + return true; + } + } else if (arc.nodeFlags() == PrimitiveLongFST.ARCS_FOR_DIRECT_ADDRESSING) { + // Fixed length arcs in a direct addressing node. + int targetIndex = target - arc.label() + arc.arcIdx(); + if (targetIndex >= arc.numArcs() || targetIndex < 0) { + return false; + } else if (targetIndex >= arc.numArcs()) { + fst.readArcByDirectAddressing(arc, fstBytesReader, arc.numArcs() - 1); + return false; + } else { + if (PrimitiveLongArc.BitTable.isBitSet(targetIndex, arc, fstBytesReader)) { + fst.readArcByDirectAddressing(arc, fstBytesReader, targetIndex); + } else { + int ceilIndex = PrimitiveLongArc.BitTable.nextBitSet(targetIndex, arc, fstBytesReader); + if (ceilIndex == -1) { + return false; + } + fst.readArcByDirectAddressing(arc, fstBytesReader, ceilIndex); + } + return true; + } + } + // Fixed length arcs in a binary search node. + int idx = Util.binarySearch(fst, arc, target); + if (idx >= 0) { + fst.readArcByIndex(arc, fstBytesReader, idx); + return true; + } + idx = -1 - idx; + if (idx == arc.numArcs()) { + fst.readArcByIndex(arc, fstBytesReader, arc.numArcs() - 1); + // DEAD END! + return false; + } + fst.readArcByIndex(arc, fstBytesReader, idx); + return true; + } + + // Variable length arcs in a linear scan list, + // or special arc with label == FST.END_LABEL. 
+ while (true) { + if (arc.label() >= target) { + return true; + } else if (arc.isLast()) { + return false; + } else { + fst.readNextRealArc(arc, fstBytesReader); + } + } + } + + private void fsaAdvanceCeil(Frame frame, int target) { + int low = frame.transitionUpto; + int high = frame.numTransitions; + Transition t = frame.fsaTransition; + + // invariant: target is between the min of [low, high) + int mid = 0; + while (high - low > 1) { + mid = (high + low) >>> 1; + transitionAccessor.getTransition(frame.fsaState, mid, t); + if (t.min > target) { + high = mid; + } else if (t.min < target) { + low = mid; + } else { + frame.transitionUpto = mid + 1; + return; + } + } + transitionAccessor.getTransition(frame.fsaState, low, t); + frame.transitionUpto = low + 1; + } + + private boolean fsaAdvanceCeilSlow(Frame frame, int target) { + while (frame.transitionUpto < frame.numTransitions) { + transitionAccessor.getNextTransition(frame.fsaTransition); + frame.transitionUpto++; + if (target <= frame.fsaTransition.max) { + return frame.fsaTransition.min <= target; + } + } + return false; + } + + /** + * We will maintain the state of conventional recursive DFS traversal algorithm, which is stack of + * frames. This class capture the state at each level. + */ + static final class Frame { + PrimitiveLongArc fstNode; + + PrimitiveLongArc fstCandidateNode; + + int fsaState; + + long output; + + Transition fsaTransition; + + int transitionUpto; + + int numTransitions; + } +} diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestPrimitiveLongFSTIntersectEnum.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestPrimitiveLongFSTIntersectEnum.java new file mode 100644 index 000000000000..a07e7bfae5e6 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestPrimitiveLongFSTIntersectEnum.java @@ -0,0 +1,309 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.util.fst; + +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRefBuilder; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.ByteRunnable; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.RegExp; +import org.apache.lucene.util.automaton.Transition; +import org.apache.lucene.util.automaton.TransitionAccessor; + +public class TestPrimitiveLongFSTIntersectEnum extends LuceneTestCase { + + public void testBasics() throws IOException { + String[] testTerms = { + "!", "*", "+", "++", "+++b", "++c", "a", "b", "bb", "dd", + }; + + HashMap termOutputs = new HashMap<>(); + + IntsRefBuilder scratchInts = new IntsRefBuilder(); + FSTCompiler fstCompiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()).build(); + + for (var term : testTerms) { + long output = random().nextLong(1, 1024); + 
termOutputs.put(term, output); + fstCompiler.add(Util.toIntsRef(new BytesRef(term), scratchInts), output); + // System.out.println(term + ": " + output); + } + + var boxedFst = fstCompiler.compile(); + + byte[] metaBytes = new byte[4096]; + byte[] dataBytes = new byte[4096]; + DataOutput metaOut = new ByteArrayDataOutput(metaBytes); + DataOutput dataOutput = new ByteArrayDataOutput(dataBytes); + + boxedFst.save(metaOut, dataOutput); + + PrimitiveLongFST primitiveLongFst = + new PrimitiveLongFST( + PrimitiveLongFST.readMetadata( + new ByteArrayDataInput(metaBytes), + PrimitiveLongFST.PrimitiveLongFSTOutputs.getSingleton()), + new ByteArrayDataInput(dataBytes)); + + // RegExp regExp = new RegExp("a([a-f]|[j-z])c", RegExp.NONE); + RegExp regExp = new RegExp("+*.", RegExp.NONE); + Automaton a = regExp.toAutomaton(); + CompiledAutomaton compiledAutomaton = new CompiledAutomaton(a); + + var byteRunnable = compiledAutomaton.getByteRunnable(); + var transitionAccessor = compiledAutomaton.getTransitionAccessor(); + // dfsAutomaton(byteRunnable, transitionAccessor, 0, ""); + + PrimitiveLongFST.PrimitiveLongArc firstArc = new PrimitiveLongFST.PrimitiveLongArc(); + System.out.println("---- recursive algo ----"); + dfsIntersectFsaFst( + primitiveLongFst, + primitiveLongFst.getBytesReader(), + primitiveLongFst.getFirstArc(firstArc), + "", + 0, + byteRunnable, + transitionAccessor, + 0); + + System.out.println("---- non-recursive algo ----"); + var intersectEnum = + new PrimitiveLongFSTIntersectEnum(primitiveLongFst, compiledAutomaton, null); + while (intersectEnum.next()) { + String term = intersectEnum.getTerm().utf8ToString(); + long actualOutput = intersectEnum.getFSTOutput(); + System.out.println( + term + " expected output:" + termOutputs.get(term) + " actual: " + actualOutput); + } + } + + void dfs( + PrimitiveLongFST fst, + FST.BytesReader in, + PrimitiveLongFST.PrimitiveLongArc currentLevelNode, + String path, + long acc) + throws IOException { + if 
(currentLevelNode.isFinal()) { + long output = acc + currentLevelNode.output() + currentLevelNode.nextFinalOutput(); + System.out.println(path + (char) currentLevelNode.label() + "raw output: " + output); + } + + if (PrimitiveLongFST.targetHasArcs(currentLevelNode)) { + String pathNext = + currentLevelNode.label() > 0 ? path + (char) currentLevelNode.label() : path; + long accNext = currentLevelNode.label() > 0 ? acc + currentLevelNode.output() : acc; + var nextLevelNode = new PrimitiveLongFST.PrimitiveLongArc(); + fst.readFirstRealTargetArc(currentLevelNode.target(), nextLevelNode, in); + dfs(fst, in, nextLevelNode, pathNext, accNext); + } + + if (currentLevelNode.isLast() == false) { + fst.readNextRealArc(currentLevelNode, in); + dfs(fst, in, currentLevelNode, path, acc); + } + } + + public void testAutomaton() { + RegExp regExp = new RegExp("+*.", RegExp.NONE); + Automaton a = regExp.toAutomaton(); + CompiledAutomaton compiledAutomaton = new CompiledAutomaton(a); + System.out.println("isFinite: " + compiledAutomaton.finite); + + var byteRunnable = compiledAutomaton.getByteRunnable(); + var transitionAccessor = compiledAutomaton.getTransitionAccessor(); + // dfsAutomaton(byteRunnable, transitionAccessor, 0, ""); + // dumpTransitionsViaNext(byteRunnable, transitionAccessor, 0, new HashSet<>()); + dumpTransitionsViaRA(byteRunnable, transitionAccessor, 0, new HashSet<>()); + } + + void dfsAutomaton( + ByteRunnable a, TransitionAccessor transitionAccessor, int currentLevelState, String path) { + if (a.isAccept(currentLevelState)) { + if (path.length() > 50) { + throw new RuntimeException(); + } + System.out.println("found: " + path); + } + + int currentLevelSize = transitionAccessor.getNumTransitions(currentLevelState); + for (int i = 0; i < currentLevelSize; i++) { + Transition t = new Transition(); + transitionAccessor.getNextTransition(t); + System.out.println( + "At: src: " + + t.source + + " [" + + t.min + + ", " + + t.max + + "] " + + "dest: " + + t.dest + + " 
is dest accept: " + + (a.isAccept(t.dest) ? "yes" : "no")); + for (int label = t.min; label <= t.max; label++) { + dfsAutomaton(a, transitionAccessor, t.dest, path + " " + label); + } + } + } + + void dumpTransitionsViaNext( + ByteRunnable a, + TransitionAccessor transitionAccessor, + int currentState, + Set seenStates) { + if (seenStates.contains(currentState)) { + return; + } + + seenStates.add(currentState); + + var t = new Transition(); + var numStates = transitionAccessor.initTransition(currentState, t); + + for (int i = 0; i < numStates; i++) { + transitionAccessor.getNextTransition(t); + System.out.println( + "At: src: " + + t.source + + " arcIdx: " + + i + + " [" + + t.min + + ", " + + t.max + + "] " + + "dest: " + + t.dest + + " is dest accept: " + + (a.isAccept(t.dest) ? "yes" : "no")); + dumpTransitionsViaNext(a, transitionAccessor, t.dest, seenStates); + } + } + + void dumpTransitionsViaRA( + ByteRunnable a, + TransitionAccessor transitionAccessor, + int currentState, + Set seenStates) { + if (seenStates.contains(currentState)) { + return; + } + + seenStates.add(currentState); + + var t = new Transition(); + var numStates = transitionAccessor.initTransition(currentState, t); + + // transitionAccessor.getTransition(currentState, numStates - 1, t); + for (int i = 0; i < numStates; i++) { + transitionAccessor.getTransition(currentState, i, t); + System.out.println( + "At: src: " + + t.source + + " arcIdx: " + + i + + " [" + + t.min + + ", " + + t.max + + "] " + + "dest: " + + t.dest + + " is dest accept: " + + (a.isAccept(t.dest) ? 
"yes" : "no")); + dumpTransitionsViaRA(a, transitionAccessor, t.dest, seenStates); + } + } + + void dfsIntersectFsaFst( + PrimitiveLongFST fst, + FST.BytesReader in, + PrimitiveLongFST.PrimitiveLongArc fstNode, + String path, + long acc, + ByteRunnable a, + TransitionAccessor transitionAccessor, + int fsaState) + throws IOException { + + if (a.isAccept(fsaState) && fstNode.isFinal()) { + // found + System.out.println(path + ": " + (acc + fstNode.output() + fstNode.nextFinalOutput())); + } + + Transition fsaTransition = new Transition(); + int numTransitions = transitionAccessor.initTransition(fsaState, fsaTransition); + + if (numTransitions <= 0 || !PrimitiveLongFST.targetHasArcs(fstNode)) { + return; + } + + int transitionUpto = 0; + var nextLevelFstNode = new PrimitiveLongFST.PrimitiveLongArc(); + fst.readFirstRealTargetArc(fstNode.target(), nextLevelFstNode, in); + transitionAccessor.getNextTransition(fsaTransition); + transitionUpto++; + + while (true) { + if (nextLevelFstNode.label() < fsaTransition.min) { + // advance FST + if (nextLevelFstNode.isLast()) { + // no more eligible FST arc at this level + break; + } + // TODO: advance to first arc that has label >= fsaTransition.min + nextLevelFstNode = fst.readNextRealArc(nextLevelFstNode, in); + } else if (nextLevelFstNode.label() > fsaTransition.max) { + // advance FSA + if (transitionUpto == numTransitions) { + // no more eligible FSA transitions at this level + return; + } + // TODO: advance FSA with binary search to fstNode.label() + transitionAccessor.getNextTransition(fsaTransition); + transitionUpto++; + } else { + // can go deeper + String pathNext = path + (char) nextLevelFstNode.label(); + long accNext = acc + fstNode.output(); + int nextFsaState = fsaTransition.dest; + dfsIntersectFsaFst( + fst, in, nextLevelFstNode, pathNext, accNext, a, transitionAccessor, nextFsaState); + if (nextLevelFstNode.isLast()) { + // no more candidate at this prefix + return; + } else { + // TODO: advance to first arc 
that has label >= fsaTransition.min + nextLevelFstNode = fst.readNextRealArc(nextLevelFstNode, in); + } + } + } + } +} From c7e1568f791a375ffa1dfbccedddf16e53313185 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 11 Dec 2023 17:04:30 -0800 Subject: [PATCH 57/57] Reuse stack frames to avoid allocating too many Arc and Transitions --- .../fst/PrimitiveLongFSTIntersectEnum.java | 51 +++++++++++-------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTIntersectEnum.java b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTIntersectEnum.java index fb4bf16775fd..d7ca07581446 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTIntersectEnum.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTIntersectEnum.java @@ -88,8 +88,7 @@ public boolean next() throws IOException { while (currentLevel >= 0) { Frame currentFrame = stack[currentLevel]; - if (hasDescendants(currentFrame.fstNode, currentFrame.fsaState) - || currentFrame.fstCandidateNode != null) { + if (!currentFrame.isFresh || hasDescendants(currentFrame.fstNode, currentFrame.fsaState)) { // current frame has candidates if (findNextIntersection(currentFrame)) { term.grow(currentLevel + 1); @@ -97,12 +96,7 @@ public boolean next() throws IOException { term.setLength(currentLevel + 1); // early prune - only push a new frame when the candidate has descendants if (hasDescendants(currentFrame.fstCandidateNode, currentFrame.fsaTransition.dest)) { - Frame nextFrame = new Frame(); - nextFrame.fstNode = currentFrame.fstCandidateNode; - nextFrame.fsaState = currentFrame.fsaTransition.dest; - nextFrame.output = currentFrame.output + currentFrame.fstNode.output(); - ensureStackCapacity(); - stack[++currentLevel] = nextFrame; + fillNextFrame(currentFrame); } // setup output if (isAccept(currentFrame.fstCandidateNode, currentFrame.fsaTransition.dest)) { @@ -137,8 +131,10 @@ private void 
seekToStartTerm(BytesRef startTerm) throws IOException { Frame currentFrame = stack[currentLevel]; int target = startTerm.bytes[startTerm.offset + currentLevel] & 0xff; - if (hasDescendants(currentFrame.fstNode, currentFrame.fsaState)) { + if (currentFrame.numTransitions > 0 + || hasDescendants(currentFrame.fstNode, currentFrame.fsaState)) { initArcAndTransition(currentFrame, false); + currentFrame.isFresh = false; fstAdvanceCeil(target, currentFrame.fstCandidateNode); fsaAdvanceCeil(currentFrame, target); @@ -146,12 +142,7 @@ private void seekToStartTerm(BytesRef startTerm) throws IOException { && (currentFrame.fsaTransition.min <= target && target <= currentFrame.fsaTransition.max)) { term.append((byte) target); - Frame nextFrame = new Frame(); - nextFrame.fstNode = currentFrame.fstCandidateNode; - nextFrame.fsaState = currentFrame.fsaTransition.dest; - nextFrame.output = currentFrame.output + currentFrame.fstNode.output(); - ensureStackCapacity(); - stack[++currentLevel] = nextFrame; + fillNextFrame(currentFrame); continue; } @@ -167,6 +158,23 @@ private void seekToStartTerm(BytesRef startTerm) throws IOException { } } + private void fillNextFrame(Frame currentFrame) { + ensureStackCapacity(); + Frame nextFrame; + // reuse previous allocations + if (stack[currentLevel + 1] == null) { + nextFrame = new Frame(); + } else { + nextFrame = stack[currentLevel + 1]; + nextFrame.numTransitions = 0; + nextFrame.isFresh = true; + } + nextFrame.fstNode = currentFrame.fstCandidateNode; + nextFrame.fsaState = currentFrame.fsaTransition.dest; + nextFrame.output = currentFrame.output + currentFrame.fstNode.output(); + stack[++currentLevel] = nextFrame; + } + private void popFrame() { currentLevel--; term.setLength(currentLevel); @@ -183,11 +191,9 @@ private boolean hasDescendants(PrimitiveLongArc fstNode, int fsaState) { private void initArcAndTransition(Frame frame, boolean advanceToFirstTransition) throws IOException { - frame.fstCandidateNode = new PrimitiveLongArc(); 
fst.readFirstRealTargetArc(frame.fstNode.target(), frame.fstCandidateNode, fstBytesReader); - - frame.fsaTransition = new Transition(); frame.numTransitions = transitionAccessor.initTransition(frame.fsaState, frame.fsaTransition); + frame.transitionUpto = 0; if (advanceToFirstTransition) { transitionAccessor.getNextTransition(frame.fsaTransition); frame.transitionUpto++; @@ -195,9 +201,10 @@ private void initArcAndTransition(Frame frame, boolean advanceToFirstTransition) } private boolean findNextIntersection(Frame frame) throws IOException { - if (frame.fstCandidateNode == null) { + if (frame.isFresh) { // when called first time, init first FST arc and the FSA transition initArcAndTransition(frame, true); + frame.isFresh = false; } else if (pending) { pending = false; } else { @@ -359,16 +366,18 @@ private boolean fsaAdvanceCeilSlow(Frame frame, int target) { static final class Frame { PrimitiveLongArc fstNode; - PrimitiveLongArc fstCandidateNode; + PrimitiveLongArc fstCandidateNode = new PrimitiveLongArc(); int fsaState; long output; - Transition fsaTransition; + Transition fsaTransition = new Transition(); int transitionUpto; int numTransitions; + + boolean isFresh = true; } }