From af115b95744d8b0e4cef3c3c82393aa58ffe15cd Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Tue, 19 Sep 2023 11:41:28 -0700 Subject: [PATCH 01/57] Setup no-op Lucene90RandomAcessDictionaryPostingsFormat --- lucene/core/src/java/module-info.java | 1 + ...90RandomAcessDictionaryPostingsFormat.java | 81 +++++++++++++++++++ .../Lucene90RandomAccessTermsReader.java | 46 +++++++++++ .../Lucene90RandomAccessTermsWriter.java | 30 +++++++ 4 files changed, 158 insertions(+) create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90RandomAcessDictionaryPostingsFormat.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsReader.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsWriter.java diff --git a/lucene/core/src/java/module-info.java b/lucene/core/src/java/module-info.java index f5d8cd275b79..bf67d687cfb9 100644 --- a/lucene/core/src/java/module-info.java +++ b/lucene/core/src/java/module-info.java @@ -35,6 +35,7 @@ exports org.apache.lucene.codecs.lucene95; exports org.apache.lucene.codecs.lucene90.blocktree; exports org.apache.lucene.codecs.lucene90.compressing; + exports org.apache.lucene.codecs.lucene90.radomaccess; exports org.apache.lucene.codecs.perfield; exports org.apache.lucene.document; exports org.apache.lucene.geo; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90RandomAcessDictionaryPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90RandomAcessDictionaryPostingsFormat.java new file mode 100644 index 000000000000..4b70dba02d96 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90RandomAcessDictionaryPostingsFormat.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene90; + +import java.io.IOException; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.PostingsReaderBase; +import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.codecs.lucene90.radomaccess.Lucene90RandomAccessTermsReader; +import org.apache.lucene.codecs.lucene90.radomaccess.Lucene90RandomAccessTermsWriter; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.util.IOUtils; + +/** + * Similar to {@link Lucene90PostingsFormat} but with a different term dictionary implementation. 
+ * + * @lucene.experimental + */ +public final class Lucene90RandomAcessDictionaryPostingsFormat extends PostingsFormat { + + // Increment version to change it + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + /** Creates {@code Lucene90RandomAcessDictionaryPostingsFormat} */ + public Lucene90RandomAcessDictionaryPostingsFormat() { + super("Lucene90RandomAccess"); + } + + @Override + public String toString() { + return getName(); + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + PostingsWriterBase postingsWriter = new Lucene90PostingsWriter(state); + boolean success = false; + try { + FieldsConsumer ret = new Lucene90RandomAccessTermsWriter(); + success = true; + return ret; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(postingsWriter); + } + } + } + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + PostingsReaderBase postingsReader = new Lucene90PostingsReader(state); + boolean success = false; + try { + FieldsProducer ret = new Lucene90RandomAccessTermsReader(); + success = true; + return ret; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(postingsReader); + } + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsReader.java new file mode 100644 index 000000000000..7294f1ea09a9 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsReader.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.codecs.lucene90.radomaccess; + +import java.io.IOException; +import java.util.Iterator; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.index.Terms; + +public class Lucene90RandomAccessTermsReader extends FieldsProducer { + @Override + public void close() throws IOException {} + + @Override + public void checkIntegrity() throws IOException {} + + @Override + public Iterator iterator() { + return null; + } + + @Override + public Terms terms(String field) throws IOException { + return null; + } + + @Override + public int size() { + return 0; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsWriter.java new file mode 100644 index 000000000000..4fe720907b38 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsWriter.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene90.radomaccess; + +import java.io.IOException; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.NormsProducer; +import org.apache.lucene.index.Fields; + +public class Lucene90RandomAccessTermsWriter extends FieldsConsumer { + @Override + public void write(Fields fields, NormsProducer norms) throws IOException {} + + @Override + public void close() throws IOException {} +} From 88afec30606c13616f2377a7201b7dce125af1c8 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 16 Oct 2023 14:10:05 -0700 Subject: [PATCH 02/57] Rename Lucene90RandomAcessDictionaryPostingsFormat to Lucene90RandomAccessDictionaryPostingsFormat --- .../Lucene90RandomAccessDictionaryPostingsFormat.java} | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) rename lucene/core/src/java/org/apache/lucene/codecs/lucene90/{Lucene90RandomAcessDictionaryPostingsFormat.java => radomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java} (91%) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90RandomAcessDictionaryPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java similarity index 91% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90RandomAcessDictionaryPostingsFormat.java rename to 
lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java index 4b70dba02d96..2918fb8d367d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90RandomAcessDictionaryPostingsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene90; +package org.apache.lucene.codecs.lucene90.radomaccess; import java.io.IOException; import org.apache.lucene.codecs.FieldsConsumer; @@ -22,6 +22,9 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90PostingsReader; +import org.apache.lucene.codecs.lucene90.Lucene90PostingsWriter; import org.apache.lucene.codecs.lucene90.radomaccess.Lucene90RandomAccessTermsReader; import org.apache.lucene.codecs.lucene90.radomaccess.Lucene90RandomAccessTermsWriter; import org.apache.lucene.index.SegmentReadState; From d16c5012db5353c42741b4f562c402ed37d2f6c7 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 16 Oct 2023 14:16:11 -0700 Subject: [PATCH 03/57] restrict class visibility --- .../Lucene90RandomAccessDictionaryPostingsFormat.java | 8 +++----- .../radomaccess/Lucene90RandomAccessTermsReader.java | 2 +- .../radomaccess/Lucene90RandomAccessTermsWriter.java | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java index 2918fb8d367d..7d770ceb7f26 100644 --- 
a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java @@ -25,8 +25,6 @@ import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat; import org.apache.lucene.codecs.lucene90.Lucene90PostingsReader; import org.apache.lucene.codecs.lucene90.Lucene90PostingsWriter; -import org.apache.lucene.codecs.lucene90.radomaccess.Lucene90RandomAccessTermsReader; -import org.apache.lucene.codecs.lucene90.radomaccess.Lucene90RandomAccessTermsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.IOUtils; @@ -36,14 +34,14 @@ * * @lucene.experimental */ -public final class Lucene90RandomAcessDictionaryPostingsFormat extends PostingsFormat { +public final class Lucene90RandomAccessDictionaryPostingsFormat extends PostingsFormat { // Increment version to change it static final int VERSION_START = 0; static final int VERSION_CURRENT = VERSION_START; - /** Creates {@code Lucene90RandomAcessDictionaryPostingsFormat} */ - public Lucene90RandomAcessDictionaryPostingsFormat() { + /** Creates {@code Lucene90RandomAccessDictionaryPostingsFormat} */ + public Lucene90RandomAccessDictionaryPostingsFormat() { super("Lucene90RandomAccess"); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsReader.java index 7294f1ea09a9..7fa08663baba 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsReader.java @@ -22,7 +22,7 @@ import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.index.Terms; -public 
class Lucene90RandomAccessTermsReader extends FieldsProducer { +class Lucene90RandomAccessTermsReader extends FieldsProducer { @Override public void close() throws IOException {} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsWriter.java index 4fe720907b38..19bdf35845ee 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsWriter.java @@ -21,7 +21,7 @@ import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.index.Fields; -public class Lucene90RandomAccessTermsWriter extends FieldsConsumer { +class Lucene90RandomAccessTermsWriter extends FieldsConsumer { @Override public void write(Fields fields, NormsProducer norms) throws IOException {} From 3299fe065a50ae876bbb1e4dcdcaceabdbb27ac6 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 16 Oct 2023 15:08:07 -0700 Subject: [PATCH 04/57] Support per-type term index based on FST --- .../codecs/lucene90/radomaccess/TermType.java | 88 +++++++++++++++++++ .../lucene90/radomaccess/TermsIndex.java | 24 +++++ .../radomaccess/TermsIndexBuilder.java | 66 ++++++++++++++ .../radomaccess/TestTermsIndexBuilder.java | 77 ++++++++++++++++ 4 files changed, 255 insertions(+) create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermType.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndex.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndexBuilder.java create mode 100644 lucene/core/src/test/org/apache/lucene/codecs/lucene90/radomaccess/TestTermsIndexBuilder.java diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermType.java 
b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermType.java new file mode 100644 index 000000000000..7dae21a94dd5 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermType.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.codecs.lucene90.radomaccess; + +import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; + +import java.util.Objects; + +class TermType { + private static final byte SINGLETON_DOC_MASK = (byte) 1; + + private static final byte HAS_SKIP_DATA_MASK = (byte) 1 << 1; + + private static final byte HAS_VINT_POSITION_BLOCK_MASK = (byte) 1 << 2; + + public static final int NUM_TOTAL_TYPES = 8; + + private final byte flag; + + private TermType(byte flag) { + this.flag = flag; + } + + int getId() { + assert this.flag >= 0 && this.flag <=8; + return this.flag; + } + + boolean hasSingletonDoc() { + return (this.flag & SINGLETON_DOC_MASK) > 0; + } + + boolean hasSkipData() { + return (this.flag & HAS_SKIP_DATA_MASK) > 0; + } + + boolean hasVintPositionBlock() { + return (this.flag & HAS_VINT_POSITION_BLOCK_MASK) > 0; + } + + + static TermType fromTermState(IntBlockTermState state) { + byte flag = 0; + if (state.singletonDocID != -1) { + flag |= SINGLETON_DOC_MASK; + } + if (state.skipOffset != -1) { + flag |= HAS_SKIP_DATA_MASK; + } + if (state.lastPosBlockOffset != -1) { + flag |= HAS_VINT_POSITION_BLOCK_MASK; + } + return new TermType(flag); + } + + static TermType fromId(int id) { + if (id < 0 || id > 8) { + throw new IllegalArgumentException("id must be within range [0, 8]"); + } + return new TermType((byte) id); + } + + @Override + public int hashCode() { + return Objects.hashCode(this.flag); + } + + @Override + public boolean equals(Object that) { + return that instanceof TermType && + ((TermType) that).flag == this.flag; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndex.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndex.java new file mode 100644 index 000000000000..27de75bf10d6 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndex.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.codecs.lucene90.radomaccess; + +import org.apache.lucene.util.fst.FST; + + +record TermsIndex(FST fst) { +} \ No newline at end of file diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndexBuilder.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndexBuilder.java new file mode 100644 index 000000000000..ce7fe87207ce --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndexBuilder.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.codecs.lucene90.radomaccess; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRefBuilder; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTCompiler; +import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.fst.Util; + +import java.io.IOException; +import java.util.Arrays; + +/** + * Builds a term index for a given field. Logically this is a map: term -> (type, ord) where the ordinals + * are scoped to type (not global). + */ +final class TermsIndexBuilder { + private final long[] countPerType = new long[TermType.NUM_TOTAL_TYPES]; + private final FSTCompiler fstCompiler = + new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()); + + TermsIndexBuilder() { + Arrays.fill(countPerType, -1); + } + + public void addTerm(BytesRef term, TermType termType) throws IOException { + IntsRefBuilder scratchInts = new IntsRefBuilder(); + long ord = ++countPerType[termType.getId()]; + fstCompiler.add(Util.toIntsRef(term, scratchInts), encode(ord, termType)); + } + + public TermsIndex build() throws IOException { + return new TermsIndex(fstCompiler.compile()); + } + + private long encode(long ord, TermType termType) { + // use a single long to encode `ord` and `termType` + // also consider the special value of `PositiveIntOutputs.NO_OUTPUT == 0` + // so it looks like this |... ord ...| termType| ... hasOutput ...| + // where termType takes 3 bit and hasOutput takes the lowest bit. 
The rest is taken by ord + if ( ord < 0) { + throw new IllegalArgumentException("can't encode negative ord"); + } + if ( ord > ((1L << 60) - 1) ) { + throw new IllegalArgumentException("Input ord is too large"); + } + return (ord << 4) | ((long) termType.getId() << 1) | 1L; + } +} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/radomaccess/TestTermsIndexBuilder.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/radomaccess/TestTermsIndexBuilder.java new file mode 100644 index 000000000000..b8fd67ac64cc --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/radomaccess/TestTermsIndexBuilder.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.codecs.lucene90.radomaccess; + +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.Util; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +public class TestTermsIndexBuilder extends LuceneTestCase { + + public void testBasics() throws IOException { + String[] test_terms = { + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + }; + + Map termsToType = new HashMap<>(); + Map termsToOrd = new HashMap<>(); + Map typeCounters = new HashMap<>(); + + for (String term : test_terms) { + int termType = random().nextInt(TermType.NUM_TOTAL_TYPES); + termsToType.put(term, termType); + int ord = typeCounters.getOrDefault(termType, -1) + 1; + typeCounters.put(termType, ord); + termsToOrd.put(term, ord); + } + + TermsIndexBuilder builder = new TermsIndexBuilder(); + for (String term : test_terms) { + BytesRef termBytes = new BytesRef(term); + builder.addTerm(termBytes, TermType.fromId(termsToType.get(term))); + } + TermsIndex termsIndex = builder.build(); + + FST fst = termsIndex.fst(); + + for (String term : test_terms) { + BytesRef termBytes = new BytesRef(term); + long encoded = Util.get(fst, termBytes); + + assertEquals(1L, encoded & 0b1L); + assertEquals((long) termsToType.get(term), (encoded & 0b1110L) >> 1); + assertEquals((long) termsToOrd.get(term), encoded >> 4); + } + + } + +} \ No newline at end of file From 137d5d367df46f33e076c918e3296216ffbc49f7 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Thu, 26 Oct 2023 14:03:45 -0700 Subject: [PATCH 05/57] Move the code to be under sandbox --- lucene/core/src/java/module-info.java | 1 - .../codecs/lucene90/radomaccess/TermType.java | 88 ------------------ .../radomaccess/TermsIndexBuilder.java | 66 -------------- .../radomaccess/TestTermsIndexBuilder.java | 77 ---------------- lucene/sandbox/src/java/module-info.java | 1 + 
...0RandomAccessDictionaryPostingsFormat.java | 2 +- .../Lucene90RandomAccessTermsReader.java | 2 +- .../Lucene90RandomAccessTermsWriter.java | 2 +- .../lucene90/randomaccess/TermType.java | 91 +++++++++++++++++++ .../lucene90/randomaccess}/TermsIndex.java | 6 +- .../randomaccess/TermsIndexBuilder.java | 70 ++++++++++++++ .../lucene90/randomaccess/package-info.java | 22 +++++ .../randomaccess/TestTermsIndexBuilder.java | 65 +++++++++++++ 13 files changed, 254 insertions(+), 239 deletions(-) delete mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermType.java delete mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndexBuilder.java delete mode 100644 lucene/core/src/test/org/apache/lucene/codecs/lucene90/radomaccess/TestTermsIndexBuilder.java rename lucene/{core/src/java/org/apache/lucene/codecs/lucene90/radomaccess => sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess}/Lucene90RandomAccessDictionaryPostingsFormat.java (97%) rename lucene/{core/src/java/org/apache/lucene/codecs/lucene90/radomaccess => sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess}/Lucene90RandomAccessTermsReader.java (95%) rename lucene/{core/src/java/org/apache/lucene/codecs/lucene90/radomaccess => sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess}/Lucene90RandomAccessTermsWriter.java (94%) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java rename lucene/{core/src/java/org/apache/lucene/codecs/lucene90/radomaccess => sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess}/TermsIndex.java (89%) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/package-info.java create mode 100644 
lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermsIndexBuilder.java diff --git a/lucene/core/src/java/module-info.java b/lucene/core/src/java/module-info.java index 3b1f27ff6160..c728be820999 100644 --- a/lucene/core/src/java/module-info.java +++ b/lucene/core/src/java/module-info.java @@ -35,7 +35,6 @@ exports org.apache.lucene.codecs.lucene95; exports org.apache.lucene.codecs.lucene90.blocktree; exports org.apache.lucene.codecs.lucene90.compressing; - exports org.apache.lucene.codecs.lucene90.radomaccess; exports org.apache.lucene.codecs.perfield; exports org.apache.lucene.document; exports org.apache.lucene.geo; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermType.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermType.java deleted file mode 100644 index 7dae21a94dd5..000000000000 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermType.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.lucene.codecs.lucene90.radomaccess; - -import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; - -import java.util.Objects; - -class TermType { - private static final byte SINGLETON_DOC_MASK = (byte) 1; - - private static final byte HAS_SKIP_DATA_MASK = (byte) 1 << 1; - - private static final byte HAS_VINT_POSITION_BLOCK_MASK = (byte) 1 << 2; - - public static final int NUM_TOTAL_TYPES = 8; - - private final byte flag; - - private TermType(byte flag) { - this.flag = flag; - } - - int getId() { - assert this.flag >= 0 && this.flag <=8; - return this.flag; - } - - boolean hasSingletonDoc() { - return (this.flag & SINGLETON_DOC_MASK) > 0; - } - - boolean hasSkipData() { - return (this.flag & HAS_SKIP_DATA_MASK) > 0; - } - - boolean hasVintPositionBlock() { - return (this.flag & HAS_VINT_POSITION_BLOCK_MASK) > 0; - } - - - static TermType fromTermState(IntBlockTermState state) { - byte flag = 0; - if (state.singletonDocID != -1) { - flag |= SINGLETON_DOC_MASK; - } - if (state.skipOffset != -1) { - flag |= HAS_SKIP_DATA_MASK; - } - if (state.lastPosBlockOffset != -1) { - flag |= HAS_VINT_POSITION_BLOCK_MASK; - } - return new TermType(flag); - } - - static TermType fromId(int id) { - if (id < 0 || id > 8) { - throw new IllegalArgumentException("id must be within range [0, 8]"); - } - return new TermType((byte) id); - } - - @Override - public int hashCode() { - return Objects.hashCode(this.flag); - } - - @Override - public boolean equals(Object that) { - return that instanceof TermType && - ((TermType) that).flag == this.flag; - } -} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndexBuilder.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndexBuilder.java deleted file mode 100644 index ce7fe87207ce..000000000000 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndexBuilder.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * 
Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.codecs.lucene90.radomaccess; - -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.IntsRefBuilder; -import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.FSTCompiler; -import org.apache.lucene.util.fst.PositiveIntOutputs; -import org.apache.lucene.util.fst.Util; - -import java.io.IOException; -import java.util.Arrays; - -/** - * Builds a term index for a given field. Logically this is a map: term -> (type, ord) where the ordinals - * are scoped to type (not global). 
- */ -final class TermsIndexBuilder { - private final long[] countPerType = new long[TermType.NUM_TOTAL_TYPES]; - private final FSTCompiler fstCompiler = - new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()); - - TermsIndexBuilder() { - Arrays.fill(countPerType, -1); - } - - public void addTerm(BytesRef term, TermType termType) throws IOException { - IntsRefBuilder scratchInts = new IntsRefBuilder(); - long ord = ++countPerType[termType.getId()]; - fstCompiler.add(Util.toIntsRef(term, scratchInts), encode(ord, termType)); - } - - public TermsIndex build() throws IOException { - return new TermsIndex(fstCompiler.compile()); - } - - private long encode(long ord, TermType termType) { - // use a single long to encode `ord` and `termType` - // also consider the special value of `PositiveIntOutputs.NO_OUTPUT == 0` - // so it looks like this |... ord ...| termType| ... hasOutput ...| - // where termType takes 3 bit and hasOutput takes the lowest bit. The rest is taken by ord - if ( ord < 0) { - throw new IllegalArgumentException("can't encode negative ord"); - } - if ( ord > ((1L << 60) - 1) ) { - throw new IllegalArgumentException("Input ord is too large"); - } - return (ord << 4) | ((long) termType.getId() << 1) | 1L; - } -} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/radomaccess/TestTermsIndexBuilder.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/radomaccess/TestTermsIndexBuilder.java deleted file mode 100644 index b8fd67ac64cc..000000000000 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/radomaccess/TestTermsIndexBuilder.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.codecs.lucene90.radomaccess; - -import org.apache.lucene.tests.util.LuceneTestCase; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.Util; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; - -public class TestTermsIndexBuilder extends LuceneTestCase { - - public void testBasics() throws IOException { - String[] test_terms = { - "a", - "b", - "c", - "d", - "e", - "f", - "g", - "h", - "i", - "j", - }; - - Map termsToType = new HashMap<>(); - Map termsToOrd = new HashMap<>(); - Map typeCounters = new HashMap<>(); - - for (String term : test_terms) { - int termType = random().nextInt(TermType.NUM_TOTAL_TYPES); - termsToType.put(term, termType); - int ord = typeCounters.getOrDefault(termType, -1) + 1; - typeCounters.put(termType, ord); - termsToOrd.put(term, ord); - } - - TermsIndexBuilder builder = new TermsIndexBuilder(); - for (String term : test_terms) { - BytesRef termBytes = new BytesRef(term); - builder.addTerm(termBytes, TermType.fromId(termsToType.get(term))); - } - TermsIndex termsIndex = builder.build(); - - FST fst = termsIndex.fst(); - - for (String term : test_terms) { - BytesRef termBytes = new BytesRef(term); - long encoded = Util.get(fst, termBytes); - - assertEquals(1L, encoded & 0b1L); - assertEquals((long) termsToType.get(term), (encoded & 
0b1110L) >> 1); - assertEquals((long) termsToOrd.get(term), encoded >> 4); - } - - } - -} \ No newline at end of file diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index c51a25691ef2..96522ed7a5f7 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -22,6 +22,7 @@ exports org.apache.lucene.payloads; exports org.apache.lucene.sandbox.codecs.idversion; + exports org.apache.lucene.sandbox.codecs.lucene90.randomaccess; exports org.apache.lucene.sandbox.document; exports org.apache.lucene.sandbox.queries; exports org.apache.lucene.sandbox.search; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java similarity index 97% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java index 7d770ceb7f26..60c292706a30 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.codecs.lucene90.radomaccess; +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; import java.io.IOException; import org.apache.lucene.codecs.FieldsConsumer; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsReader.java similarity index 95% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsReader.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsReader.java index 7fa08663baba..d5214561bf26 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsReader.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.lucene.codecs.lucene90.radomaccess; +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; import java.io.IOException; import java.util.Iterator; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsWriter.java similarity index 94% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsWriter.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsWriter.java index 19bdf35845ee..c18a0cbbd143 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/Lucene90RandomAccessTermsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsWriter.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene90.radomaccess; +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; import java.io.IOException; import org.apache.lucene.codecs.FieldsConsumer; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java new file mode 100644 index 000000000000..d52cace8545d --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; + +import java.util.Objects; +import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; + +/** + * TermType holds the classification of a term, based on how its postings are written. + * + *

It captures -- 1) if a term has a singleton docid (i.e. only one doc contains this term). 2) + * if the term has skip data. 3) if the term as an VINT encoded position block. + */ +final class TermType { + private static final byte SINGLETON_DOC_MASK = (byte) 1; + + private static final byte HAS_SKIP_DATA_MASK = (byte) 1 << 1; + + private static final byte HAS_VINT_POSITION_BLOCK_MASK = (byte) 1 << 2; + + public static final int NUM_TOTAL_TYPES = 8; + + private final byte flag; + + private TermType(byte flag) { + this.flag = flag; + } + + int getId() { + assert this.flag >= 0 && this.flag <= 8; + return this.flag; + } + + boolean hasSingletonDoc() { + return (this.flag & SINGLETON_DOC_MASK) > 0; + } + + boolean hasSkipData() { + return (this.flag & HAS_SKIP_DATA_MASK) > 0; + } + + boolean hasVintPositionBlock() { + return (this.flag & HAS_VINT_POSITION_BLOCK_MASK) > 0; + } + + static TermType fromTermState(IntBlockTermState state) { + byte flag = 0; + if (state.singletonDocID != -1) { + flag |= SINGLETON_DOC_MASK; + } + if (state.skipOffset != -1) { + flag |= HAS_SKIP_DATA_MASK; + } + if (state.lastPosBlockOffset != -1) { + flag |= HAS_VINT_POSITION_BLOCK_MASK; + } + return new TermType(flag); + } + + static TermType fromId(int id) { + if (id < 0 || id > 8) { + throw new IllegalArgumentException("id must be within range [0, 8]"); + } + return new TermType((byte) id); + } + + @Override + public int hashCode() { + return Objects.hashCode(this.flag); + } + + @Override + public boolean equals(Object that) { + return that instanceof TermType && ((TermType) that).flag == this.flag; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndex.java similarity index 89% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndex.java rename to 
lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndex.java index 27de75bf10d6..94fce6559bc4 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/radomaccess/TermsIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndex.java @@ -15,10 +15,8 @@ * limitations under the License. */ -package org.apache.lucene.codecs.lucene90.radomaccess; +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; import org.apache.lucene.util.fst.FST; - -record TermsIndex(FST fst) { -} \ No newline at end of file +record TermsIndex(FST fst) {} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java new file mode 100644 index 000000000000..8077de7682ce --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; + +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRefBuilder; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FSTCompiler; +import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.fst.Util; + +/** + * Builds a term index for a given field. Logically this is a map: term -> (type, ord) where the + * ordinals are scoped to type (not global). + */ +final class TermsIndexBuilder { + private static long MAX_ORD = (1L << 60) - 1; + + private final long[] countPerType = new long[TermType.NUM_TOTAL_TYPES]; + private final FSTCompiler fstCompiler = + new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()); + + TermsIndexBuilder() { + Arrays.fill(countPerType, -1); + } + + public void addTerm(BytesRef term, TermType termType) throws IOException { + IntsRefBuilder scratchInts = new IntsRefBuilder(); + long ord = ++countPerType[termType.getId()]; + fstCompiler.add(Util.toIntsRef(term, scratchInts), encode(ord, termType)); + } + + public TermsIndex build() throws IOException { + return new TermsIndex(fstCompiler.compile()); + } + + private long encode(long ord, TermType termType) { + // use a single long to encode `ord` and `termType` + // also consider the special value of `PositiveIntOutputs.NO_OUTPUT == 0` + // so it looks like this |... ord ...| termType| ... hasOutput ...| + // where termType takes 3 bit and hasOutput takes the lowest bit. 
The rest is taken by ord + if (ord < 0) { + throw new IllegalArgumentException("can't encode negative ord"); + } + if (ord > MAX_ORD) { + throw new IllegalArgumentException( + "Input ord is too large for TermType: " + + termType.getId() + + ", max ord allowed is 2^60 - 1"); + } + return (ord << 4) | ((long) termType.getId() << 1) | 1L; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/package-info.java new file mode 100644 index 000000000000..d5cf9583f91c --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A PostingFormat that is based on {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat} + * but provides random access term dictionary. 
+ */ +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermsIndexBuilder.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermsIndexBuilder.java new file mode 100644 index 000000000000..43f4010b1ae6 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermsIndexBuilder.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.Util; + +public class TestTermsIndexBuilder extends LuceneTestCase { + + public void testBasics() throws IOException { + String[] test_terms = { + "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", + }; + + Map termsToType = new HashMap<>(); + Map termsToOrd = new HashMap<>(); + Map typeCounters = new HashMap<>(); + + for (String term : test_terms) { + int termType = random().nextInt(TermType.NUM_TOTAL_TYPES); + termsToType.put(term, termType); + int ord = typeCounters.getOrDefault(termType, -1) + 1; + typeCounters.put(termType, ord); + termsToOrd.put(term, ord); + } + + TermsIndexBuilder builder = new TermsIndexBuilder(); + for (String term : test_terms) { + BytesRef termBytes = new BytesRef(term); + builder.addTerm(termBytes, TermType.fromId(termsToType.get(term))); + } + TermsIndex termsIndex = builder.build(); + + FST fst = termsIndex.fst(); + + for (String term : test_terms) { + BytesRef termBytes = new BytesRef(term); + long encoded = Util.get(fst, termBytes); + + assertEquals(1L, encoded & 0b1L); + assertEquals((long) termsToType.get(term), (encoded & 0b1110L) >> 1); + assertEquals((long) termsToOrd.get(term), encoded >> 4); + } + } +} From b758ec57a1ffdd7d3519553a672e1d8ebe270956 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Thu, 26 Oct 2023 16:12:10 -0700 Subject: [PATCH 06/57] Add interfaces for encoding/decoding TermStates motivation: We will need to deal with encoding `IntBlockTermState` for different type of terms. Instead of having dedicated class for each term type, which would be 8 types in total, we can spell out the individual components of `IntBlockTermState`. 
Then implement a codec which works with the composition of the components. This way we can have a single implementation of the codec and construct the composition (really just array of components) per term type. --- .../lucene90/randomaccess/TermStateCodec.java | 39 ++++ .../randomaccess/TermStateCodecComponent.java | 182 ++++++++++++++++++ .../randomaccess/TermStateCodecImpl.java | 40 ++++ .../randomaccess/bitpacking/BitPacker.java | 23 +++ .../randomaccess/bitpacking/BitUnpacker.java | 23 +++ .../randomaccess/bitpacking/package-info.java | 19 ++ .../TestTermStateCodecComponent.java | 43 +++++ 7 files changed, 369 insertions(+) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPacker.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/package-info.java create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java new file mode 100644 index 000000000000..38a024c8e2b6 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; + +import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; +import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitPacker; +import org.apache.lucene.util.BytesRef; + +interface TermStateCodec { + + /** + * Encode the sequence of {@link IntBlockTermState}s with the given bitPacker + * + * @return the metadata associated with the encoded bytes + */ + byte[] encode(IntBlockTermState[] inputs, BitPacker bitPacker); + + /** + * Decode out a {@link IntBlockTermState} with provided metadata bye slice and data byte slice + * + * @return the decoded term state + */ + IntBlockTermState decode(BytesRef metadataBytes, BytesRef dataBytes); +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java new file mode 100644 index 000000000000..97ef6c9ecaeb --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java @@ -0,0 +1,182 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; + +import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; + +interface TermStateCodecComponent { + + static byte getBitWidth(IntBlockTermState[] termStates, TermStateCodecComponent component) { + assert termStates.length > 0; + + long maxValSeen = -1; + for (var termState : termStates) { + maxValSeen = Math.max(maxValSeen, component.getTargetValue(termState)); + } + return (byte) (64 - Long.numberOfLeadingZeros(maxValSeen)); + } + + boolean isMonotonicallyIncreasing(); + + long getTargetValue(IntBlockTermState termState); + + void setTargetValue(IntBlockTermState termState, long value); + + final class SingletonDocId implements TermStateCodecComponent { + @Override + public boolean isMonotonicallyIncreasing() { + return false; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.singletonDocID; + } + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + assert value <= Integer.MAX_VALUE; + // A correct codec implementation does not change the value, + // after the encode/decode round-trip it should still be a valid int + termState.singletonDocID = (int) value; + } + } + + /** 
Below are the relevant IntBlockTermState components * */ + final class DocFreq implements TermStateCodecComponent { + @Override + public boolean isMonotonicallyIncreasing() { + return false; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.docFreq; + } + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + assert value <= Integer.MAX_VALUE; + // A correct codec implementation does not change the value, + // after the encode/decode round-trip it should still be a valid int + termState.docFreq = (int) value; + } + } + + final class TotalTermFreq implements TermStateCodecComponent { + @Override + public boolean isMonotonicallyIncreasing() { + return false; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.totalTermFreq; + } + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + termState.totalTermFreq = value; + } + } + + final class DocStartFP implements TermStateCodecComponent { + @Override + public boolean isMonotonicallyIncreasing() { + return true; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.docStartFP; + } + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + termState.docStartFP = value; + } + } + + final class PositionStartFP implements TermStateCodecComponent { + @Override + public boolean isMonotonicallyIncreasing() { + return true; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.posStartFP; + } + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + termState.posStartFP = value; + } + } + + final class PayloadStartFP implements TermStateCodecComponent { + @Override + public boolean isMonotonicallyIncreasing() { + return true; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.payStartFP; + 
} + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + termState.payStartFP = value; + } + } + + final class SkipOffset implements TermStateCodecComponent { + @Override + public boolean isMonotonicallyIncreasing() { + return false; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.skipOffset; + } + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + termState.skipOffset = value; + } + } + + final class LastPositionBlockOffset implements TermStateCodecComponent { + @Override + public boolean isMonotonicallyIncreasing() { + return false; + } + + @Override + public long getTargetValue(IntBlockTermState termState) { + return termState.lastPosBlockOffset; + } + + @Override + public void setTargetValue(IntBlockTermState termState, long value) { + termState.lastPosBlockOffset = value; + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java new file mode 100644 index 000000000000..4481cb31e613 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; + +import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; +import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitPacker; +import org.apache.lucene.util.BytesRef; + +final class TermStateCodecImpl implements TermStateCodec { + private final TermStateCodecComponent[] components; + + public TermStateCodecImpl(TermStateCodecComponent[] components) { + this.components = components; + } + + @Override + public byte[] encode(IntBlockTermState[] inputs, BitPacker bitPacker) { + return new byte[0]; + } + + @Override + public IntBlockTermState decode(BytesRef metadataBytes, BytesRef dataBytes) { + return null; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPacker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPacker.java new file mode 100644 index 000000000000..3841278b7840 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPacker.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; + +public interface BitPacker { + + void add(long value, int numBits); +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java new file mode 100644 index 000000000000..8a0bd580dd91 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; + +public interface BitUnpacker { + + long unpack(byte[] data, int startBitIndex, int bitWidth); +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/package-info.java new file mode 100644 index 000000000000..866d071788ac --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/package-info.java @@ -0,0 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Code for packing and unpacking sequence of non-negative integers with smaller bit width. 
*/ +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java new file mode 100644 index 000000000000..2f3457bdfdff --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; + +import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestTermStateCodecComponent extends LuceneTestCase { + + public void testGetBitWidth() { + int expectedMaxBits = random().nextInt(31) + 1; + int bitMask = 0xFFFFFFFF >>> (32 - expectedMaxBits); + IntBlockTermState[] termStates = + random() + .ints() + .limit(100) + .mapToObj( + docFreq -> { + var x = new IntBlockTermState(); + x.docFreq = docFreq & bitMask; + return x; + }) + .toArray(IntBlockTermState[]::new); + byte bitWidth = + TermStateCodecComponent.getBitWidth(termStates, new TermStateCodecComponent.DocFreq()); + assertTrue(bitWidth <= expectedMaxBits); + } +} From 7d35ed239c2f27223b562390dcce2097efc5116d Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Fri, 27 Oct 2023 12:10:58 -0700 Subject: [PATCH 07/57] Make the concrete TermStateCodecComponents singletons --- .../randomaccess/TermStateCodecComponent.java | 58 ++++++++++++++----- .../TestTermStateCodecComponent.java | 2 +- 2 files changed, 46 insertions(+), 14 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java index 97ef6c9ecaeb..da1ee77b7ece 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java @@ -19,7 +19,7 @@ import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; -interface TermStateCodecComponent { +abstract class TermStateCodecComponent { static byte getBitWidth(IntBlockTermState[] termStates, TermStateCodecComponent component) { assert termStates.length > 0; @@ -31,13 +31,18 @@ static 
byte getBitWidth(IntBlockTermState[] termStates, TermStateCodecComponent return (byte) (64 - Long.numberOfLeadingZeros(maxValSeen)); } - boolean isMonotonicallyIncreasing(); + abstract boolean isMonotonicallyIncreasing(); - long getTargetValue(IntBlockTermState termState); + abstract long getTargetValue(IntBlockTermState termState); - void setTargetValue(IntBlockTermState termState, long value); + abstract void setTargetValue(IntBlockTermState termState, long value); + + /** Below are the relevant IntBlockTermState components * */ + static final class SingletonDocId extends TermStateCodecComponent { + public static SingletonDocId INSTANCE = new SingletonDocId(); + + private SingletonDocId() {} - final class SingletonDocId implements TermStateCodecComponent { @Override public boolean isMonotonicallyIncreasing() { return false; @@ -57,8 +62,11 @@ public void setTargetValue(IntBlockTermState termState, long value) { } } - /** Below are the relevant IntBlockTermState components * */ - final class DocFreq implements TermStateCodecComponent { + static final class DocFreq extends TermStateCodecComponent { + public static DocFreq INSTANCE = new DocFreq(); + + private DocFreq() {} + @Override public boolean isMonotonicallyIncreasing() { return false; @@ -78,7 +86,11 @@ public void setTargetValue(IntBlockTermState termState, long value) { } } - final class TotalTermFreq implements TermStateCodecComponent { + static final class TotalTermFreq extends TermStateCodecComponent { + public static TotalTermFreq INSTANCE = new TotalTermFreq(); + + private TotalTermFreq() {} + @Override public boolean isMonotonicallyIncreasing() { return false; @@ -95,7 +107,11 @@ public void setTargetValue(IntBlockTermState termState, long value) { } } - final class DocStartFP implements TermStateCodecComponent { + static final class DocStartFP extends TermStateCodecComponent { + public static DocStartFP INSTANCE = new DocStartFP(); + + private DocStartFP() {} + @Override public boolean 
isMonotonicallyIncreasing() { return true; @@ -112,7 +128,11 @@ public void setTargetValue(IntBlockTermState termState, long value) { } } - final class PositionStartFP implements TermStateCodecComponent { + static final class PositionStartFP extends TermStateCodecComponent { + public static PositionStartFP INSTANCE = new PositionStartFP(); + + private PositionStartFP() {} + @Override public boolean isMonotonicallyIncreasing() { return true; @@ -129,7 +149,11 @@ public void setTargetValue(IntBlockTermState termState, long value) { } } - final class PayloadStartFP implements TermStateCodecComponent { + static final class PayloadStartFP extends TermStateCodecComponent { + public static PayloadStartFP INSTANCE = new PayloadStartFP(); + + private PayloadStartFP() {} + @Override public boolean isMonotonicallyIncreasing() { return true; @@ -146,7 +170,11 @@ public void setTargetValue(IntBlockTermState termState, long value) { } } - final class SkipOffset implements TermStateCodecComponent { + static final class SkipOffset extends TermStateCodecComponent { + public static SkipOffset INSTANCE = new SkipOffset(); + + private SkipOffset() {} + @Override public boolean isMonotonicallyIncreasing() { return false; @@ -163,7 +191,11 @@ public void setTargetValue(IntBlockTermState termState, long value) { } } - final class LastPositionBlockOffset implements TermStateCodecComponent { + static final class LastPositionBlockOffset extends TermStateCodecComponent { + public static LastPositionBlockOffset INSTANCE = new LastPositionBlockOffset(); + + private LastPositionBlockOffset() {} + @Override public boolean isMonotonicallyIncreasing() { return false; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java index 2f3457bdfdff..ab2bebf252a2 100644 --- 
a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java @@ -37,7 +37,7 @@ public void testGetBitWidth() { }) .toArray(IntBlockTermState[]::new); byte bitWidth = - TermStateCodecComponent.getBitWidth(termStates, new TermStateCodecComponent.DocFreq()); + TermStateCodecComponent.getBitWidth(termStates, TermStateCodecComponent.DocFreq.INSTANCE); assertTrue(bitWidth <= expectedMaxBits); } } From 6a1506b80668188ec08728ac5981bb64ffd2e267 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 30 Oct 2023 14:45:34 -0700 Subject: [PATCH 08/57] Fix the expected export module check --- lucene/sandbox/src/java/module-info.java | 1 + 1 file changed, 1 insertion(+) diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index 96522ed7a5f7..59331969cce1 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -23,6 +23,7 @@ exports org.apache.lucene.payloads; exports org.apache.lucene.sandbox.codecs.idversion; exports org.apache.lucene.sandbox.codecs.lucene90.randomaccess; + exports org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; exports org.apache.lucene.sandbox.document; exports org.apache.lucene.sandbox.queries; exports org.apache.lucene.sandbox.search; From e06f30362e8be8e8d750ceb4eccd0b9a82242161 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 30 Oct 2023 14:46:14 -0700 Subject: [PATCH 09/57] Implment TermStateCodecComponent.getBitWidth for monotonically increasing values --- .../randomaccess/TermStateCodecComponent.java | 5 ++- .../TestTermStateCodecComponent.java | 39 +++++++++++++++++-- 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java index da1ee77b7ece..9d93f40dc4b0 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java @@ -25,8 +25,11 @@ static byte getBitWidth(IntBlockTermState[] termStates, TermStateCodecComponent assert termStates.length > 0; long maxValSeen = -1; + long referenceValue = + component.isMonotonicallyIncreasing() ? component.getTargetValue(termStates[0]) : 0; + for (var termState : termStates) { - maxValSeen = Math.max(maxValSeen, component.getTargetValue(termState)); + maxValSeen = Math.max(maxValSeen, component.getTargetValue(termState) - referenceValue); } return (byte) (64 - Long.numberOfLeadingZeros(maxValSeen)); } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java index ab2bebf252a2..862996fb6c30 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java @@ -17,6 +17,7 @@ package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +import java.util.stream.LongStream; import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; import org.apache.lucene.tests.util.LuceneTestCase; @@ -25,19 +26,49 @@ public class TestTermStateCodecComponent extends LuceneTestCase { public void testGetBitWidth() { int expectedMaxBits = random().nextInt(31) + 1; int bitMask = 0xFFFFFFFF >>> (32 - expectedMaxBits); + int highestBit = (bitMask >>> 1) + 1; + IntBlockTermState[] termStates = random() - .ints() - 
.limit(100) + .ints(256) .mapToObj( docFreq -> { var x = new IntBlockTermState(); - x.docFreq = docFreq & bitMask; + x.docFreq = (docFreq & bitMask) | highestBit; return x; }) .toArray(IntBlockTermState[]::new); + byte bitWidth = TermStateCodecComponent.getBitWidth(termStates, TermStateCodecComponent.DocFreq.INSTANCE); - assertTrue(bitWidth <= expectedMaxBits); + assertEquals(expectedMaxBits, bitWidth); + } + + public void testGetBitWidthWithIncreasingValues() { + long baseValue = random().nextLong(Long.MAX_VALUE >> 1); + int expectedMaxBits = random().nextInt(63) + 1; + long bitMask = 0xFFFFFFFF_FFFFFFFFL >>> (64 - expectedMaxBits); + long highestBit = (bitMask >>> 1) + 1; + + var randomLongs = + random() + .longs(256, 0, Long.MAX_VALUE - baseValue) + .map(x -> baseValue + ((x & bitMask) | highestBit)) + .sorted(); + + IntBlockTermState[] termStates = + LongStream.concat(LongStream.of(baseValue), randomLongs) + .mapToObj( + docStartFP -> { + var x = new IntBlockTermState(); + x.docStartFP = docStartFP; + return x; + }) + .toArray(IntBlockTermState[]::new); + + byte bitWidth = + TermStateCodecComponent.getBitWidth( + termStates, TermStateCodecComponent.DocStartFP.INSTANCE); + assertEquals(expectedMaxBits, bitWidth); } } From ea2c76f6ea1a7fa88ca83e0749584f4504974747 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Wed, 1 Nov 2023 21:13:53 -0700 Subject: [PATCH 10/57] Implement a codec (not Lucene Codec) for IntBlockTermState TermStateCodecImpl implements TermStateCodec which supports encoding a block of IntBlockTermState and decoding within that block at a given index. 
--- .../lucene90/randomaccess/TermStateCodec.java | 12 +- .../randomaccess/TermStateCodecImpl.java | 116 ++++++++++++++++- .../randomaccess/bitpacking/BitUnpacker.java | 4 +- .../randomaccess/TestTermStateCodecImpl.java | 120 ++++++++++++++++++ 4 files changed, 245 insertions(+), 7 deletions(-) create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java index 38a024c8e2b6..7d1cb0dd6ae6 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java @@ -19,21 +19,27 @@ import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitPacker; +import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitUnpacker; import org.apache.lucene.util.BytesRef; interface TermStateCodec { /** - * Encode the sequence of {@link IntBlockTermState}s with the given bitPacker + * Encode the sequence of {@link IntBlockTermState}s with the given bitPacker into a block of + * bytes. * * @return the metadata associated with the encoded bytes */ byte[] encode(IntBlockTermState[] inputs, BitPacker bitPacker); /** - * Decode out a {@link IntBlockTermState} with provided metadata bye slice and data byte slice + * Decode out a {@link IntBlockTermState} with the provided bit-unpacker, metadata byte slice and + * data byte slice, at the given index within an encoded block. + * + *

Note: This method expects the dataBytes contains the bytes for the whole block. * * @return the decoded term state */ - IntBlockTermState decode(BytesRef metadataBytes, BytesRef dataBytes); + IntBlockTermState decodeWithinBlock( + BytesRef metadataBytes, BytesRef dataBytes, BitUnpacker bitUnpacker, int index); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java index 4481cb31e613..5bd3730ed3fa 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java @@ -19,22 +19,132 @@ import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitPacker; +import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitUnpacker; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.util.BytesRef; final class TermStateCodecImpl implements TermStateCodec { private final TermStateCodecComponent[] components; + private final int metadataBytesLength; + + private static int getMetadataLength(TermStateCodecComponent component) { + // 1 byte for bitWidth; optionally 8 byte more for the reference value + return 1 + (component.isMonotonicallyIncreasing() ? 
8 : 0); + } public TermStateCodecImpl(TermStateCodecComponent[] components) { + assert components.length > 0; + this.components = components; + int metadataBytesLength = 0; + for (var component : components) { + metadataBytesLength += getMetadataLength(component); + } + this.metadataBytesLength = metadataBytesLength; } @Override public byte[] encode(IntBlockTermState[] inputs, BitPacker bitPacker) { - return new byte[0]; + Metadata[] metadataPerComponent = getMetadataPerComponent(inputs); + byte[] metadataBytes = serializeMetadata(metadataPerComponent); + + // Encode inputs via the bitpacker + for (var termState : inputs) { + encodeOne(bitPacker, termState, metadataPerComponent); + } + + return metadataBytes; + } + + private Metadata[] getMetadataPerComponent(IntBlockTermState[] inputs) { + Metadata[] metadataPerComponent = new Metadata[components.length]; + for (int i = 0; i < components.length; i++) { + var component = components[i]; + byte bitWidth = TermStateCodecComponent.getBitWidth(inputs, component); + long referenceValue = + component.isMonotonicallyIncreasing() ? 
component.getTargetValue(inputs[0]) : 0L; + metadataPerComponent[i] = new Metadata(bitWidth, referenceValue); + } + return metadataPerComponent; + } + + private byte[] serializeMetadata(Metadata[] metadataPerComponent) { + byte[] metadataBytes = new byte[this.metadataBytesLength]; + ByteArrayDataOutput dataOut = new ByteArrayDataOutput(metadataBytes); + + for (int i = 0; i < components.length; i++) { + var metadata = metadataPerComponent[i]; + dataOut.writeByte(metadata.bitWidth); + if (components[i].isMonotonicallyIncreasing()) { + dataOut.writeLong(metadata.referenceValue); + } + } + return metadataBytes; + } + + private void encodeOne( + BitPacker bitPacker, IntBlockTermState termState, Metadata[] metadataPerComponent) { + for (int i = 0; i < components.length; i++) { + var component = components[i]; + var metadata = metadataPerComponent[i]; + long valToEncode = component.getTargetValue(termState) - metadata.referenceValue; + bitPacker.add(valToEncode, metadata.bitWidth); + } } @Override - public IntBlockTermState decode(BytesRef metadataBytes, BytesRef dataBytes) { - return null; + public IntBlockTermState decodeWithinBlock( + BytesRef metadataBytes, BytesRef dataBytes, BitUnpacker bitUnpacker, int index) { + assert metadataBytes.length == this.metadataBytesLength; + + var metadata = deserializedMetadata(metadataBytes); + + int startBitIndex = index * metadata.totalBitsPerTermState; + return extract(dataBytes, bitUnpacker, startBitIndex, metadata.metadataPerComponent); } + + private MetadataAndTotalBitsPerTermState deserializedMetadata(BytesRef metadataBytes) { + Metadata[] metadataPerComponent = new Metadata[components.length]; + ByteArrayDataInput byteArrayDataInput = + new ByteArrayDataInput(metadataBytes.bytes, metadataBytes.offset, metadataBytes.length); + int totalBitsPerTermState = 0; + for (int i = 0; i < components.length; i++) { + var component = components[i]; + byte bitWidth = byteArrayDataInput.readByte(); + long referenceValue = -1; + if 
(component.isMonotonicallyIncreasing()) { + referenceValue = byteArrayDataInput.readLong(); + } + metadataPerComponent[i] = new Metadata(bitWidth, referenceValue); + + totalBitsPerTermState += bitWidth; + } + + return new MetadataAndTotalBitsPerTermState(metadataPerComponent, totalBitsPerTermState); + } + + private IntBlockTermState extract( + BytesRef dataBytes, + BitUnpacker bitUnpacker, + int startBitIndex, + Metadata[] metadataPerComponent) { + IntBlockTermState decoded = new IntBlockTermState(); + for (int i = 0; i < components.length; i++) { + var component = components[i]; + var metadata = metadataPerComponent[i]; + long val = bitUnpacker.unpack(dataBytes, startBitIndex, metadata.bitWidth); + if (metadata.referenceValue > 0) { + val += metadata.referenceValue; + } + component.setTargetValue(decoded, val); + startBitIndex += metadata.bitWidth; + } + return decoded; + } + + private record Metadata(byte bitWidth, long referenceValue) {} + + private record MetadataAndTotalBitsPerTermState( + Metadata[] metadataPerComponent, int totalBitsPerTermState) {} } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java index 8a0bd580dd91..35fc1612790a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java @@ -17,7 +17,9 @@ package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; +import org.apache.lucene.util.BytesRef; + public interface BitUnpacker { - long unpack(byte[] data, int startBitIndex, int bitWidth); + long unpack(BytesRef bytesRef, int startBitIndex, int bitWidth); } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java 
b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java new file mode 100644 index 000000000000..92e2700c74b2 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; + +import java.util.ArrayList; +import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; +import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitPacker; +import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitUnpacker; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; + +public class TestTermStateCodecImpl extends LuceneTestCase { + + public void testEncodeDecode() { + TermStateCodecImpl codec = + new TermStateCodecImpl( + new TermStateCodecComponent[] { + TermStateCodecComponent.DocFreq.INSTANCE, TermStateCodecComponent.DocStartFP.INSTANCE, + }); + + ArrayList termStates = new ArrayList<>(); + long maxDocFreqSeen = -1; + long docStartFPBase = random().nextLong(Long.MAX_VALUE >> 1); + long maxDocStartFPDeltaSeen = -1; + for (int i = 0; i < random().nextInt(2, 256); i++) { + var termState = new IntBlockTermState(); + termState.docFreq = random().nextInt(1, Integer.MAX_VALUE); + if (i == 0) { + termState.docStartFP = docStartFPBase; + } else { + termState.docStartFP = termStates.get(i - 1).docStartFP + random().nextLong(1024); + maxDocStartFPDeltaSeen = + Math.max(maxDocStartFPDeltaSeen, termState.docStartFP - docStartFPBase); + } + maxDocFreqSeen = Math.max(maxDocFreqSeen, termState.docFreq); + termStates.add(termState); + } + + IntBlockTermState[] termStatesArray = termStates.toArray(IntBlockTermState[]::new); + + BitPerBytePacker bitPerBytePacker = new BitPerBytePacker(); + byte[] metadata = codec.encode(termStatesArray, bitPerBytePacker); + + // For the metadata, we expect + // 0: DocFreq.bitWidth, + // 1: DocStartFP.bitWidth, + // [2-10]: DocStartFP.referenceValue; + assertEquals(10, metadata.length); + assertEquals(64 - Long.numberOfLeadingZeros(maxDocFreqSeen), metadata[0]); + assertEquals(64 - 
Long.numberOfLeadingZeros(maxDocStartFPDeltaSeen), metadata[1]); + ByteArrayDataInput byteArrayDataInput = new ByteArrayDataInput(metadata, 2, 8); + assertEquals(docStartFPBase, byteArrayDataInput.readLong()); + + // Assert that each term state is the same after the encode-decode roundtrip. + BytesRef metadataBytes = new BytesRef(metadata); + BytesRef dataBytes = new BytesRef(bitPerBytePacker.getBytes()); + for (int i = 0; i < termStatesArray.length; i++) { + IntBlockTermState decoded = + codec.decodeWithinBlock(metadataBytes, dataBytes, bitPerBytePacker, i); + assertEquals(termStatesArray[i].docFreq, decoded.docFreq); + assertEquals(termStatesArray[i].docStartFP, decoded.docStartFP); + } + } +} + +/** + * A wasteful bit packer that use whole byte to keep a bit. Useful for tests. It uses little-endian + * bit order. + */ +class BitPerBytePacker implements BitPacker, BitUnpacker { + private final ArrayList buffer = new ArrayList<>(); + + private int totalNumBits = 0; + + @Override + public void add(long value, int numBits) { + assert numBits < 64; + totalNumBits += numBits; + while (numBits-- > 0) { + byte b = (byte) (value & 1L); + value = value >>> 1; + buffer.add(b); + } + } + + public byte[] getBytes() { + byte[] bytes = new byte[totalNumBits]; + int index = 0; + for (var b : buffer) { + bytes[index++] = b; + } + + return bytes; + } + + @Override + public long unpack(BytesRef bytesRef, int startBitIndex, int bitWidth) { + long res = 0; + for (int i = 0; i < bitWidth; i++) { + res |= ((long) (bytesRef.bytes[bytesRef.offset + startBitIndex + i] & 1)) << i; + } + return res; + } +} From c87713c47577ba435736e9b62e84ba58ce7ec024 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Thu, 2 Nov 2023 11:31:23 -0700 Subject: [PATCH 11/57] Add more javadoc and minor re-naming --- .../codecs/lucene90/randomaccess/TermStateCodec.java | 6 ++++-- .../codecs/lucene90/randomaccess/TermStateCodecImpl.java | 2 +- .../codecs/lucene90/randomaccess/bitpacking/BitPacker.java | 2 ++ 
.../lucene90/randomaccess/bitpacking/BitUnpacker.java | 2 ++ .../lucene90/randomaccess/TestTermStateCodecImpl.java | 2 +- 5 files changed, 10 insertions(+), 4 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java index 7d1cb0dd6ae6..9b48c00cd54d 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java @@ -30,13 +30,15 @@ interface TermStateCodec { * * @return the metadata associated with the encoded bytes */ - byte[] encode(IntBlockTermState[] inputs, BitPacker bitPacker); + byte[] encodeBlock(IntBlockTermState[] inputs, BitPacker bitPacker); /** * Decode out a {@link IntBlockTermState} with the provided bit-unpacker, metadata byte slice and * data byte slice, at the given index within an encoded block. * - *

Note: This method expects the dataBytes contains the bytes for the whole block. + *

Note: This method expects dataBytes that starts at the start of the block. Also, dataBytes + * should contain enough bytes (but not necessarily the whole block) to decode at the term state + * at `index`. * * @return the decoded term state */ diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java index 5bd3730ed3fa..0e55b1235fa7 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java @@ -45,7 +45,7 @@ public TermStateCodecImpl(TermStateCodecComponent[] components) { } @Override - public byte[] encode(IntBlockTermState[] inputs, BitPacker bitPacker) { + public byte[] encodeBlock(IntBlockTermState[] inputs, BitPacker bitPacker) { Metadata[] metadataPerComponent = getMetadataPerComponent(inputs); byte[] metadataBytes = serializeMetadata(metadataPerComponent); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPacker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPacker.java index 3841278b7840..a1828c69a032 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPacker.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPacker.java @@ -17,7 +17,9 @@ package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; +/** Interface for bit-packing */ public interface BitPacker { + /** Pack the low `numBits` bits of `value` */ void add(long value, int numBits); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java index 35fc1612790a..7c9448d893b5 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java @@ -19,7 +19,9 @@ import org.apache.lucene.util.BytesRef; +/** Interface for bit-unpacking */ public interface BitUnpacker { + /** Unpack a long in the given bytesRef from a range of bits. */ long unpack(BytesRef bytesRef, int startBitIndex, int bitWidth); } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java index 92e2700c74b2..9d7bdff06b3f 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java @@ -55,7 +55,7 @@ public void testEncodeDecode() { IntBlockTermState[] termStatesArray = termStates.toArray(IntBlockTermState[]::new); BitPerBytePacker bitPerBytePacker = new BitPerBytePacker(); - byte[] metadata = codec.encode(termStatesArray, bitPerBytePacker); + byte[] metadata = codec.encodeBlock(termStatesArray, bitPerBytePacker); // For the metadata, we expect // 0: DocFreq.bitWidth, From 322a0f0eca201861c43b3c5cb48658ac7251cc94 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Thu, 2 Nov 2023 13:53:47 -0700 Subject: [PATCH 12/57] TestTermStateCodecImpl to decode at non-block starting positions --- .../lucene90/randomaccess/TermStateCodec.java | 20 +++++++++++++++++++ .../randomaccess/TermStateCodecImpl.java | 9 +++++++++ .../randomaccess/TestTermStateCodecImpl.java | 19 ++++++++++++++++-- 3 files changed, 46 insertions(+), 2 deletions(-) diff --git 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java index 9b48c00cd54d..a203bdc180e0 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java @@ -44,4 +44,24 @@ interface TermStateCodec { */ IntBlockTermState decodeWithinBlock( BytesRef metadataBytes, BytesRef dataBytes, BitUnpacker bitUnpacker, int index); + + /** + * Decode out a {@link IntBlockTermState} with the provided bit-unpacker, metadata byte slice and + * data byte slice, starting at `startBitIndex`. + * + *

Note: The dataBytes should contain enough bits to decode out the term state. Passing more + * bytes than needed is fine but excessive ones are not used. + * + *

e.g. we want to decode a term state which contains value x, y and z, that has 18 bits in + * total. Assume x takes 4 bit, y takes 4 bit and z takes 10 bits. + * + *

Here is the visualization when we decode with startBitIndex=7 + * + *

+   *     Note: little-endian bit order
+   *     [x.......][zyyyyxxx][zzzzzzzz][.......z]
+   * 
+ */ + IntBlockTermState decodeAt( + BytesRef metadataBytes, BytesRef dataBytes, BitUnpacker bitUnpacker, int startBitIndex); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java index 0e55b1235fa7..eea9e1b149a8 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java @@ -104,6 +104,15 @@ public IntBlockTermState decodeWithinBlock( return extract(dataBytes, bitUnpacker, startBitIndex, metadata.metadataPerComponent); } + @Override + public IntBlockTermState decodeAt( + BytesRef metadataBytes, BytesRef dataBytes, BitUnpacker bitUnpacker, int startBitIndex) { + assert metadataBytes.length == this.metadataBytesLength; + + var metadata = deserializedMetadata(metadataBytes); + return extract(dataBytes, bitUnpacker, startBitIndex, metadata.metadataPerComponent); + } + private MetadataAndTotalBitsPerTermState deserializedMetadata(BytesRef metadataBytes) { Metadata[] metadataPerComponent = new Metadata[components.length]; ByteArrayDataInput byteArrayDataInput = diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java index 9d7bdff06b3f..6be829d621ff 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java @@ -61,9 +61,11 @@ public void testEncodeDecode() { // 0: DocFreq.bitWidth, // 1: DocStartFP.bitWidth, // [2-10]: DocStartFP.referenceValue; + int expectedDocFreqBitWidth = 64 - 
Long.numberOfLeadingZeros(maxDocFreqSeen); + int expectedDocStartFPBitWidth = 64 - Long.numberOfLeadingZeros(maxDocStartFPDeltaSeen); assertEquals(10, metadata.length); - assertEquals(64 - Long.numberOfLeadingZeros(maxDocFreqSeen), metadata[0]); - assertEquals(64 - Long.numberOfLeadingZeros(maxDocStartFPDeltaSeen), metadata[1]); + assertEquals(expectedDocFreqBitWidth, metadata[0]); + assertEquals(expectedDocStartFPBitWidth, metadata[1]); ByteArrayDataInput byteArrayDataInput = new ByteArrayDataInput(metadata, 2, 8); assertEquals(docStartFPBase, byteArrayDataInput.readLong()); @@ -76,6 +78,19 @@ public void testEncodeDecode() { assertEquals(termStatesArray[i].docFreq, decoded.docFreq); assertEquals(termStatesArray[i].docStartFP, decoded.docStartFP); } + + // Also test decoding that doesn't begin at the start of the block. + int pos = random().nextInt(termStatesArray.length); + int startBitIndex = random().nextInt(pos); + dataBytes = + new BytesRef( + bitPerBytePacker.getBytes(), + pos * (expectedDocFreqBitWidth + expectedDocStartFPBitWidth) - startBitIndex, + expectedDocFreqBitWidth + expectedDocStartFPBitWidth); + IntBlockTermState decoded = + codec.decodeAt(metadataBytes, dataBytes, bitPerBytePacker, startBitIndex); + assertEquals(termStatesArray[pos].docFreq, decoded.docFreq); + assertEquals(termStatesArray[pos].docStartFP, decoded.docStartFP); } } From 0976ce769cfc264f39a006f1e35071dcb9aa3cab Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 6 Nov 2023 12:15:47 -0800 Subject: [PATCH 13/57] Implement compact BitUnpacker --- .../bitpacking/BitUnpackerImpl.java | 79 +++++++++++++++ .../randomaccess/TestTermStateCodecImpl.java | 99 +++++++++---------- .../bitpacking/BitPerBytePacker.java | 89 +++++++++++++++++ .../bitpacking/TestBitUnpackerImpl.java | 62 ++++++++++++ 4 files changed, 278 insertions(+), 51 deletions(-) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpackerImpl.java create mode 100644 
lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPerBytePacker.java create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/TestBitUnpackerImpl.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpackerImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpackerImpl.java new file mode 100644 index 000000000000..b4cfd54f584a --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpackerImpl.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; + +import org.apache.lucene.util.BytesRef; + +public class BitUnpackerImpl implements BitUnpacker { + public static BitUnpackerImpl INSTANCE = new BitUnpackerImpl(); + + private BitUnpackerImpl() {} + + @Override + public long unpack(BytesRef bytesRef, int startBitIndex, int bitWidth) { + assert (startBitIndex + bitWidth) <= bytesRef.length * 8; + assert bitWidth < 64; + + int firstByteIndex = startBitIndex / 8; + int numBitsToExcludeInFirstByte = startBitIndex % 8; + int lastByteIndex = (startBitIndex + bitWidth) / 8; + int numBitsToKeepInLastByte = (startBitIndex + bitWidth) % 8; + + /* + * idea: there are two cases + * (1) when the requests bits are within the same byte; e.g. startBitIndex = 1, bitWidth = 5 + * (2) when the requests bits span across many bytes; e.g. startBitIndex = 1, bitWidth = 15 + * For (1) it is trivial, + * for (2) we can + * (2.1) read first partial bytes + * (2.2) read full bytes for those whose index is in (first, last), exclusive. + * (2.3) read the last partial bytes ( can be empty ) + */ + + // case (1) + if (firstByteIndex == lastByteIndex) { + long res = Byte.toUnsignedLong(bytesRef.bytes[bytesRef.offset + firstByteIndex]); + res &= (1L << numBitsToKeepInLastByte) - 1; + res >>>= numBitsToExcludeInFirstByte; + return res; + } + + // case (2) + long res = 0; + int totalNumBitsRead = 0; + // (2.1) read first partial bytes + res |= + Byte.toUnsignedLong(bytesRef.bytes[bytesRef.offset + firstByteIndex]) + >>> numBitsToExcludeInFirstByte; + totalNumBitsRead += 8 - numBitsToExcludeInFirstByte; + // (2.2) read full bytes for whose index is in (first, last), exclusive. 
+ for (int byteIndex = firstByteIndex + 1; byteIndex < lastByteIndex; byteIndex++) { + res |= Byte.toUnsignedLong(bytesRef.bytes[bytesRef.offset + byteIndex]) << totalNumBitsRead; + totalNumBitsRead += 8; + } + // (2.3) read the last partial bytes ( can be empty ) + if (numBitsToKeepInLastByte > 0) { + long partial = + Byte.toUnsignedLong(bytesRef.bytes[bytesRef.offset + lastByteIndex]) + & ((1L << numBitsToKeepInLastByte) - 1); + res |= partial << totalNumBitsRead; + } + + return res; + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java index 6be829d621ff..298c4a4a419d 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java @@ -19,8 +19,9 @@ import java.util.ArrayList; import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; -import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitPacker; +import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitPerBytePacker; import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitUnpacker; +import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitUnpackerImpl; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; @@ -40,7 +41,7 @@ public void testEncodeDecode() { long maxDocStartFPDeltaSeen = -1; for (int i = 0; i < random().nextInt(2, 256); i++) { var termState = new IntBlockTermState(); - termState.docFreq = random().nextInt(1, Integer.MAX_VALUE); + termState.docFreq = random().nextInt(1, 1 << random().nextInt(1, 31)); if (i == 0) { termState.docStartFP = docStartFPBase; } else { @@ 
-72,64 +73,60 @@ public void testEncodeDecode() { // Assert that each term state is the same after the encode-decode roundtrip. BytesRef metadataBytes = new BytesRef(metadata); BytesRef dataBytes = new BytesRef(bitPerBytePacker.getBytes()); - for (int i = 0; i < termStatesArray.length; i++) { - IntBlockTermState decoded = - codec.decodeWithinBlock(metadataBytes, dataBytes, bitPerBytePacker, i); - assertEquals(termStatesArray[i].docFreq, decoded.docFreq); - assertEquals(termStatesArray[i].docStartFP, decoded.docStartFP); - } + assertBlockRoundTrip(termStatesArray, codec, metadataBytes, dataBytes, bitPerBytePacker); + + // With real compact bits instead of bit-per-byte + dataBytes = new BytesRef(bitPerBytePacker.getCompactBytes()); + assertBlockRoundTrip( + termStatesArray, codec, metadataBytes, dataBytes, BitUnpackerImpl.INSTANCE); // Also test decoding that doesn't begin at the start of the block. int pos = random().nextInt(termStatesArray.length); int startBitIndex = random().nextInt(pos); + int recordSize = expectedDocFreqBitWidth + expectedDocStartFPBitWidth; + // With bit-per-byte bytes dataBytes = - new BytesRef( - bitPerBytePacker.getBytes(), - pos * (expectedDocFreqBitWidth + expectedDocStartFPBitWidth) - startBitIndex, - expectedDocFreqBitWidth + expectedDocStartFPBitWidth); - IntBlockTermState decoded = - codec.decodeAt(metadataBytes, dataBytes, bitPerBytePacker, startBitIndex); - assertEquals(termStatesArray[pos].docFreq, decoded.docFreq); - assertEquals(termStatesArray[pos].docStartFP, decoded.docStartFP); + new BytesRef(bitPerBytePacker.getBytes(), pos * recordSize - startBitIndex, recordSize); + assertDecodeAt( + codec, metadataBytes, dataBytes, bitPerBytePacker, startBitIndex, termStatesArray[pos]); + + // With compact bytes + int startByteIndex = pos * recordSize / 8; + int endByteIndex = (pos + 1) * recordSize / 8; + int length = endByteIndex - startByteIndex + ((pos + 1) * recordSize % 8 == 0 ? 
0 : 1); + dataBytes = new BytesRef(bitPerBytePacker.getCompactBytes(), startByteIndex, length); + assertDecodeAt( + codec, + metadataBytes, + dataBytes, + BitUnpackerImpl.INSTANCE, + (pos * recordSize) % 8, + termStatesArray[pos]); } -} - -/** - * A wasteful bit packer that use whole byte to keep a bit. Useful for tests. It uses little-endian - * bit order. - */ -class BitPerBytePacker implements BitPacker, BitUnpacker { - private final ArrayList buffer = new ArrayList<>(); - - private int totalNumBits = 0; - @Override - public void add(long value, int numBits) { - assert numBits < 64; - totalNumBits += numBits; - while (numBits-- > 0) { - byte b = (byte) (value & 1L); - value = value >>> 1; - buffer.add(b); - } - } - - public byte[] getBytes() { - byte[] bytes = new byte[totalNumBits]; - int index = 0; - for (var b : buffer) { - bytes[index++] = b; - } - - return bytes; + private static void assertDecodeAt( + TermStateCodecImpl codec, + BytesRef metadataBytes, + BytesRef dataBytes, + BitUnpacker bitUnpacker, + int startBitIndex, + IntBlockTermState termState) { + IntBlockTermState decoded = + codec.decodeAt(metadataBytes, dataBytes, bitUnpacker, startBitIndex); + assertEquals(termState.docFreq, decoded.docFreq); + assertEquals(termState.docStartFP, decoded.docStartFP); } - @Override - public long unpack(BytesRef bytesRef, int startBitIndex, int bitWidth) { - long res = 0; - for (int i = 0; i < bitWidth; i++) { - res |= ((long) (bytesRef.bytes[bytesRef.offset + startBitIndex + i] & 1)) << i; + private static void assertBlockRoundTrip( + IntBlockTermState[] termStatesArray, + TermStateCodecImpl codec, + BytesRef metadataBytes, + BytesRef dataBytes, + BitUnpacker bitUnpacker) { + for (int i = 0; i < termStatesArray.length; i++) { + IntBlockTermState decoded = codec.decodeWithinBlock(metadataBytes, dataBytes, bitUnpacker, i); + assertEquals(termStatesArray[i].docFreq, decoded.docFreq); + assertEquals(termStatesArray[i].docStartFP, decoded.docStartFP); } - return res; 
} } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPerBytePacker.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPerBytePacker.java new file mode 100644 index 000000000000..a1a972d5ceb7 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPerBytePacker.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; + +import java.util.ArrayList; +import org.apache.lucene.util.BytesRef; + +/** + * A wasteful bit packer that use whole byte to keep a bit. Useful for tests. It uses little-endian + * bit order. 
+ */ +public class BitPerBytePacker implements BitPacker, BitUnpacker { + private final ArrayList buffer = new ArrayList<>(); + + private int totalNumBits = 0; + + @Override + public void add(long value, int numBits) { + assert numBits < 64; + totalNumBits += numBits; + while (numBits-- > 0) { + byte b = (byte) (value & 1L); + value = value >>> 1; + buffer.add(b); + } + } + + public byte[] getBytes() { + byte[] bytes = new byte[totalNumBits]; + int index = 0; + for (var b : buffer) { + bytes[index++] = b; + } + + return bytes; + } + + public byte[] getCompactBytes() { + int len = (totalNumBits - 1) / 8 + 1; // round up + byte[] bytes = new byte[len]; + + int remainingBits = totalNumBits; + int pos = 0; + while (remainingBits >= 8) { + byte b = 0; + int base = pos * 8; + for (int i = 0; i < 8; i++) { + b |= (byte) ((buffer.get(base + i) & 1) << i); + } + bytes[pos++] = b; + remainingBits -= 8; + } + + if (remainingBits > 0) { + byte b = 0; + int base = pos * 8; + for (int i = 0; i < remainingBits; i++) { + b |= (byte) ((buffer.get(base + i) & 1) << i); + } + bytes[pos] = b; + } + + return bytes; + } + + @Override + public long unpack(BytesRef bytesRef, int startBitIndex, int bitWidth) { + long res = 0; + for (int i = 0; i < bitWidth; i++) { + res |= ((long) (bytesRef.bytes[bytesRef.offset + startBitIndex + i] & 1)) << i; + } + return res; + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/TestBitUnpackerImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/TestBitUnpackerImpl.java new file mode 100644 index 000000000000..7c493a661da1 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/TestBitUnpackerImpl.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; + +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; + +public class TestBitUnpackerImpl extends LuceneTestCase { + + public void testUnpackBasics() { + byte[] bytes = new byte[] {0x21, 0x43, 0x65, (byte) 0x87, (byte) 0xA9}; + BytesRef bytesRef = new BytesRef(bytes); + + for (int i = 1; i <= 10; i++) { + long val = BitUnpackerImpl.INSTANCE.unpack(bytesRef, (i - 1) * 4, 4); + assertEquals((long) i, val); + } + } + + public void testRandom() { + ValueAndBitWidth[] expected = + random() + .longs(1000, 0, Long.MAX_VALUE) + .mapToObj( + val -> { + int bitWidth = random().nextInt(1, 64); + val &= (1L << bitWidth) - 1; + return new ValueAndBitWidth(val, bitWidth); + }) + .toArray(ValueAndBitWidth[]::new); + + BitPerBytePacker referencePacker = new BitPerBytePacker(); + for (var x : expected) { + referencePacker.add(x.value, x.bitWidth); + } + + BytesRef bytes = new BytesRef(referencePacker.getCompactBytes()); + int startBitIndex = 0; + for (var x : expected) { + long unpacked = BitUnpackerImpl.INSTANCE.unpack(bytes, startBitIndex, x.bitWidth); + startBitIndex += x.bitWidth; + assertEquals(x.value, unpacked); + } + } + + private record ValueAndBitWidth(long value, 
int bitWidth) {} +} From a90f6085facf364487d479c08497dcac3962043e Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 6 Nov 2023 12:22:00 -0800 Subject: [PATCH 14/57] Fix typo and improve error reporting For those classes * TermType * TermsIndexBuilder --- .../sandbox/codecs/lucene90/randomaccess/TermType.java | 2 +- .../codecs/lucene90/randomaccess/TermsIndexBuilder.java | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java index d52cace8545d..793850a931fb 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java @@ -24,7 +24,7 @@ * TermType holds the classification of a term, based on how its postings are written. * *

It captures -- 1) if a term has a singleton docid (i.e. only one doc contains this term). 2) - * if the term has skip data. 3) if the term as an VINT encoded position block. + * if the term has skip data. 3) if the term has an VINT encoded position block. */ final class TermType { private static final byte SINGLETON_DOC_MASK = (byte) 1; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java index 8077de7682ce..1fea443c7c16 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java @@ -57,11 +57,13 @@ private long encode(long ord, TermType termType) { // so it looks like this |... ord ...| termType| ... hasOutput ...| // where termType takes 3 bit and hasOutput takes the lowest bit. 
The rest is taken by ord if (ord < 0) { - throw new IllegalArgumentException("can't encode negative ord"); + throw new IllegalArgumentException("can't encode negative ord: " + ord); } if (ord > MAX_ORD) { throw new IllegalArgumentException( - "Input ord is too large for TermType: " + "Input ord " + + ord + + " is too large for TermType: " + termType.getId() + ", max ord allowed is 2^60 - 1"); } From a5160abdcdbfbd00ce664c23af207971c5e98ac3 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 6 Nov 2023 14:19:23 -0800 Subject: [PATCH 15/57] Rename the module from lucene90.* to lucene99.* to reflect upstream Codec change --- lucene/sandbox/src/java/module-info.java | 4 ++-- ...RandomAccessDictionaryPostingsFormat.java} | 22 +++++++++---------- .../Lucene99RandomAccessTermsReader.java} | 4 ++-- .../Lucene99RandomAccessTermsWriter.java} | 4 ++-- .../randomaccess/TermStateCodec.java | 8 +++---- .../randomaccess/TermStateCodecComponent.java | 4 ++-- .../randomaccess/TermStateCodecImpl.java | 8 +++---- .../randomaccess/TermType.java | 4 ++-- .../randomaccess/TermsIndex.java | 2 +- .../randomaccess/TermsIndexBuilder.java | 2 +- .../randomaccess/bitpacking/BitPacker.java | 2 +- .../randomaccess/bitpacking/BitUnpacker.java | 2 +- .../bitpacking/BitUnpackerImpl.java | 2 +- .../randomaccess/bitpacking/package-info.java | 2 +- .../randomaccess/package-info.java | 4 ++-- .../TestTermStateCodecComponent.java | 4 ++-- .../randomaccess/TestTermStateCodecImpl.java | 10 ++++----- .../randomaccess/TestTermsIndexBuilder.java | 2 +- .../bitpacking/BitPerBytePacker.java | 2 +- .../bitpacking/TestBitUnpackerImpl.java | 2 +- 20 files changed, 47 insertions(+), 47 deletions(-) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90/randomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java => lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java} (76%) rename 
lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90/randomaccess/Lucene90RandomAccessTermsReader.java => lucene99/randomaccess/Lucene99RandomAccessTermsReader.java} (91%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90/randomaccess/Lucene90RandomAccessTermsWriter.java => lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java} (89%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/TermStateCodec.java (90%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/TermStateCodecComponent.java (97%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/TermStateCodecImpl.java (95%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/TermType.java (95%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/TermsIndex.java (93%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/TermsIndexBuilder.java (97%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/bitpacking/BitPacker.java (93%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/bitpacking/BitUnpacker.java (94%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/bitpacking/BitUnpackerImpl.java (97%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/bitpacking/package-info.java (93%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/package-info.java (90%) rename lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/TestTermStateCodecComponent.java (95%) rename 
lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/TestTermStateCodecImpl.java (94%) rename lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/TestTermsIndexBuilder.java (97%) rename lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/bitpacking/BitPerBytePacker.java (97%) rename lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/{lucene90 => lucene99}/randomaccess/bitpacking/TestBitUnpackerImpl.java (97%) diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index 59331969cce1..45b66e7c353e 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -22,8 +22,8 @@ exports org.apache.lucene.payloads; exports org.apache.lucene.sandbox.codecs.idversion; - exports org.apache.lucene.sandbox.codecs.lucene90.randomaccess; - exports org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; + exports org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + exports org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; exports org.apache.lucene.sandbox.document; exports org.apache.lucene.sandbox.queries; exports org.apache.lucene.sandbox.search; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java similarity index 76% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java index 60c292706a30..59de10be73da 100644 --- 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessDictionaryPostingsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; import org.apache.lucene.codecs.FieldsConsumer; @@ -22,26 +22,26 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat; -import org.apache.lucene.codecs.lucene90.Lucene90PostingsReader; -import org.apache.lucene.codecs.lucene90.Lucene90PostingsWriter; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.IOUtils; /** - * Similar to {@link Lucene90PostingsFormat} but with a different term dictionary implementation. + * Similar to {@link Lucene99PostingsFormat} but with a different term dictionary implementation. 
* * @lucene.experimental */ -public final class Lucene90RandomAccessDictionaryPostingsFormat extends PostingsFormat { +public final class Lucene99RandomAccessDictionaryPostingsFormat extends PostingsFormat { // Increment version to change it static final int VERSION_START = 0; static final int VERSION_CURRENT = VERSION_START; /** Creates {@code Lucene90RandomAccessDictionaryPostingsFormat} */ - public Lucene90RandomAccessDictionaryPostingsFormat() { + public Lucene99RandomAccessDictionaryPostingsFormat() { super("Lucene90RandomAccess"); } @@ -52,10 +52,10 @@ public String toString() { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new Lucene90PostingsWriter(state); + PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state); boolean success = false; try { - FieldsConsumer ret = new Lucene90RandomAccessTermsWriter(); + FieldsConsumer ret = new Lucene99RandomAccessTermsWriter(); success = true; return ret; } finally { @@ -67,10 +67,10 @@ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new Lucene90PostingsReader(state); + PostingsReaderBase postingsReader = new Lucene99PostingsReader(state); boolean success = false; try { - FieldsProducer ret = new Lucene90RandomAccessTermsReader(); + FieldsProducer ret = new Lucene99RandomAccessTermsReader(); success = true; return ret; } finally { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java similarity index 91% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsReader.java rename to 
lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java index d5214561bf26..79a63dccf265 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java @@ -15,14 +15,14 @@ * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; import java.util.Iterator; import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.index.Terms; -class Lucene90RandomAccessTermsReader extends FieldsProducer { +class Lucene99RandomAccessTermsReader extends FieldsProducer { @Override public void close() throws IOException {} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java similarity index 89% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsWriter.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java index c18a0cbbd143..87b68d2b9c63 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/Lucene90RandomAccessTermsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java @@ -14,14 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.index.Fields; -class Lucene90RandomAccessTermsWriter extends FieldsConsumer { +class Lucene99RandomAccessTermsWriter extends FieldsConsumer { @Override public void write(Fields fields, NormsProducer norms) throws IOException {} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java similarity index 90% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java index a203bdc180e0..a28fb1a94b65 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java @@ -15,11 +15,11 @@ * limitations under the License. 
*/ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; -import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; -import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitPacker; -import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitUnpacker; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPacker; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpacker; import org.apache.lucene.util.BytesRef; interface TermStateCodec { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java similarity index 97% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java index 9d93f40dc4b0..0740f44ae720 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecComponent.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java @@ -15,9 +15,9 @@ * limitations under the License. 
*/ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; -import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; abstract class TermStateCodecComponent { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java similarity index 95% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java index eea9e1b149a8..32ccde2fe286 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermStateCodecImpl.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java @@ -15,11 +15,11 @@ * limitations under the License. 
*/ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; -import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; -import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitPacker; -import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitUnpacker; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPacker; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpacker; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.util.BytesRef; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java similarity index 95% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java index 793850a931fb..c7fbd6089527 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermType.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java @@ -15,10 +15,10 @@ * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.util.Objects; -import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; /** * TermType holds the classification of a term, based on how its postings are written. 
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java similarity index 93% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndex.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java index 94fce6559bc4..a4b67f527275 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import org.apache.lucene.util.fst.FST; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java similarity index 97% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java index 1fea443c7c16..9484a0505458 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TermsIndexBuilder.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; import java.util.Arrays; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPacker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java similarity index 93% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPacker.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java index a1828c69a032..a06ca746d245 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPacker.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; /** Interface for bit-packing */ public interface BitPacker { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpacker.java similarity index 94% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpacker.java index 7c9448d893b5..b5af7b40e385 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpacker.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpacker.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; import org.apache.lucene.util.BytesRef; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpackerImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java similarity index 97% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpackerImpl.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java index b4cfd54f584a..44fa6af19887 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitUnpackerImpl.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; import org.apache.lucene.util.BytesRef; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/package-info.java similarity index 93% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/package-info.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/package-info.java index 866d071788ac..8a9078ffa33c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/package-info.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/package-info.java @@ -16,4 +16,4 @@ */ /** Code for packing and unpacking sequence of non-negative integers with smaller bit width. 
*/ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/package-info.java similarity index 90% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/package-info.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/package-info.java index d5cf9583f91c..a85027e3b5e1 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/package-info.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/package-info.java @@ -16,7 +16,7 @@ */ /** - * A PostingFormat that is based on {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat} + * A PostingFormat that is based on {@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat} * but provides random access term dictionary. 
*/ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java similarity index 95% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java rename to lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java index 862996fb6c30..15a5e940986c 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecComponent.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java @@ -15,10 +15,10 @@ * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.util.stream.LongStream; -import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; import org.apache.lucene.tests.util.LuceneTestCase; public class TestTermStateCodecComponent extends LuceneTestCase { diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java similarity index 94% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java rename to lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java index 298c4a4a419d..1b7a20fad427 100644 --- 
a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermStateCodecImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java @@ -15,13 +15,13 @@ * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.util.ArrayList; -import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; -import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitPerBytePacker; -import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitUnpacker; -import org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking.BitUnpackerImpl; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPerBytePacker; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpacker; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpackerImpl; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermsIndexBuilder.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java similarity index 97% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermsIndexBuilder.java rename to lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java index 43f4010b1ae6..4b5cf6e58b11 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/TestTermsIndexBuilder.java +++ 
b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; import java.util.HashMap; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPerBytePacker.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java similarity index 97% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPerBytePacker.java rename to lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java index a1a972d5ceb7..37dec6131975 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/BitPerBytePacker.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; import java.util.ArrayList; import org.apache.lucene.util.BytesRef; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/TestBitUnpackerImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java similarity index 97% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/TestBitUnpackerImpl.java rename to lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java index 7c493a661da1..f5fc4d12c143 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene90/randomaccess/bitpacking/TestBitUnpackerImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.lucene.sandbox.codecs.lucene90.randomaccess.bitpacking; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; From ece7710ec0a9d7bf871c0590ab8183cb65e9822a Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 6 Nov 2023 16:25:45 -0800 Subject: [PATCH 16/57] Implement compact generic byte-oriented BitPacker Also with a concrete implementation based on fixed-size byte[] --- .../randomaccess/bitpacking/BitPacker.java | 3 + .../bitpacking/BitPackerImplBase.java | 64 +++++++++++++++++++ .../FixedSizeByteArrayBitPacker.java | 41 ++++++++++++ .../bitpacking/BitPerBytePacker.java | 5 ++ .../bitpacking/TestBitPackerImpl.java | 53 +++++++++++++++ .../bitpacking/TestBitUnpackerImpl.java | 21 ++---- .../bitpacking/ValueAndBitWidth.java | 35 ++++++++++ 7 files changed, 206 insertions(+), 16 deletions(-) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/ValueAndBitWidth.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java index a06ca746d245..06dec80d70dc 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java @@ -22,4 +22,7 @@ public interface 
BitPacker { /** Pack the low `numBits` bits of `value` */ void add(long value, int numBits); + + /** Flush any pending byte */ + void flush(); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java new file mode 100644 index 000000000000..329192ed2c82 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; + +/** + * Implementation of {@link BitPacker}. The behavior that is abstracted out here is how to write a + * byte. This is useful as we can wire the byte-writing to byte[], stream or IndexInput, etc. + */ +abstract class BitPackerImplBase implements BitPacker { + private long totalNumBytesWritten; + private byte buffer; + private int bufferNumBitsUsed; + + abstract void writeByte(byte b); + + /** {@inheritDoc}. value could be larger than 2^numBits - 1 but the higher bits won't be used.
*/ + @Override + public void add(long value, int numBits) { + assert numBits < 64; + // clear bits higher than `numBits` + value &= (1L << numBits) - 1; + + while (numBits > 0) { + int bufferNumBitsRemaining = 8 - bufferNumBitsUsed; + if (numBits < bufferNumBitsRemaining) { + buffer |= (byte) (value << bufferNumBitsUsed); + bufferNumBitsUsed += numBits; + break; + } else { + long mask = (1L << bufferNumBitsRemaining) - 1; + buffer |= (byte) ((value & mask) << bufferNumBitsUsed); + numBits -= bufferNumBitsRemaining; + value >>>= bufferNumBitsRemaining; + writeByte(buffer); + totalNumBytesWritten += 1; + buffer = 0; + bufferNumBitsUsed = 0; + } + } + } + + @Override + public void flush() { + if (bufferNumBitsUsed > 0) { + writeByte(buffer); + bufferNumBitsUsed = 0; + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java new file mode 100644 index 000000000000..a8be9aca89bd --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; + +/** + * A {@link BitPacker} implementation that requires user to know the size of the resulting byte + * array upfront, in order to avoid allocation and copying for dynamically growing the array. + */ +public final class FixedSizeByteArrayBitPacker extends BitPackerImplBase { + private final byte[] bytes; + private int numBytesUsed = 0; + + public FixedSizeByteArrayBitPacker(int capacity) { + this.bytes = new byte[capacity]; + } + + @Override + void writeByte(byte b) { + assert numBytesUsed < bytes.length; + bytes[numBytesUsed++] = b; + } + + public byte[] getBytes() { + return bytes; + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java index 37dec6131975..2df2a74907e2 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java @@ -40,6 +40,11 @@ public void add(long value, int numBits) { } } + @Override + public void flush() { + // No-op as this impl writes a byte per bit + } + public byte[] getBytes() { byte[] bytes = new byte[totalNumBits]; int index = 0; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java new file mode 100644 index 000000000000..84ae93fe4e52 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java @@ -0,0 +1,53 @@ +/* + * 
Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; + +import java.util.Arrays; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestBitPackerImpl extends LuceneTestCase { + + public void testBasic() { + FixedSizeByteArrayBitPacker fixedSizeByteArrayBitPacker = new FixedSizeByteArrayBitPacker(5); + for (int i = 1; i <= 10; i++) { + fixedSizeByteArrayBitPacker.add(i, 4); + } + fixedSizeByteArrayBitPacker.flush(); + + byte[] expectedBytes = new byte[] {0x21, 0x43, 0x65, (byte) 0x87, (byte) 0xA9}; + assertArrayEquals(expectedBytes, fixedSizeByteArrayBitPacker.getBytes()); + } + + public void testRandom() { + ValueAndBitWidth[] randomInputs = ValueAndBitWidth.getRandomArray(random(), 1000); + int totalNumberBits = Arrays.stream(randomInputs).mapToInt(ValueAndBitWidth::bitWidth).sum(); + + BitPerBytePacker referencePacker = new BitPerBytePacker(); + int capacity = totalNumberBits / 8 + (totalNumberBits % 8 == 0 ? 
0 : 1); + FixedSizeByteArrayBitPacker fixedSizeByteArrayBitPacker = + new FixedSizeByteArrayBitPacker(capacity); + + for (ValueAndBitWidth x : randomInputs) { + referencePacker.add(x.value(), x.bitWidth()); + fixedSizeByteArrayBitPacker.add(x.value(), x.bitWidth()); + } + referencePacker.flush(); + fixedSizeByteArrayBitPacker.flush(); + assertArrayEquals(referencePacker.getCompactBytes(), fixedSizeByteArrayBitPacker.getBytes()); + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java index f5fc4d12c143..2cc106b669e2 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java @@ -33,30 +33,19 @@ public void testUnpackBasics() { } public void testRandom() { - ValueAndBitWidth[] expected = - random() - .longs(1000, 0, Long.MAX_VALUE) - .mapToObj( - val -> { - int bitWidth = random().nextInt(1, 64); - val &= (1L << bitWidth) - 1; - return new ValueAndBitWidth(val, bitWidth); - }) - .toArray(ValueAndBitWidth[]::new); + ValueAndBitWidth[] expected = ValueAndBitWidth.getRandomArray(random(), 1000); BitPerBytePacker referencePacker = new BitPerBytePacker(); for (var x : expected) { - referencePacker.add(x.value, x.bitWidth); + referencePacker.add(x.value(), x.bitWidth()); } BytesRef bytes = new BytesRef(referencePacker.getCompactBytes()); int startBitIndex = 0; for (var x : expected) { - long unpacked = BitUnpackerImpl.INSTANCE.unpack(bytes, startBitIndex, x.bitWidth); - startBitIndex += x.bitWidth; - assertEquals(x.value, unpacked); + long unpacked = BitUnpackerImpl.INSTANCE.unpack(bytes, startBitIndex, x.bitWidth()); + startBitIndex += x.bitWidth(); + assertEquals(x.value(), unpacked); } } - - 
private record ValueAndBitWidth(long value, int bitWidth) {} } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/ValueAndBitWidth.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/ValueAndBitWidth.java new file mode 100644 index 000000000000..40bee28660fa --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/ValueAndBitWidth.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; + +import java.util.Random; + +record ValueAndBitWidth(long value, int bitWidth) { + + static ValueAndBitWidth[] getRandomArray(Random random, int size) { + return random + .longs(size, 0, Long.MAX_VALUE) + .mapToObj( + val -> { + int bitWidth = random.nextInt(1, 64); + val &= (1L << bitWidth) - 1; + return new ValueAndBitWidth(val, bitWidth); + }) + .toArray(ValueAndBitWidth[]::new); + } +} From 0f3b5a17d8c44e738f003f1ab9a733b37235772b Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 6 Nov 2023 16:30:58 -0800 Subject: [PATCH 17/57] Fix issues identfied by precommit checks https://github.com/apache/lucene/actions/runs/6777264120/job/18420607690?pr=12688 --- .../lucene99/randomaccess/bitpacking/BitUnpackerImpl.java | 1 + .../codecs/lucene99/randomaccess/TestTermStateCodecImpl.java | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java index 44fa6af19887..84704c0b8787 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java @@ -19,6 +19,7 @@ import org.apache.lucene.util.BytesRef; +/** Implementation of {@link BitUnpacker} that works with compactly packed bits */ public class BitUnpackerImpl implements BitUnpacker { public static BitUnpackerImpl INSTANCE = new BitUnpackerImpl(); diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java index 1b7a20fad427..bd9249d48b95 100644 --- 
a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java @@ -82,7 +82,7 @@ public void testEncodeDecode() { // Also test decoding that doesn't begin at the start of the block. int pos = random().nextInt(termStatesArray.length); - int startBitIndex = random().nextInt(pos); + int startBitIndex = pos > 0 ? random().nextInt(pos) : 0; int recordSize = expectedDocFreqBitWidth + expectedDocStartFPBitWidth; // With bit-per-byte bytes dataBytes = From cc0751fc78e1a633a319371d0e7b7419f26fee9a Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 6 Nov 2023 19:43:19 -0800 Subject: [PATCH 18/57] Remove unused member field `totalNumBytesWritten` --- .../lucene99/randomaccess/bitpacking/BitPackerImplBase.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java index 329192ed2c82..5d5aea06dc57 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java @@ -22,7 +22,6 @@ * byte. This is useful as we can wire the byte-writing to byte[], stream or IndexInput, etc. 
*/ abstract class BitPackerImplBase implements BitPacker { - private long totalNumBytesWritten; private byte buffer; private int bufferNumBitsUsed; @@ -47,7 +46,6 @@ public void add(long value, int numBits) { numBits -= bufferNumBitsRemaining; value >>>= bufferNumBitsRemaining; writeByte(buffer); - totalNumBytesWritten += 1; buffer = 0; bufferNumBitsUsed = 0; } From 39e9e08f54b130974a457ec8827e0635f064b3d0 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 6 Nov 2023 21:59:43 -0800 Subject: [PATCH 19/57] Test TermStateCodecImpl with real compact bit-packer --- .../codecs/lucene99/randomaccess/TermStateCodecImpl.java | 1 + .../lucene99/randomaccess/TestTermStateCodecImpl.java | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java index 32ccde2fe286..fc3f9e5e1545 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java @@ -53,6 +53,7 @@ public byte[] encodeBlock(IntBlockTermState[] inputs, BitPacker bitPacker) { for (var termState : inputs) { encodeOne(bitPacker, termState, metadataPerComponent); } + bitPacker.flush(); return metadataBytes; } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java index bd9249d48b95..624445515459 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java @@ -22,6 +22,7 @@ import 
org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPerBytePacker; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpacker; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpackerImpl; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.FixedSizeByteArrayBitPacker; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; @@ -70,6 +71,12 @@ public void testEncodeDecode() { ByteArrayDataInput byteArrayDataInput = new ByteArrayDataInput(metadata, 2, 8); assertEquals(docStartFPBase, byteArrayDataInput.readLong()); + // Assert with real bit-packer we get the same bytes + FixedSizeByteArrayBitPacker fixedSizeByteArrayBitPacker = + new FixedSizeByteArrayBitPacker(bitPerBytePacker.getCompactBytes().length); + codec.encodeBlock(termStatesArray, fixedSizeByteArrayBitPacker); + assertArrayEquals(bitPerBytePacker.getCompactBytes(), fixedSizeByteArrayBitPacker.getBytes()); + // Assert that each term state is the same after the encode-decode roundtrip. 
BytesRef metadataBytes = new BytesRef(metadata); BytesRef dataBytes = new BytesRef(bitPerBytePacker.getBytes()); From 12f9c836f0f5dc5325b391e6b9da5002a49e16c3 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Tue, 7 Nov 2023 14:55:49 -0800 Subject: [PATCH 20/57] Implement TermStateCodecImpl.getCodec for (TermType, IndexOptions) --- .../randomaccess/TermStateCodecComponent.java | 42 ++++- .../randomaccess/TermStateCodecImpl.java | 84 +++++++++- .../lucene99/randomaccess/TermType.java | 8 +- .../randomaccess/TestTermStateCodecImpl.java | 144 ++++++++++++++++++ 4 files changed, 261 insertions(+), 17 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java index 0740f44ae720..8db1c4e81144 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java @@ -20,6 +20,16 @@ import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; abstract class TermStateCodecComponent { + private final String name; + + TermStateCodecComponent(String name) { + this.name = name; + } + + @Override + public String toString() { + return "TermStateCodecComponent{" + "name='" + name + '\'' + '}'; + } static byte getBitWidth(IntBlockTermState[] termStates, TermStateCodecComponent component) { assert termStates.length > 0; @@ -44,7 +54,9 @@ static byte getBitWidth(IntBlockTermState[] termStates, TermStateCodecComponent static final class SingletonDocId extends TermStateCodecComponent { public static SingletonDocId INSTANCE = new SingletonDocId(); - private SingletonDocId() {} + private SingletonDocId() { + super("SingletonDocId"); + } @Override public boolean isMonotonicallyIncreasing() { @@ -68,7 +80,9 @@ public void 
setTargetValue(IntBlockTermState termState, long value) { static final class DocFreq extends TermStateCodecComponent { public static DocFreq INSTANCE = new DocFreq(); - private DocFreq() {} + private DocFreq() { + super("DocFreq"); + } @Override public boolean isMonotonicallyIncreasing() { @@ -92,7 +106,9 @@ public void setTargetValue(IntBlockTermState termState, long value) { static final class TotalTermFreq extends TermStateCodecComponent { public static TotalTermFreq INSTANCE = new TotalTermFreq(); - private TotalTermFreq() {} + private TotalTermFreq() { + super("TotalTermFreq"); + } @Override public boolean isMonotonicallyIncreasing() { @@ -113,7 +129,9 @@ public void setTargetValue(IntBlockTermState termState, long value) { static final class DocStartFP extends TermStateCodecComponent { public static DocStartFP INSTANCE = new DocStartFP(); - private DocStartFP() {} + private DocStartFP() { + super("DocStartFP"); + } @Override public boolean isMonotonicallyIncreasing() { @@ -134,7 +152,9 @@ public void setTargetValue(IntBlockTermState termState, long value) { static final class PositionStartFP extends TermStateCodecComponent { public static PositionStartFP INSTANCE = new PositionStartFP(); - private PositionStartFP() {} + private PositionStartFP() { + super("PositionStartFP"); + } @Override public boolean isMonotonicallyIncreasing() { @@ -155,7 +175,9 @@ public void setTargetValue(IntBlockTermState termState, long value) { static final class PayloadStartFP extends TermStateCodecComponent { public static PayloadStartFP INSTANCE = new PayloadStartFP(); - private PayloadStartFP() {} + private PayloadStartFP() { + super("PayloadStartFP"); + } @Override public boolean isMonotonicallyIncreasing() { @@ -176,7 +198,9 @@ public void setTargetValue(IntBlockTermState termState, long value) { static final class SkipOffset extends TermStateCodecComponent { public static SkipOffset INSTANCE = new SkipOffset(); - private SkipOffset() {} + private SkipOffset() { + 
super("SkipOffset"); + } @Override public boolean isMonotonicallyIncreasing() { @@ -197,7 +221,9 @@ public void setTargetValue(IntBlockTermState termState, long value) { static final class LastPositionBlockOffset extends TermStateCodecComponent { public static LastPositionBlockOffset INSTANCE = new LastPositionBlockOffset(); - private LastPositionBlockOffset() {} + private LastPositionBlockOffset() { + super("LastPositionBlockOffset"); + } @Override public boolean isMonotonicallyIncreasing() { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java index fc3f9e5e1545..061f1c866a7b 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java @@ -17,7 +17,18 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +import java.util.ArrayList; +import java.util.Arrays; import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.DocFreq; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.DocStartFP; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.LastPositionBlockOffset; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.PayloadStartFP; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.PositionStartFP; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.SingletonDocId; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.SkipOffset; +import 
org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.TotalTermFreq; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPacker; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpacker; import org.apache.lucene.store.ByteArrayDataInput; @@ -28,11 +39,6 @@ final class TermStateCodecImpl implements TermStateCodec { private final TermStateCodecComponent[] components; private final int metadataBytesLength; - private static int getMetadataLength(TermStateCodecComponent component) { - // 1 byte for bitWidth; optionally 8 byte more for the reference value - return 1 + (component.isMonotonicallyIncreasing() ? 8 : 0); - } - public TermStateCodecImpl(TermStateCodecComponent[] components) { assert components.length > 0; @@ -44,6 +50,74 @@ public TermStateCodecImpl(TermStateCodecComponent[] components) { this.metadataBytesLength = metadataBytesLength; } + private static int getMetadataLength(TermStateCodecComponent component) { + // 1 byte for bitWidth; optionally 8 byte more for the reference value + return 1 + (component.isMonotonicallyIncreasing() ? 8 : 0); + } + + public static TermStateCodecImpl getCodec(TermType termType, IndexOptions indexOptions) { + assert indexOptions.ordinal() > IndexOptions.NONE.ordinal(); + // A term can't have skip data (has more than one block's worth of doc), + // while having a singleton doc at the same time! 
+ assert !(termType.hasSkipData() && termType.hasSingletonDoc()); + + ArrayList components = new ArrayList<>(); + // handle docs + if (termType.hasSingletonDoc()) { + components.add(SingletonDocId.INSTANCE); + } else { + components.add(DocStartFP.INSTANCE); + } + // handle skip data + if (termType.hasSkipData()) { + components.add(SkipOffset.INSTANCE); + } + // handle docFreq + boolean totalTermFeqAdded = false; + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS.ordinal()) { + if (termType.hasSingletonDoc()) { + components.add(TotalTermFreq.INSTANCE); + totalTermFeqAdded = true; + } else { + components.add(DocFreq.INSTANCE); + } + } + // handle positions + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + if (!totalTermFeqAdded) { + components.add(TotalTermFreq.INSTANCE); + } + components.add(PositionStartFP.INSTANCE); + if (termType.hasLastPositionBlockOffset()) { + components.add(LastPositionBlockOffset.INSTANCE); + } + } + // handle payload and offsets + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + components.add(PayloadStartFP.INSTANCE); + } + + return new TermStateCodecImpl(components.toArray(TermStateCodecComponent[]::new)); + } + + @Override + public String toString() { + return "TermStateCodecImpl{" + "components=" + Arrays.toString(components) + '}'; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + TermStateCodecImpl that = (TermStateCodecImpl) o; + return Arrays.equals(components, that.components); + } + + @Override + public int hashCode() { + return Arrays.hashCode(components); + } + @Override public byte[] encodeBlock(IntBlockTermState[] inputs, BitPacker bitPacker) { Metadata[] metadataPerComponent = getMetadataPerComponent(inputs); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java index c7fbd6089527..81e66540c08e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java @@ -31,7 +31,7 @@ final class TermType { private static final byte HAS_SKIP_DATA_MASK = (byte) 1 << 1; - private static final byte HAS_VINT_POSITION_BLOCK_MASK = (byte) 1 << 2; + private static final byte HAS_LAST_POSITION_BLOCK_OFFEST_MASK = (byte) 1 << 2; public static final int NUM_TOTAL_TYPES = 8; @@ -54,8 +54,8 @@ boolean hasSkipData() { return (this.flag & HAS_SKIP_DATA_MASK) > 0; } - boolean hasVintPositionBlock() { - return (this.flag & HAS_VINT_POSITION_BLOCK_MASK) > 0; + boolean hasLastPositionBlockOffset() { + return (this.flag & HAS_LAST_POSITION_BLOCK_OFFEST_MASK) > 0; } static TermType fromTermState(IntBlockTermState state) { @@ -67,7 +67,7 @@ static TermType fromTermState(IntBlockTermState state) { flag |= HAS_SKIP_DATA_MASK; } if (state.lastPosBlockOffset != -1) { - flag |= HAS_VINT_POSITION_BLOCK_MASK; + flag |= HAS_LAST_POSITION_BLOCK_OFFEST_MASK; } return new TermType(flag); } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java index 624445515459..175ac30e9407 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java @@ -19,6 +19,7 @@ import java.util.ArrayList; import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.IndexOptions; import 
org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPerBytePacker; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpacker; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpackerImpl; @@ -136,4 +137,147 @@ private static void assertBlockRoundTrip( assertEquals(termStatesArray[i].docStartFP, decoded.docStartFP); } } + + public void testGetCodec() { + for (IndexOptions indexOptions : IndexOptions.values()) { + if (indexOptions == IndexOptions.NONE) { + continue; + } + for (int i = 0; i < 8; i++) { + if ((i & 0b011) == 0b011) { + continue; + } + if ((i & 0b100) == 0b100 + && indexOptions.ordinal() < IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + continue; + } + TermType termType = TermType.fromId(i); + var expected = getExpectedCodec(termType, indexOptions); + var got = TermStateCodecImpl.getCodec(termType, indexOptions); + assertEquals(expected, got); + } + } + } + + // Enumerate the expected Codec we get for (TermType, IndexOptions) pairs. + static TermStateCodecImpl getExpectedCodec(TermType termType, IndexOptions indexOptions) { + ArrayList components = new ArrayList<>(); + // Wish I can code this better in java... 
+ switch (termType.getId()) { + // Not singleton doc; No skip data; No last position block offset + case 0b000 -> { + assert !termType.hasLastPositionBlockOffset() + && !termType.hasSkipData() + && !termType.hasSingletonDoc(); + components.add(TermStateCodecComponent.DocStartFP.INSTANCE); + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS.ordinal()) { + components.add(TermStateCodecComponent.DocFreq.INSTANCE); + } + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); + components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + } + if (indexOptions.ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + } + // Singleton doc; No skip data; No last position block offset + case 0b001 -> { + assert !termType.hasLastPositionBlockOffset() + && !termType.hasSkipData() + && termType.hasSingletonDoc(); + components.add(TermStateCodecComponent.SingletonDocId.INSTANCE); + // If field needs frequency, we need totalTermsFreq. + // Since there is only 1 doc, totalTermsFreq == docFreq. 
+ if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS.ordinal()) { + components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); + } + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + } + if (indexOptions.ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + } + + // Not Singleton doc; Has skip data; No last position block offset + case 0b010 -> { + assert !termType.hasLastPositionBlockOffset() + && termType.hasSkipData() + && !termType.hasSingletonDoc(); + components.add(TermStateCodecComponent.DocStartFP.INSTANCE); + components.add(TermStateCodecComponent.SkipOffset.INSTANCE); + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS.ordinal()) { + components.add(TermStateCodecComponent.DocFreq.INSTANCE); + } + if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); + components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + } + if (indexOptions.ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + } + // Singleton doc but has skip data; Invalid state. + case 0b011, 0b111 -> { + assert termType.hasSkipData() && termType.hasSingletonDoc(); + throw new IllegalStateException( + "Unreachable. A term has skip data but also only has one doc!? 
Must be a bug"); + } + // Not singleton doc; No skip data; Has last position block offset; + case 0b100 -> { + assert termType.hasLastPositionBlockOffset() + && !termType.hasSkipData() + && !termType.hasSingletonDoc(); + assert indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal(); + components.add(TermStateCodecComponent.DocStartFP.INSTANCE); + components.add(TermStateCodecComponent.DocFreq.INSTANCE); + components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); + components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + components.add(TermStateCodecComponent.LastPositionBlockOffset.INSTANCE); + if (indexOptions.ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + } + // Singleton doc; No skip data; Has last position block offset; + case 0b101 -> { + assert termType.hasLastPositionBlockOffset() + && !termType.hasSkipData() + && termType.hasSingletonDoc(); + assert indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal(); + components.add(TermStateCodecComponent.SingletonDocId.INSTANCE); + components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); + components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + components.add(TermStateCodecComponent.LastPositionBlockOffset.INSTANCE); + if (indexOptions.ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + } + // Not singleton doc; Has skip data; Has last position block offset; + case 0b110 -> { + assert termType.hasLastPositionBlockOffset() + && termType.hasSkipData() + && !termType.hasSingletonDoc(); + assert indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal(); + components.add(TermStateCodecComponent.DocStartFP.INSTANCE); + components.add(TermStateCodecComponent.SkipOffset.INSTANCE); + 
components.add(TermStateCodecComponent.DocFreq.INSTANCE); + components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); + components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + components.add(TermStateCodecComponent.LastPositionBlockOffset.INSTANCE); + if (indexOptions.ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } + } + default -> throw new IllegalStateException("Unreachable"); + } + + return new TermStateCodecImpl(components.toArray(TermStateCodecComponent[]::new)); + } } From 402965ff5d453754ec02cd15ba5ecad78c63db39 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Thu, 9 Nov 2023 13:42:35 -0800 Subject: [PATCH 21/57] Implement term (type, ord) lookup in TermsIndex --- .../codecs/lucene99/randomaccess/TermsIndex.java | 15 ++++++++++++++- .../randomaccess/TestTermsIndexBuilder.java | 11 +++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java index a4b67f527275..e43e495a48ef 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java @@ -17,6 +17,19 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +import java.io.IOException; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.Util; -record TermsIndex(FST fst) {} +record TermsIndex(FST fst) { + + TypeAndOrd getTerm(BytesRef term) throws IOException { + long encoded = Util.get(fst, term); + TermType termType = TermType.fromId((int) ((encoded & 0b1110L) >>> 1)); + long ord = encoded >>> 4; + return new TypeAndOrd(termType, ord); + } + + public record 
TypeAndOrd(TermType termType, long ord) {} +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java index 4b5cf6e58b11..7179c23d1d7e 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java @@ -22,8 +22,6 @@ import java.util.Map; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.Util; public class TestTermsIndexBuilder extends LuceneTestCase { @@ -51,15 +49,12 @@ public void testBasics() throws IOException { } TermsIndex termsIndex = builder.build(); - FST fst = termsIndex.fst(); - for (String term : test_terms) { BytesRef termBytes = new BytesRef(term); - long encoded = Util.get(fst, termBytes); + TermsIndex.TypeAndOrd typeAndOrd = termsIndex.getTerm(termBytes); - assertEquals(1L, encoded & 0b1L); - assertEquals((long) termsToType.get(term), (encoded & 0b1110L) >> 1); - assertEquals((long) termsToOrd.get(term), encoded >> 4); + assertEquals(termsToType.get(term).intValue(), typeAndOrd.termType().getId()); + assertEquals((long) termsToOrd.get(term), typeAndOrd.ord()); } } } From 3ce5ea9196e1ea56e69d228cbf2d777a5a27f114 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 13 Nov 2023 10:54:47 -0800 Subject: [PATCH 22/57] create sub-package `termdict` to hold term dictionary implementions --- lucene/sandbox/src/java/module-info.java | 1 + .../codecs/lucene99/randomaccess/TermType.java | 14 +++++++------- .../{ => termdict}/TermsIndex.java | 3 ++- .../{ => termdict}/TermsIndexBuilder.java | 3 ++- .../randomaccess/termdict/package-info.java | 18 ++++++++++++++++++ .../{ => termdict}/TestTermsIndexBuilder.java | 3 ++- 
6 files changed, 32 insertions(+), 10 deletions(-) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/{ => termdict}/TermsIndex.java (89%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/{ => termdict}/TermsIndexBuilder.java (95%) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/package-info.java rename lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/{ => termdict}/TestTermsIndexBuilder.java (93%) diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index 45b66e7c353e..2d9d6d31fc65 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -28,6 +28,7 @@ exports org.apache.lucene.sandbox.queries; exports org.apache.lucene.sandbox.search; exports org.apache.lucene.sandbox.index; + exports org.apache.lucene.sandbox.codecs.lucene99.randomaccess.termdict; provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.sandbox.codecs.idversion.IDVersionPostingsFormat; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java index 81e66540c08e..69e1150391a2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java @@ -26,7 +26,7 @@ *

It captures -- 1) if a term has a singleton docid (i.e. only one doc contains this term). 2) * if the term has skip data. 3) if the term has an VINT encoded position block. */ -final class TermType { +public final class TermType { private static final byte SINGLETON_DOC_MASK = (byte) 1; private static final byte HAS_SKIP_DATA_MASK = (byte) 1 << 1; @@ -41,24 +41,24 @@ private TermType(byte flag) { this.flag = flag; } - int getId() { + public int getId() { assert this.flag >= 0 && this.flag <= 8; return this.flag; } - boolean hasSingletonDoc() { + public boolean hasSingletonDoc() { return (this.flag & SINGLETON_DOC_MASK) > 0; } - boolean hasSkipData() { + public boolean hasSkipData() { return (this.flag & HAS_SKIP_DATA_MASK) > 0; } - boolean hasLastPositionBlockOffset() { + public boolean hasLastPositionBlockOffset() { return (this.flag & HAS_LAST_POSITION_BLOCK_OFFEST_MASK) > 0; } - static TermType fromTermState(IntBlockTermState state) { + public static TermType fromTermState(IntBlockTermState state) { byte flag = 0; if (state.singletonDocID != -1) { flag |= SINGLETON_DOC_MASK; @@ -72,7 +72,7 @@ static TermType fromTermState(IntBlockTermState state) { return new TermType(flag); } - static TermType fromId(int id) { + public static TermType fromId(int id) { if (id < 0 || id > 8) { throw new IllegalArgumentException("id must be within range [0, 8]"); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndex.java similarity index 89% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndex.java index e43e495a48ef..0c2035c6e715 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java +++ 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndex.java @@ -15,9 +15,10 @@ * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.termdict; import java.io.IOException; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermType; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.Util; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndexBuilder.java similarity index 95% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndexBuilder.java index 9484a0505458..a49d00566bd2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndexBuilder.java @@ -15,10 +15,11 @@ * limitations under the License. 
*/ -package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.termdict; import java.io.IOException; import java.util.Arrays; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermType; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.FST; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/package-info.java new file mode 100644 index 000000000000..45b415b7f510 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/package-info.java @@ -0,0 +1,18 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** Class for term dictionary implementation. 
*/ +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.termdict; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TestTermsIndexBuilder.java similarity index 93% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java rename to lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TestTermsIndexBuilder.java index 7179c23d1d7e..d1a665ed9867 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TestTermsIndexBuilder.java @@ -15,11 +15,12 @@ * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.termdict; import java.io.IOException; import java.util.HashMap; import java.util.Map; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermType; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; From fd9beca452f0fef4870414c0e841d9656c642bd3 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 13 Nov 2023 13:54:27 -0800 Subject: [PATCH 23/57] Revert "create sub-package `termdict` to hold term dictionary implementions" This reverts commit 3ce5ea9196e1ea56e69d228cbf2d777a5a27f114. 
Reverting because I want to reduce what gets exposed to the rest of the project --- lucene/sandbox/src/java/module-info.java | 1 - .../codecs/lucene99/randomaccess/TermType.java | 14 +++++++------- .../{termdict => }/TermsIndex.java | 3 +-- .../{termdict => }/TermsIndexBuilder.java | 3 +-- .../randomaccess/termdict/package-info.java | 18 ------------------ .../{termdict => }/TestTermsIndexBuilder.java | 3 +-- 6 files changed, 10 insertions(+), 32 deletions(-) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/{termdict => }/TermsIndex.java (89%) rename lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/{termdict => }/TermsIndexBuilder.java (95%) delete mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/package-info.java rename lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/{termdict => }/TestTermsIndexBuilder.java (93%) diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index 2d9d6d31fc65..45b66e7c353e 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -28,7 +28,6 @@ exports org.apache.lucene.sandbox.queries; exports org.apache.lucene.sandbox.search; exports org.apache.lucene.sandbox.index; - exports org.apache.lucene.sandbox.codecs.lucene99.randomaccess.termdict; provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.sandbox.codecs.idversion.IDVersionPostingsFormat; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java index 69e1150391a2..81e66540c08e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java @@ -26,7 
+26,7 @@ *

It captures -- 1) if a term has a singleton docid (i.e. only one doc contains this term). 2) * if the term has skip data. 3) if the term has an VINT encoded position block. */ -public final class TermType { +final class TermType { private static final byte SINGLETON_DOC_MASK = (byte) 1; private static final byte HAS_SKIP_DATA_MASK = (byte) 1 << 1; @@ -41,24 +41,24 @@ private TermType(byte flag) { this.flag = flag; } - public int getId() { + int getId() { assert this.flag >= 0 && this.flag <= 8; return this.flag; } - public boolean hasSingletonDoc() { + boolean hasSingletonDoc() { return (this.flag & SINGLETON_DOC_MASK) > 0; } - public boolean hasSkipData() { + boolean hasSkipData() { return (this.flag & HAS_SKIP_DATA_MASK) > 0; } - public boolean hasLastPositionBlockOffset() { + boolean hasLastPositionBlockOffset() { return (this.flag & HAS_LAST_POSITION_BLOCK_OFFEST_MASK) > 0; } - public static TermType fromTermState(IntBlockTermState state) { + static TermType fromTermState(IntBlockTermState state) { byte flag = 0; if (state.singletonDocID != -1) { flag |= SINGLETON_DOC_MASK; @@ -72,7 +72,7 @@ public static TermType fromTermState(IntBlockTermState state) { return new TermType(flag); } - public static TermType fromId(int id) { + static TermType fromId(int id) { if (id < 0 || id > 8) { throw new IllegalArgumentException("id must be within range [0, 8]"); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java similarity index 89% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndex.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java index 0c2035c6e715..e43e495a48ef 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndex.java +++ 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java @@ -15,10 +15,9 @@ * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.termdict; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; -import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermType; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.Util; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndexBuilder.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java similarity index 95% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndexBuilder.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java index a49d00566bd2..9484a0505458 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TermsIndexBuilder.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java @@ -15,11 +15,10 @@ * limitations under the License. 
*/ -package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.termdict; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; import java.util.Arrays; -import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermType; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.FST; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/package-info.java deleted file mode 100644 index 45b415b7f510..000000000000 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/package-info.java +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** Class for term dictionary implementation. 
*/ -package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.termdict; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TestTermsIndexBuilder.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java similarity index 93% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TestTermsIndexBuilder.java rename to lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java index d1a665ed9867..7179c23d1d7e 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/termdict/TestTermsIndexBuilder.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java @@ -15,12 +15,11 @@ * limitations under the License. */ -package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.termdict; +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; import java.util.HashMap; import java.util.Map; -import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermType; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; From b3bf288dacea74e38abcf1a0d5e9256410b0fd17 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Thu, 16 Nov 2023 14:52:26 -0800 Subject: [PATCH 24/57] Setup sketch implementations for RandomAccessTermsDict This commit implemented following building blocks: * TermData -- represent the bitpacked termstate data. it exposes a get by term ordinal API. * TermDataWriter -- incrementally write termstate data as index files. misc. extended a few interfaces to expose information needed to implment term data lookup. 
--- .../randomaccess/ByteArrayByteSlice.java | 55 ++++++++ .../lucene99/randomaccess/ByteSlice.java | 32 +++++ .../RandomAccessInputByteSlice.java | 58 ++++++++ .../randomaccess/RandomAccessTermsDict.java | 22 ++++ .../lucene99/randomaccess/TermData.java | 94 +++++++++++++ .../lucene99/randomaccess/TermDataWriter.java | 94 +++++++++++++ .../lucene99/randomaccess/TermStateCodec.java | 20 ++- .../randomaccess/TermStateCodecComponent.java | 7 +- .../randomaccess/TermStateCodecImpl.java | 29 ++-- .../lucene99/randomaccess/TermsDataStore.java | 48 +++++++ .../lucene99/randomaccess/TermsImpl.java | 101 ++++++++++++++ .../lucene99/randomaccess/TermsIndex.java | 19 +++ .../lucene99/randomaccess/TermsStats.java | 29 ++++ .../randomaccess/bitpacking/BitPacker.java | 6 +- .../bitpacking/BitPackerImplBase.java | 8 +- .../bitpacking/DataOutputBitPacker.java | 44 +++++++ .../randomaccess/TestTermDataWriter.java | 124 ++++++++++++++++++ .../TestTermStateCodecComponent.java | 5 +- .../randomaccess/TestTermStateCodecImpl.java | 99 ++++++++------ .../bitpacking/BitPerBytePacker.java | 4 + .../bitpacking/TestBitPackerImpl.java | 5 +- 21 files changed, 846 insertions(+), 57 deletions(-) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java create mode 100644 
lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsDataStore.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/DataOutputBitPacker.java create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java new file mode 100644 index 000000000000..55139ebf3a32 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.BitUtil; + +final class ByteArrayByteSlice implements ByteSlice { + private final byte[] bytes; + + ByteArrayByteSlice(byte[] bytes) { + this.bytes = bytes; + } + + @Override + public long size() { + return bytes.length; + } + + @Override + public void writeAll(DataOutput output) throws IOException { + output.writeBytes(bytes, bytes.length); + } + + @Override + public long getLong(long pos) { + return (long) BitUtil.VH_LE_LONG.get(bytes, (int) pos); + } + + @Override + public byte[] getBytes(long pos, int length) { + if (length == 0) { + return new byte[0]; + } + byte[] result = new byte[length]; + System.arraycopy(bytes, (int) pos, result, 0, length); + return result; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java new file mode 100644 index 000000000000..937e915e3325 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; + +/** A slice of bytes */ +interface ByteSlice { + long size(); + + void writeAll(DataOutput output) throws IOException; + + long getLong(long pos) throws IOException; + + byte[] getBytes(long pos, int length) throws IOException; +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java new file mode 100644 index 000000000000..3d80e50dd383 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.RandomAccessInput; + +final class RandomAccessInputByteSlice implements ByteSlice { + private final RandomAccessInput randomAccessInput; + + RandomAccessInputByteSlice(RandomAccessInput randomAccessInput) { + this.randomAccessInput = randomAccessInput; + } + + @Override + public long size() { + return randomAccessInput.length(); + } + + @Override + public void writeAll(DataOutput output) throws IOException { + for (long pos = 0; pos < randomAccessInput.length(); pos++) { + // For buffered inputs and outputs this should be fine. + output.writeByte(randomAccessInput.readByte(pos)); + } + } + + @Override + public long getLong(long pos) throws IOException { + return randomAccessInput.readLong(pos); + } + + @Override + public byte[] getBytes(long pos, int length) throws IOException { + if (length == 0) { + return new byte[0]; + } + byte[] result = new byte[length]; + randomAccessInput.readBytes(pos, result, 0, length); + return result; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java new file mode 100644 index 000000000000..26451dd9f938 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +/** A term dictionary that offer random-access to read a specific term */ +record RandomAccessTermsDict( + TermsStats termsStats, TermsIndex termsIndex, TermsDataStore termsDataStore) {} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java new file mode 100644 index 000000000000..4e8f79738e59 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpackerImpl; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.BytesRef; + +/** + * Holds the bit-packed {@link IntBlockTermState} for a given {@link + * org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermType} + */ +record TermData(TermType termType, ByteSlice metadata, ByteSlice data) { + + IntBlockTermState getTermState(TermStateCodec codec, long ord) throws IOException { + long blockId = ord / TermDataWriter.NUM_TERMS_PER_BLOCK; + long metadataStartPos = blockId * (codec.getMetadataBytesLength() + 8); + long dataStartPos = metadata.getLong(metadataStartPos); + BytesRef metadataBytesRef = + new BytesRef(metadata.getBytes(metadataStartPos + 8, codec.getMetadataBytesLength())); + + int numBitsPerRecord = codec.getNumBitsPerRecord(metadataBytesRef); + int dataBitIndex = numBitsPerRecord * ((int) (ord % TermDataWriter.NUM_TERMS_PER_BLOCK)); + int startBitIndex = dataBitIndex % 8; + int numBytesToRead = (startBitIndex + numBitsPerRecord) / 8; + if ((startBitIndex + numBitsPerRecord) % 8 > 0) { + numBytesToRead += 1; + } + BytesRef dataBytesRef = + new BytesRef(data.getBytes(dataStartPos + dataBitIndex / 8, numBytesToRead)); + + return codec.decodeAt(metadataBytesRef, dataBytesRef, BitUnpackerImpl.INSTANCE, startBitIndex); + } + + static TermData deserializeOnHeap( + DataInput metaInput, DataInput metadataInput, DataInput dataInput) throws IOException { + TermType termType = TermType.fromId(metaInput.readByte()); + long metadataSize = metaInput.readVLong(); + long dataSize = metaInput.readVLong(); + + if (metadataSize > Integer.MAX_VALUE) { + throw new IllegalArgumentException( + 
"Metadata size it too large to store on heap. Must be less than " + Integer.MAX_VALUE); + } + if (dataSize > Integer.MAX_VALUE) { + throw new IllegalArgumentException( + "Data size it too large to store on heap.Must be less than " + Integer.MAX_VALUE); + } + + byte[] metadataBytes = new byte[(int) metadataSize]; + byte[] dataBytes = new byte[(int) dataSize]; + + metadataInput.readBytes(metadataBytes, 0, metadataBytes.length); + dataInput.readBytes(dataBytes, 0, dataBytes.length); + + return new TermData( + termType, new ByteArrayByteSlice(metadataBytes), new ByteArrayByteSlice(dataBytes)); + } + + static TermData deserializeOffHeap( + DataInput metaInput, IndexInput metadataInput, IndexInput dataInput) throws IOException { + TermType termType = TermType.fromId(metaInput.readByte()); + long metadataSize = metaInput.readVLong(); + long dataSize = metaInput.readVLong(); + + RandomAccessInput metadata = + metadataInput.randomAccessSlice(metadataInput.getFilePointer(), metadataSize); + metadataInput.skipBytes(metadataSize); + RandomAccessInput data = dataInput.randomAccessSlice(dataInput.getFilePointer(), dataSize); + dataInput.skipBytes(dataSize); + + return new TermData( + termType, new RandomAccessInputByteSlice(metadata), new RandomAccessInputByteSlice(data)); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java new file mode 100644 index 000000000000..09ab3cba9242 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.DataOutputBitPacker; +import org.apache.lucene.store.DataOutput; + +/** Writes TermData to */ +final class TermDataWriter { + static final int NUM_TERMS_PER_BLOCK = 256; + + private final TermStateCodec termStateCodec; + + private final IntBlockTermStateBuffer buffer = new IntBlockTermStateBuffer(NUM_TERMS_PER_BLOCK); + + private final DataOutput metadataOut; + private final DataOutputBitPacker dataOutputBitPacker; + + private long totalMetaDataBytesWritten; + + TermDataWriter(TermStateCodec termStateCodec, DataOutput metadataOut, DataOutput dataOut) { + this.termStateCodec = termStateCodec; + this.metadataOut = metadataOut; + this.dataOutputBitPacker = new DataOutputBitPacker(dataOut); + } + + void addTermState(IntBlockTermState termState) throws IOException { + buffer.add(termState); + if (buffer.numUsed == NUM_TERMS_PER_BLOCK) { + writeBlock(); + } + } + + void finish() throws IOException { + if (buffer.numUsed > 0) { + writeBlock(); + } + } + + long getTotalMetaDataBytesWritten() { + return totalMetaDataBytesWritten; + } + + long getTotalDataBytesWritten() { + return dataOutputBitPacker.getNumBytesWritten(); + } + + private void 
writeBlock() throws IOException { + metadataOut.writeLong(dataOutputBitPacker.getNumBytesWritten()); + byte[] metadata = + termStateCodec.encodeBlockUpTo(buffer.elements, buffer.numUsed, dataOutputBitPacker); + metadataOut.writeBytes(metadata, metadata.length); + totalMetaDataBytesWritten += metadata.length + 8; + buffer.clear(); + } + + /** act like a minial ArrayList, but provide access to the internal array */ + static class IntBlockTermStateBuffer { + IntBlockTermState[] elements; + int numUsed; + + IntBlockTermStateBuffer(int capacity) { + this.elements = new IntBlockTermState[capacity]; + } + + void add(IntBlockTermState termState) { + elements[numUsed++] = termState; + } + + void clear() { + for (int i = 0; i < numUsed; i++) { + elements[i] = null; + } + numUsed = 0; + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java index a28fb1a94b65..283512c7ae6a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java @@ -17,6 +17,7 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +import java.io.IOException; import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPacker; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpacker; @@ -24,13 +25,30 @@ interface TermStateCodec { + /** Get the number of bytes that the metadata per block needs. */ + int getMetadataBytesLength(); + + /** Get the number of bits per data record within the block, based on the provided metadata. 
*/ + int getNumBitsPerRecord(BytesRef metadataBytes); + /** * Encode the sequence of {@link IntBlockTermState}s with the given bitPacker into a block of * bytes. * * @return the metadata associated with the encoded bytes */ - byte[] encodeBlock(IntBlockTermState[] inputs, BitPacker bitPacker); + default byte[] encodeBlock(IntBlockTermState[] inputs, BitPacker bitPacker) throws IOException { + return encodeBlockUpTo(inputs, inputs.length, bitPacker); + } + + /** + * Encode the sequence of {@link IntBlockTermState}s up to length, with the given bitPacker into a + * block of bytes. + * + * @return the metadata associated with the encoded bytes + */ + byte[] encodeBlockUpTo(IntBlockTermState[] inputs, int upto, BitPacker bitPacker) + throws IOException; /** * Decode out a {@link IntBlockTermState} with the provided bit-unpacker, metadata byte slice and diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java index 8db1c4e81144..8545cce8e8c3 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java @@ -31,14 +31,17 @@ public String toString() { return "TermStateCodecComponent{" + "name='" + name + '\'' + '}'; } - static byte getBitWidth(IntBlockTermState[] termStates, TermStateCodecComponent component) { + static byte getBitWidth( + IntBlockTermState[] termStates, int upTo, TermStateCodecComponent component) { assert termStates.length > 0; + assert upTo > 0 && upTo <= termStates.length; long maxValSeen = -1; long referenceValue = component.isMonotonicallyIncreasing() ? 
component.getTargetValue(termStates[0]) : 0; - for (var termState : termStates) { + for (int i = 0; i < upTo; i++) { + var termState = termStates[i]; maxValSeen = Math.max(maxValSeen, component.getTargetValue(termState) - referenceValue); } return (byte) (64 - Long.numberOfLeadingZeros(maxValSeen)); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java index 061f1c866a7b..3dc0a69f0c05 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java @@ -17,6 +17,7 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; @@ -50,6 +51,16 @@ public TermStateCodecImpl(TermStateCodecComponent[] components) { this.metadataBytesLength = metadataBytesLength; } + @Override + public int getMetadataBytesLength() { + return metadataBytesLength; + } + + @Override + public int getNumBitsPerRecord(BytesRef metadataBytes) { + return deserializedMetadata(metadataBytes).totalBitsPerTermState; + } + private static int getMetadataLength(TermStateCodecComponent component) { // 1 byte for bitWidth; optionally 8 byte more for the reference value return 1 + (component.isMonotonicallyIncreasing() ? 
8 : 0); @@ -119,24 +130,25 @@ public int hashCode() { } @Override - public byte[] encodeBlock(IntBlockTermState[] inputs, BitPacker bitPacker) { - Metadata[] metadataPerComponent = getMetadataPerComponent(inputs); + public byte[] encodeBlockUpTo(IntBlockTermState[] inputs, int uptop, BitPacker bitPacker) + throws IOException { + Metadata[] metadataPerComponent = getMetadataPerComponent(inputs, uptop); byte[] metadataBytes = serializeMetadata(metadataPerComponent); // Encode inputs via the bitpacker - for (var termState : inputs) { - encodeOne(bitPacker, termState, metadataPerComponent); + for (int i = 0; i < uptop; i++) { + encodeOne(bitPacker, inputs[i], metadataPerComponent); } bitPacker.flush(); return metadataBytes; } - private Metadata[] getMetadataPerComponent(IntBlockTermState[] inputs) { - Metadata[] metadataPerComponent = new Metadata[components.length]; + private Metadata[] getMetadataPerComponent(IntBlockTermState[] inputs, int upTo) { + Metadata[] metadataPerComponent = new Metadata[upTo]; for (int i = 0; i < components.length; i++) { var component = components[i]; - byte bitWidth = TermStateCodecComponent.getBitWidth(inputs, component); + byte bitWidth = TermStateCodecComponent.getBitWidth(inputs, upTo, component); long referenceValue = component.isMonotonicallyIncreasing() ? 
component.getTargetValue(inputs[0]) : 0L; metadataPerComponent[i] = new Metadata(bitWidth, referenceValue); @@ -159,7 +171,8 @@ private byte[] serializeMetadata(Metadata[] metadataPerComponent) { } private void encodeOne( - BitPacker bitPacker, IntBlockTermState termState, Metadata[] metadataPerComponent) { + BitPacker bitPacker, IntBlockTermState termState, Metadata[] metadataPerComponent) + throws IOException { for (int i = 0; i < components.length; i++) { var component = components[i]; var metadata = metadataPerComponent[i]; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsDataStore.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsDataStore.java new file mode 100644 index 000000000000..1717d26aa780 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsDataStore.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.util.Arrays; + +/** Holds all {@link TermData} for all {@link TermType} for a field. 
*/ +class TermsDataStore { + private final TermData[] dataPerTermType; + + private TermsDataStore(TermData[] dataPerTermType) { + this.dataPerTermType = dataPerTermType; + } + + static class Builder { + private final TermData[] dataPerTermType; + + Builder() { + dataPerTermType = new TermData[TermType.NUM_TOTAL_TYPES]; + Arrays.fill(dataPerTermType, null); + } + + void add(TermData termData) { + assert dataPerTermType[termData.termType().getId()] == null; + + dataPerTermType[termData.termType().getId()] = termData; + } + + TermsDataStore build() { + return new TermsDataStore(dataPerTermType); + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java new file mode 100644 index 000000000000..29a9c4124e7b --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.CompiledAutomaton; + +class TermsImpl extends Terms { + private final FieldInfo fieldInfo; + + private final RandomAccessTermsDict termsDict; + + public TermsImpl(TermsStats stats, FieldInfo fieldInfo, RandomAccessTermsDict termsDict) { + this.fieldInfo = fieldInfo; + this.termsDict = termsDict; + } + + @Override + public long size() throws IOException { + return termsDict.termsStats().size(); + } + + @Override + public long getSumTotalTermFreq() throws IOException { + return termsDict.termsStats().sumTotalTermFreq(); + } + + @Override + public long getSumDocFreq() throws IOException { + return termsDict.termsStats().sumDocFreq(); + } + + @Override + public int getDocCount() throws IOException { + return termsDict.termsStats().docCount(); + } + + @Override + public boolean hasFreqs() { + return fieldInfo.getIndexOptions().ordinal() >= IndexOptions.DOCS_AND_FREQS.ordinal(); + } + + @Override + public boolean hasOffsets() { + return fieldInfo.getIndexOptions().ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal(); + } + + @Override + public boolean hasPositions() { + return fieldInfo.getIndexOptions().ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal(); + } + + @Override + public boolean hasPayloads() { + return fieldInfo.hasPayloads(); + } + + @Override + public BytesRef getMin() throws IOException { + return termsDict.termsStats().minTerm(); + } + + @Override + public BytesRef getMax() throws IOException { + return termsDict.termsStats().maxTerm(); + } + + @Override + public TermsEnum iterator() throws IOException { + // TODO: implement me + return null; + } + + @Override + 
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + // TODO: implement me + return null; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java index e43e495a48ef..917989b51b43 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java @@ -18,8 +18,12 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.OffHeapFSTStore; +import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.Util; record TermsIndex(FST fst) { @@ -32,4 +36,19 @@ TypeAndOrd getTerm(BytesRef term) throws IOException { } public record TypeAndOrd(TermType termType, long ord) {} + + public void serialize(DataOutput metaOut, DataOutput dataOut) throws IOException { + fst.save(metaOut, dataOut); + } + + public TermsIndex deserialize(DataInput metaIn, DataInput dataIn, boolean loadOffHeap) + throws IOException { + FST fst; + if (loadOffHeap) { + fst = new FST<>(metaIn, dataIn, PositiveIntOutputs.getSingleton(), new OffHeapFSTStore()); + } else { + fst = new FST<>(metaIn, dataIn, PositiveIntOutputs.getSingleton()); + } + return new TermsIndex(fst); + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java new file mode 100644 index 000000000000..d5156d7455e9 --- /dev/null +++ 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import org.apache.lucene.util.BytesRef; + +/** Data class that holds starts for term stats for a field */ +record TermsStats( + long size, + long sumTotalTermFreq, + long sumDocFreq, + int docCount, + BytesRef minTerm, + BytesRef maxTerm) {} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java index 06dec80d70dc..1ad8b0fb36e8 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java @@ -17,12 +17,14 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; +import java.io.IOException; + /** Interface for bit-packing */ public interface BitPacker { /** Pack the low `numBits` bits of `value` */ - void add(long value, int numBits); + 
void add(long value, int numBits) throws IOException; /** Flush any pending byte */ - void flush(); + void flush() throws IOException; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java index 5d5aea06dc57..dc405c717072 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java @@ -17,6 +17,8 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; +import java.io.IOException; + /** * Implementation of {@link BitPacker}. The behavior the is abstracted out here is how to write a * byte. This is useful as we can wire the byte-writing to byte[], stream or IndexInput, etc. @@ -25,11 +27,11 @@ abstract class BitPackerImplBase implements BitPacker { private byte buffer; private int bufferNumBitsUsed; - abstract void writeByte(byte b); + abstract void writeByte(byte b) throws IOException; /** {@inheritDoc}. value could be larger than 2^numBits - 1 but the higher bits won't be used. 
*/ @Override - public void add(long value, int numBits) { + public void add(long value, int numBits) throws IOException { assert numBits < 64; // clear bits higher than `numBits` value &= (1L << numBits) - 1; @@ -53,7 +55,7 @@ public void add(long value, int numBits) { } @Override - public void flush() { + public void flush() throws IOException { if (bufferNumBitsUsed > 0) { writeByte(buffer); bufferNumBitsUsed = 0; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/DataOutputBitPacker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/DataOutputBitPacker.java new file mode 100644 index 000000000000..8e92b9faa326 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/DataOutputBitPacker.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; + +/** + * A {@link BitPacker} implementation that writes to a {@link org.apache.lucene.store.DataOutput} + */ +public final class DataOutputBitPacker extends BitPackerImplBase { + private final DataOutput dataOut; + + private long numBytesWritten; + + public DataOutputBitPacker(DataOutput dataOut) { + this.dataOut = dataOut; + } + + @Override + void writeByte(byte b) throws IOException { + dataOut.writeByte(b); + numBytesWritten++; + } + + public long getNumBytesWritten() { + return numBytesWritten; + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java new file mode 100644 index 000000000000..aab496a41937 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TestTermStateCodecImpl.TermStateTestFixture; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPerBytePacker; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestTermDataWriter extends LuceneTestCase { + + public void testWriterAndDeserialize() throws IOException { + TermStateTestFixture testFixture = TestTermStateCodecImpl.getTermStateTestFixture(777); + TermType expectedTermType = TermType.fromId(7); + + try (Directory testDir = newDirectory()) { + IndexOutput metaOut = testDir.createOutput("segment_meta", IOContext.DEFAULT); + IndexOutput metadataOut = testDir.createOutput("term_meta_1", IOContext.DEFAULT); + IndexOutput dataOut = testDir.createOutput("term_data_11", IOContext.DEFAULT); + TermDataWriter writer = new TermDataWriter(testFixture.codec(), metadataOut, dataOut); + for (var termState : testFixture.termStatesArray()) { + writer.addTermState(termState); + } + writer.finish(); + metaOut.writeByte((byte) expectedTermType.getId()); + metaOut.writeVLong(writer.getTotalMetaDataBytesWritten()); + metaOut.writeVLong(writer.getTotalDataBytesWritten()); + metaOut.close(); + metadataOut.close(); + dataOut.close(); + + BitPerBytePacker referenceBitPacker = new BitPerBytePacker(); + // total size 777; there will be 4 blocks total. + // The extra 8 byte per block is the long offset for where the block starts within data bytes. 
+ byte[] expectedMetadata = new byte[(testFixture.codec().getMetadataBytesLength() + 8) * 4]; + ByteArrayDataOutput expectedMetadataOut = new ByteArrayDataOutput(expectedMetadata); + for (int start = 0; + start < testFixture.termStatesArray().length; + start += TermDataWriter.NUM_TERMS_PER_BLOCK) { + expectedMetadataOut.writeLong(referenceBitPacker.getCompactBytes().length); + byte[] metadata = + testFixture + .codec() + .encodeBlock( + Arrays.copyOfRange( + testFixture.termStatesArray(), + start, + Math.min( + start + TermDataWriter.NUM_TERMS_PER_BLOCK, + testFixture.termStatesArray().length)), + referenceBitPacker); + expectedMetadataOut.writeBytes(metadata, 0, metadata.length); + } + ByteSlice expectedDataSlice = new ByteArrayByteSlice(referenceBitPacker.getCompactBytes()); + ByteSlice expectedMetadataSlice = new ByteArrayByteSlice(expectedMetadata); + TermData expected = new TermData(expectedTermType, expectedMetadataSlice, expectedDataSlice); + + IndexInput metaIn = testDir.openInput("segment_meta", IOContext.DEFAULT); + IndexInput metadataIn = testDir.openInput("term_meta_1", IOContext.DEFAULT); + IndexInput dataIn = testDir.openInput("term_data_11", IOContext.DEFAULT); + + TermData actual = + TermData.deserializeOnHeap(metaIn.clone(), metadataIn.clone(), dataIn.clone()); + assertEquals(expected.termType().getId(), actual.termType().getId()); + assertByteSlice(expected.metadata(), actual.metadata()); + assertByteSlice(expected.data(), actual.data()); + testDecodeTermState(testFixture, actual); + + actual = TermData.deserializeOffHeap(metaIn.clone(), metadataIn.clone(), dataIn.clone()); + assertEquals(expected.termType().getId(), actual.termType().getId()); + assertByteSlice(expected.metadata(), actual.metadata()); + assertByteSlice(expected.data(), actual.data()); + testDecodeTermState(testFixture, actual); + + metaIn.close(); + metadataIn.close(); + dataIn.close(); + } + } + + private static void testDecodeTermState(TermStateTestFixture testFixture, TermData 
actual) + throws IOException { + for (int i = 0; i < testFixture.termStatesArray().length; i++) { + IntBlockTermState expectedTermState = testFixture.termStatesArray()[i]; + IntBlockTermState decoded = actual.getTermState(testFixture.codec(), i); + assertEquals(expectedTermState.docFreq, decoded.docFreq); + assertEquals(expectedTermState.docStartFP, decoded.docStartFP); + } + } + + private static void assertByteSlice(ByteSlice expected, ByteSlice actual) throws IOException { + assertEquals(expected.size(), actual.size()); + byte[] bytesExpected = new byte[(int) expected.size()]; + ByteArrayDataOutput out = new ByteArrayDataOutput(bytesExpected); + expected.writeAll(out); + + byte[] bytesActual = new byte[(int) actual.size()]; + ByteArrayDataOutput out1 = new ByteArrayDataOutput(bytesActual); + actual.writeAll(out1); + assertArrayEquals(bytesExpected, bytesActual); + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java index 15a5e940986c..330017025cd6 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java @@ -40,7 +40,8 @@ public void testGetBitWidth() { .toArray(IntBlockTermState[]::new); byte bitWidth = - TermStateCodecComponent.getBitWidth(termStates, TermStateCodecComponent.DocFreq.INSTANCE); + TermStateCodecComponent.getBitWidth( + termStates, termStates.length, TermStateCodecComponent.DocFreq.INSTANCE); assertEquals(expectedMaxBits, bitWidth); } @@ -68,7 +69,7 @@ public void testGetBitWidthWithIncreasingValues() { byte bitWidth = TermStateCodecComponent.getBitWidth( - termStates, TermStateCodecComponent.DocStartFP.INSTANCE); + termStates, termStates.length, 
TermStateCodecComponent.DocStartFP.INSTANCE); assertEquals(expectedMaxBits, bitWidth); } } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java index 175ac30e9407..a747b24a3144 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java @@ -17,6 +17,7 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +import java.io.IOException; import java.util.ArrayList; import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; import org.apache.lucene.index.IndexOptions; @@ -30,73 +31,60 @@ public class TestTermStateCodecImpl extends LuceneTestCase { - public void testEncodeDecode() { - TermStateCodecImpl codec = - new TermStateCodecImpl( - new TermStateCodecComponent[] { - TermStateCodecComponent.DocFreq.INSTANCE, TermStateCodecComponent.DocStartFP.INSTANCE, - }); - - ArrayList termStates = new ArrayList<>(); - long maxDocFreqSeen = -1; - long docStartFPBase = random().nextLong(Long.MAX_VALUE >> 1); - long maxDocStartFPDeltaSeen = -1; - for (int i = 0; i < random().nextInt(2, 256); i++) { - var termState = new IntBlockTermState(); - termState.docFreq = random().nextInt(1, 1 << random().nextInt(1, 31)); - if (i == 0) { - termState.docStartFP = docStartFPBase; - } else { - termState.docStartFP = termStates.get(i - 1).docStartFP + random().nextLong(1024); - maxDocStartFPDeltaSeen = - Math.max(maxDocStartFPDeltaSeen, termState.docStartFP - docStartFPBase); - } - maxDocFreqSeen = Math.max(maxDocFreqSeen, termState.docFreq); - termStates.add(termState); - } - - IntBlockTermState[] termStatesArray = termStates.toArray(IntBlockTermState[]::new); + public void testEncodeDecode() throws IOException { + 
TermStateTestFixture result = getTermStateTestFixture(256); BitPerBytePacker bitPerBytePacker = new BitPerBytePacker(); - byte[] metadata = codec.encodeBlock(termStatesArray, bitPerBytePacker); + byte[] metadata = result.codec().encodeBlock(result.termStatesArray(), bitPerBytePacker); // For the metadata, we expect // 0: DocFreq.bitWidth, // 1: DocStartFP.bitWidth, // [2-10]: DocStartFP.referenceValue; - int expectedDocFreqBitWidth = 64 - Long.numberOfLeadingZeros(maxDocFreqSeen); - int expectedDocStartFPBitWidth = 64 - Long.numberOfLeadingZeros(maxDocStartFPDeltaSeen); + int expectedDocFreqBitWidth = 64 - Long.numberOfLeadingZeros(result.maxDocFreqSeen()); + int expectedDocStartFPBitWidth = + 64 - Long.numberOfLeadingZeros(result.maxDocStartFPDeltaSeen()); assertEquals(10, metadata.length); assertEquals(expectedDocFreqBitWidth, metadata[0]); assertEquals(expectedDocStartFPBitWidth, metadata[1]); ByteArrayDataInput byteArrayDataInput = new ByteArrayDataInput(metadata, 2, 8); - assertEquals(docStartFPBase, byteArrayDataInput.readLong()); + assertEquals(result.docStartFPBase(), byteArrayDataInput.readLong()); // Assert with real bit-packer we get the same bytes FixedSizeByteArrayBitPacker fixedSizeByteArrayBitPacker = new FixedSizeByteArrayBitPacker(bitPerBytePacker.getCompactBytes().length); - codec.encodeBlock(termStatesArray, fixedSizeByteArrayBitPacker); + result.codec().encodeBlock(result.termStatesArray(), fixedSizeByteArrayBitPacker); assertArrayEquals(bitPerBytePacker.getCompactBytes(), fixedSizeByteArrayBitPacker.getBytes()); // Assert that each term state is the same after the encode-decode roundtrip. 
BytesRef metadataBytes = new BytesRef(metadata); BytesRef dataBytes = new BytesRef(bitPerBytePacker.getBytes()); - assertBlockRoundTrip(termStatesArray, codec, metadataBytes, dataBytes, bitPerBytePacker); + assertBlockRoundTrip( + result.termStatesArray(), result.codec(), metadataBytes, dataBytes, bitPerBytePacker); // With real compact bits instead of bit-per-byte dataBytes = new BytesRef(bitPerBytePacker.getCompactBytes()); assertBlockRoundTrip( - termStatesArray, codec, metadataBytes, dataBytes, BitUnpackerImpl.INSTANCE); + result.termStatesArray(), + result.codec(), + metadataBytes, + dataBytes, + BitUnpackerImpl.INSTANCE); // Also test decoding that doesn't begin at the start of the block. - int pos = random().nextInt(termStatesArray.length); + int pos = random().nextInt(result.termStatesArray().length); int startBitIndex = pos > 0 ? random().nextInt(pos) : 0; int recordSize = expectedDocFreqBitWidth + expectedDocStartFPBitWidth; // With bit-per-byte bytes dataBytes = new BytesRef(bitPerBytePacker.getBytes(), pos * recordSize - startBitIndex, recordSize); assertDecodeAt( - codec, metadataBytes, dataBytes, bitPerBytePacker, startBitIndex, termStatesArray[pos]); + result.codec(), + metadataBytes, + dataBytes, + bitPerBytePacker, + startBitIndex, + result.termStatesArray()[pos]); // With compact bytes int startByteIndex = pos * recordSize / 8; @@ -104,14 +92,51 @@ public void testEncodeDecode() { int length = endByteIndex - startByteIndex + ((pos + 1) * recordSize % 8 == 0 ? 
0 : 1); dataBytes = new BytesRef(bitPerBytePacker.getCompactBytes(), startByteIndex, length); assertDecodeAt( - codec, + result.codec(), metadataBytes, dataBytes, BitUnpackerImpl.INSTANCE, (pos * recordSize) % 8, - termStatesArray[pos]); + result.termStatesArray()[pos]); } + public static TermStateTestFixture getTermStateTestFixture(int size) { + TermStateCodecImpl codec = + new TermStateCodecImpl( + new TermStateCodecComponent[] { + TermStateCodecComponent.DocFreq.INSTANCE, TermStateCodecComponent.DocStartFP.INSTANCE, + }); + + ArrayList termStates = new ArrayList<>(); + long maxDocFreqSeen = -1; + long docStartFPBase = random().nextLong(Long.MAX_VALUE >> 1); + long maxDocStartFPDeltaSeen = -1; + for (int i = 0; i < size; i++) { + var termState = new IntBlockTermState(); + termState.docFreq = random().nextInt(1, 1 << random().nextInt(1, 31)); + if (i == 0) { + termState.docStartFP = docStartFPBase; + } else { + termState.docStartFP = termStates.get(i - 1).docStartFP + random().nextLong(1024); + maxDocStartFPDeltaSeen = + Math.max(maxDocStartFPDeltaSeen, termState.docStartFP - docStartFPBase); + } + maxDocFreqSeen = Math.max(maxDocFreqSeen, termState.docFreq); + termStates.add(termState); + } + + IntBlockTermState[] termStatesArray = termStates.toArray(IntBlockTermState[]::new); + return new TermStateTestFixture( + codec, maxDocFreqSeen, docStartFPBase, maxDocStartFPDeltaSeen, termStatesArray); + } + + public record TermStateTestFixture( + TermStateCodecImpl codec, + long maxDocFreqSeen, + long docStartFPBase, + long maxDocStartFPDeltaSeen, + IntBlockTermState[] termStatesArray) {} + private static void assertDecodeAt( TermStateCodecImpl codec, BytesRef metadataBytes, diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java index 2df2a74907e2..b1bf4bfa463e 100644 --- 
a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java @@ -56,6 +56,10 @@ public byte[] getBytes() { } public byte[] getCompactBytes() { + if (totalNumBits == 0) { + return new byte[0]; + } + int len = (totalNumBits - 1) / 8 + 1; // round up byte[] bytes = new byte[len]; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java index 84ae93fe4e52..9f50777176ac 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java @@ -17,12 +17,13 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; +import java.io.IOException; import java.util.Arrays; import org.apache.lucene.tests.util.LuceneTestCase; public class TestBitPackerImpl extends LuceneTestCase { - public void testBasic() { + public void testBasic() throws IOException { FixedSizeByteArrayBitPacker fixedSizeByteArrayBitPacker = new FixedSizeByteArrayBitPacker(5); for (int i = 1; i <= 10; i++) { fixedSizeByteArrayBitPacker.add(i, 4); @@ -33,7 +34,7 @@ public void testBasic() { assertArrayEquals(expectedBytes, fixedSizeByteArrayBitPacker.getBytes()); } - public void testRandom() { + public void testRandom() throws IOException { ValueAndBitWidth[] randomInputs = ValueAndBitWidth.getRandomArray(random(), 1000); int totalNumberBits = Arrays.stream(randomInputs).mapToInt(ValueAndBitWidth::bitWidth).sum(); From 877d1cf3cda468cc0d52655feadbc3d86dade3ea Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Thu, 16 Nov 2023 14:59:01 -0800 Subject: [PATCH 25/57] remove 
unneeded initialization of int to 0. --- .../randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java index a8be9aca89bd..b40075a1ee8d 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java @@ -23,7 +23,7 @@ */ public final class FixedSizeByteArrayBitPacker extends BitPackerImplBase { private final byte[] bytes; - private int numBytesUsed = 0; + private int numBytesUsed; public FixedSizeByteArrayBitPacker(int capacity) { this.bytes = new byte[capacity]; From 2cdfb04a2dedcbe978371d4074e37deeb6240c0c Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Thu, 16 Nov 2023 15:27:10 -0800 Subject: [PATCH 26/57] Support serialize/deserialize for TermsStats --- .../lucene99/randomaccess/TermsStats.java | 43 +++++++++++- .../lucene99/randomaccess/TestTermsStats.java | 66 +++++++++++++++++++ 2 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java index d5156d7455e9..af58932483c2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java @@ -17,6 +17,10 @@ package 
org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +import java.io.IOException; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.BytesRef; /** Data class that holds starts for term stats for a field */ @@ -26,4 +30,41 @@ record TermsStats( long sumDocFreq, int docCount, BytesRef minTerm, - BytesRef maxTerm) {} + BytesRef maxTerm) { + + void serialize(DataOutput output) throws IOException { + output.writeVLong(size); + output.writeVLong(sumTotalTermFreq); + output.writeVLong(sumDocFreq); + output.writeVInt(docCount); + writeBytesRef(output, minTerm); + writeBytesRef(output, maxTerm); + } + + static TermsStats deserialize(DataInput input) throws IOException { + return new TermsStats( + input.readVLong(), + input.readVLong(), + input.readVLong(), + input.readVInt(), + readBytesRef(input), + readBytesRef(input)); + } + + static void writeBytesRef(DataOutput output, BytesRef bytes) throws IOException { + output.writeVInt(bytes.length); + output.writeBytes(bytes.bytes, bytes.offset, bytes.length); + } + + static BytesRef readBytesRef(DataInput input) throws IOException { + int numBytes = input.readVInt(); + if (numBytes < 0) { + throw new CorruptIndexException("invalid bytes length: " + numBytes, input); + } + + byte[] bytes = new byte[numBytes]; + input.readBytes(bytes, 0, numBytes); + + return new BytesRef(bytes); + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java new file mode 100644 index 000000000000..b7ca5f2efbe4 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; + + +public class TestTermsStats extends LuceneTestCase { + + public void testRoundTrip() throws IOException { + TermsStats expected = makeRandom(); + + try (Directory dir = newDirectory()) { + IndexOutput output = dir.createOutput("terms_stats", IOContext.DEFAULT); + expected.serialize(output); + output.close(); + + IndexInput input = dir.openInput("terms_stats", IOContext.DEFAULT); + TermsStats actual = TermsStats.deserialize(input); + + assertEquals(expected, actual); + input.close(); + } + } + + private TermsStats makeRandom() { + byte[] minBytes = getRandomBytes(); + byte[] maxBytes = getRandomBytes(); + return new TermsStats( + random().nextLong(1, Long.MAX_VALUE), + random().nextLong(1, Long.MAX_VALUE), + random().nextLong(1, Long.MAX_VALUE), + random().nextInt(1, Integer.MAX_VALUE), + new BytesRef(minBytes), + new BytesRef(maxBytes) + ); + } + + private static byte[] getRandomBytes() { + byte[] minBytes = 
new byte[random().nextInt(100)]; + random().nextBytes(minBytes); + return minBytes; + } +} \ No newline at end of file From 777c40d3bad78320f25c11c398712b3a07ea0789 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Thu, 16 Nov 2023 15:28:00 -0800 Subject: [PATCH 27/57] Explictlty mark the generic type arugment of FSTCompiler in TermsIndexBuilder Some platform+jdk can't inference the type. See: https://github.com/apache/lucene/actions/runs/6897462250/job/18765715011?pr=12688 --- .../sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java index 9484a0505458..f552adba433c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java @@ -35,7 +35,7 @@ final class TermsIndexBuilder { private final long[] countPerType = new long[TermType.NUM_TOTAL_TYPES]; private final FSTCompiler fstCompiler = - new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()); + new FSTCompiler(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()); TermsIndexBuilder() { Arrays.fill(countPerType, -1); From 8a0b1ccd558227680fce1837e198cf89ce6e0c11 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Fri, 17 Nov 2023 16:42:27 -0800 Subject: [PATCH 28/57] Implement writing random-access term dictionary * RandomAccessTermsDictWriter writes to index files * RandomAccessTermsDict deserializes from index files and support a lookup API --- .../randomaccess/RandomAccessTermsDict.java | 60 +++++- .../RandomAccessTermsDictWriter.java | 182 +++++++++++++++++ .../lucene99/randomaccess/TermData.java | 9 +- .../lucene99/randomaccess/TermDataReader.java | 74 +++++++ 
.../lucene99/randomaccess/TermDataWriter.java | 2 +- .../randomaccess/TermStateCodecImpl.java | 19 +- .../lucene99/randomaccess/TermsDataStore.java | 48 ----- .../lucene99/randomaccess/TermsImpl.java | 2 +- .../lucene99/randomaccess/TermsIndex.java | 6 +- .../randomaccess/TermsIndexBuilder.java | 4 +- .../lucene99/randomaccess/TermsStats.java | 3 + .../TestRandomAccessTermsDictWriter.java | 184 ++++++++++++++++++ .../randomaccess/TestTermDataWriter.java | 6 +- .../randomaccess/TestTermStateCodecImpl.java | 10 +- .../lucene99/randomaccess/TestTermsStats.java | 62 +++--- 15 files changed, 554 insertions(+), 117 deletions(-) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java delete mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsDataStore.java create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java index 26451dd9f938..39947f9ff78c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java @@ -17,6 +17,64 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +import java.io.IOException; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.BytesRef; + /** 
A term dictionary that offer random-access to read a specific term */ record RandomAccessTermsDict( - TermsStats termsStats, TermsIndex termsIndex, TermsDataStore termsDataStore) {} + TermsStats termsStats, TermsIndex termsIndex, TermDataReader termDataReader) { + + IntBlockTermState getTermState(BytesRef term) throws IOException { + TermsIndex.TypeAndOrd typeAndOrd = termsIndex.getTerm(term); + return termDataReader.getTermState(typeAndOrd.termType(), typeAndOrd.ord()); + } + + static RandomAccessTermsDict deserialize( + IndexOptionsProvider indexOptionsProvider, + DataInput metaInput, + DataInput termIndexInput, + TermDataInputProvider termDataInputProvider) + throws IOException { + + // (1) deserialize field stats + TermsStats termsStats = TermsStats.deserialize(metaInput); + IndexOptions indexOptions = indexOptionsProvider.getIndexOptions(termsStats.fieldNumber()); + + // (2) deserialize terms index + TermsIndex termsIndex = + TermsIndex.deserialize(metaInput, termIndexInput, /* load off heap */ true); + + // (3) deserialize all the term data by each TermType + // (3.1) number of unique TermType this field has + int numTermTypes = metaInput.readByte(); + + // (3.2) read per TermType + TermDataReader.Builder termDataReaderBuilder = new TermDataReader.Builder(indexOptions); + for (int i = 0; i < numTermTypes; i++) { + TermType termType = TermType.fromId(metaInput.readByte()); + TermDataInput termDataInput = termDataInputProvider.getTermDataInputForType(termType); + termDataReaderBuilder.readOne( + termType, metaInput, termDataInput.metadataInput, termDataInput.dataInput); + } + + return new RandomAccessTermsDict(termsStats, termsIndex, termDataReaderBuilder.build()); + } + + @FunctionalInterface + interface IndexOptionsProvider { + + IndexOptions getIndexOptions(int fieldNumber); + } + + record TermDataInput(IndexInput metadataInput, IndexInput dataInput) {} + + @FunctionalInterface + interface TermDataInputProvider { + + TermDataInput 
getTermDataInputForType(TermType termType) throws IOException; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java new file mode 100644 index 000000000000..a89f6e94c6de --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java @@ -0,0 +1,182 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; + +/** Class to write the index files for one field. 
*/ +final class RandomAccessTermsDictWriter { + /** externally provided * */ + private final int filedNumber; + + private final IndexOptions indexOptions; + private final DataOutput metaOutput; + + private final DataOutput indexOutput; + + private final TermDataOutputProvider termDataOutputProvider; + + /** Internal states below * */ + private final TermDataOutput[] termDataOutputPerType = + new TermDataOutput[TermType.NUM_TOTAL_TYPES]; + + private final TermsIndexBuilder termsIndexBuilder = new TermsIndexBuilder(); + + private final TermDataWriter[] termDataWriterPerType = + new TermDataWriter[TermType.NUM_TOTAL_TYPES]; + + private final TermStatsTracker termStatsTracker; + + private BytesRef previousTerm; + + RandomAccessTermsDictWriter( + int filedNumber, + IndexOptions indexOptions, + DataOutput metaOutput, + DataOutput indexOutput, + TermDataOutputProvider termDataOutputProvider) { + this.filedNumber = filedNumber; + this.indexOptions = indexOptions; + this.metaOutput = metaOutput; + this.indexOutput = indexOutput; + this.termDataOutputProvider = termDataOutputProvider; + this.termStatsTracker = new TermStatsTracker(filedNumber); + } + + void add(BytesRef term, IntBlockTermState termState) throws IOException { + TermType termType = TermType.fromTermState(termState); + if (previousTerm == null) { + // first term, which is also the minimum term + termStatsTracker.setMinTerm(term); + } + termStatsTracker.recordTerm(termState); + previousTerm = term; + termsIndexBuilder.addTerm(term, termType); + TermDataWriter termDataWriter = getTermDataWriterForType(termType); + termDataWriter.addTermState(termState); + } + + private TermDataWriter getTermDataWriterForType(TermType termType) throws IOException { + if (termDataWriterPerType[termType.getId()] != null) { + return termDataWriterPerType[termType.getId()]; + } + + TermDataOutput termDataOutput = getTermDataOutput(termType); + TermDataWriter termDataWriter = + new TermDataWriter( + 
TermStateCodecImpl.getCodec(termType, indexOptions), + termDataOutput.metadataOutput(), + termDataOutput.dataOutput()); + termDataWriterPerType[termType.getId()] = termDataWriter; + return termDataWriter; + } + + private TermDataOutput getTermDataOutput(TermType termType) throws IOException { + if (termDataOutputPerType[termType.getId()] == null) { + termDataOutputPerType[termType.getId()] = + termDataOutputProvider.getTermDataOutputForType(termType); + } + return termDataOutputPerType[termType.getId()]; + } + + void finish(int docCount) throws IOException { + // finish up TermsStats for this field + termStatsTracker.setMaxTerm(previousTerm); + termStatsTracker.setDocCount(docCount); + TermsStats termsStats = termStatsTracker.finish(); + // (1) Write field metadata + termsStats.serialize(metaOutput); + + // (2) serialize the term index + termsIndexBuilder.build().serialize(metaOutput, indexOutput); + + // (3) serialize information needed to decode per-TermType TermData + // (3.1) number of unique TermTypes this field has + int numTermTypesSeen = 0; + for (var termDataWriter : termDataWriterPerType) { + if (termDataWriter != null) { + numTermTypesSeen += 1; + } + } + metaOutput.writeByte((byte) numTermTypesSeen); + + // (3.2) (termType, metadataLength, dataLength) for each TermData + for (int i = 0; i < termDataWriterPerType.length; i++) { + var termDataWriter = termDataWriterPerType[i]; + if (termDataWriter != null) { + termDataWriter.finish(); + metaOutput.writeByte((byte) i); + metaOutput.writeVLong(termDataWriter.getTotalMetaDataBytesWritten()); + metaOutput.writeVLong(termDataWriter.getTotalDataBytesWritten()); + } + } + } + + record TermDataOutput(IndexOutput metadataOutput, IndexOutput dataOutput) {} + + @FunctionalInterface + static interface TermDataOutputProvider { + + TermDataOutput getTermDataOutputForType(TermType termType) throws IOException; + } + + static final class TermStatsTracker { + final int fieldNumber; + long size; + long sumTotalTermFreq; + 
long sumDocFreq; + int docCount; + BytesRef minTerm; + BytesRef maxTerm; + + TermStatsTracker(int fieldNumber) { + this.fieldNumber = fieldNumber; + } + + void recordTerm(IntBlockTermState termState) { + size += 1; + sumDocFreq += termState.docFreq; + sumTotalTermFreq += termState.totalTermFreq; + } + + void setDocCount(int docCount) { + this.docCount = docCount; + } + + void setMinTerm(BytesRef minTerm) { + this.minTerm = minTerm; + } + + void setMaxTerm(BytesRef maxTerm) { + this.maxTerm = maxTerm; + } + + TermsStats finish() { + assert docCount > 0 && minTerm != null && maxTerm != null; + + return new TermsStats( + fieldNumber, size, sumTotalTermFreq, sumDocFreq, docCount, minTerm, maxTerm); + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java index 4e8f79738e59..9c74ffc83835 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java @@ -29,7 +29,7 @@ * Holds the bit-packed {@link IntBlockTermState} for a given {@link * org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermType} */ -record TermData(TermType termType, ByteSlice metadata, ByteSlice data) { +record TermData(ByteSlice metadata, ByteSlice data) { IntBlockTermState getTermState(TermStateCodec codec, long ord) throws IOException { long blockId = ord / TermDataWriter.NUM_TERMS_PER_BLOCK; @@ -53,7 +53,6 @@ IntBlockTermState getTermState(TermStateCodec codec, long ord) throws IOExceptio static TermData deserializeOnHeap( DataInput metaInput, DataInput metadataInput, DataInput dataInput) throws IOException { - TermType termType = TermType.fromId(metaInput.readByte()); long metadataSize = metaInput.readVLong(); long dataSize = metaInput.readVLong(); @@ -72,13 +71,11 @@ static TermData deserializeOnHeap( 
metadataInput.readBytes(metadataBytes, 0, metadataBytes.length); dataInput.readBytes(dataBytes, 0, dataBytes.length); - return new TermData( - termType, new ByteArrayByteSlice(metadataBytes), new ByteArrayByteSlice(dataBytes)); + return new TermData(new ByteArrayByteSlice(metadataBytes), new ByteArrayByteSlice(dataBytes)); } static TermData deserializeOffHeap( DataInput metaInput, IndexInput metadataInput, IndexInput dataInput) throws IOException { - TermType termType = TermType.fromId(metaInput.readByte()); long metadataSize = metaInput.readVLong(); long dataSize = metaInput.readVLong(); @@ -89,6 +86,6 @@ static TermData deserializeOffHeap( dataInput.skipBytes(dataSize); return new TermData( - termType, new RandomAccessInputByteSlice(metadata), new RandomAccessInputByteSlice(data)); + new RandomAccessInputByteSlice(metadata), new RandomAccessInputByteSlice(data)); } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java new file mode 100644 index 000000000000..3a7ebd1e8a7c --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.IndexInput; + +/** + * Holds all {@link TermData} per {@link TermType} for a field. Also manages the proper codec needed + * per TermType. + */ +record TermDataReader(TermDataAndCodec[] termDataAndCodecs) { + + IntBlockTermState getTermState(TermType termType, long ord) throws IOException { + assert termDataAndCodecs[termType.getId()] != null; + var dataAndCodec = termDataAndCodecs[termType.getId()]; + IntBlockTermState termState = dataAndCodec.termData.getTermState(dataAndCodec.codec, ord); + + // need to filling some default values for the term state + // in order to meet the expectations of the postings reader + if (termType.hasSingletonDoc()) { + termState.docFreq = 1; + } + if (termType.hasSkipData() == false) { + termState.skipOffset = -1; + } + if (termType.hasLastPositionBlockOffset() == false) { + termState.lastPosBlockOffset = -1; + } + + return termState; + } + + static class Builder { + final IndexOptions indexOptions; + final TermDataAndCodec[] termDataAndCodecs = new TermDataAndCodec[TermType.NUM_TOTAL_TYPES]; + + Builder(IndexOptions indexOptions) { + this.indexOptions = indexOptions; + } + + void readOne( + TermType termType, DataInput metaIn, IndexInput termMetadataIn, IndexInput termDataIn) + throws IOException { + TermData termData = TermData.deserializeOffHeap(metaIn, termMetadataIn, termDataIn); + TermStateCodec codec = TermStateCodecImpl.getCodec(termType, indexOptions); + termDataAndCodecs[termType.getId()] = new TermDataAndCodec(termData, codec); + } + + TermDataReader build() { + return new TermDataReader(termDataAndCodecs); + } + } 
+ + record TermDataAndCodec(TermData termData, TermStateCodec codec) {} +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java index 09ab3cba9242..d69c45de9abc 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java @@ -22,7 +22,7 @@ import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.DataOutputBitPacker; import org.apache.lucene.store.DataOutput; -/** Writes TermData to */ +/** Writes TermData to two separate {@link DataOutput} one for metadata, another for term data */ final class TermDataWriter { static final int NUM_TERMS_PER_BLOCK = 256; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java index 3dc0a69f0c05..734e24a7a057 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java @@ -73,31 +73,24 @@ public static TermStateCodecImpl getCodec(TermType termType, IndexOptions indexO assert !(termType.hasSkipData() && termType.hasSingletonDoc()); ArrayList components = new ArrayList<>(); - // handle docs + // handle docs and docFreq if (termType.hasSingletonDoc()) { components.add(SingletonDocId.INSTANCE); } else { components.add(DocStartFP.INSTANCE); + components.add(DocFreq.INSTANCE); } // handle skip data if (termType.hasSkipData()) { components.add(SkipOffset.INSTANCE); } - // handle docFreq - boolean totalTermFeqAdded = false; + + // handle freq if (indexOptions.ordinal() >= 
IndexOptions.DOCS_AND_FREQS.ordinal()) { - if (termType.hasSingletonDoc()) { - components.add(TotalTermFreq.INSTANCE); - totalTermFeqAdded = true; - } else { - components.add(DocFreq.INSTANCE); - } + components.add(TotalTermFreq.INSTANCE); } // handle positions if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { - if (!totalTermFeqAdded) { - components.add(TotalTermFreq.INSTANCE); - } components.add(PositionStartFP.INSTANCE); if (termType.hasLastPositionBlockOffset()) { components.add(LastPositionBlockOffset.INSTANCE); @@ -145,7 +138,7 @@ public byte[] encodeBlockUpTo(IntBlockTermState[] inputs, int uptop, BitPacker b } private Metadata[] getMetadataPerComponent(IntBlockTermState[] inputs, int upTo) { - Metadata[] metadataPerComponent = new Metadata[upTo]; + Metadata[] metadataPerComponent = new Metadata[components.length]; for (int i = 0; i < components.length; i++) { var component = components[i]; byte bitWidth = TermStateCodecComponent.getBitWidth(inputs, upTo, component); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsDataStore.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsDataStore.java deleted file mode 100644 index 1717d26aa780..000000000000 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsDataStore.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; - -import java.util.Arrays; - -/** Holds all {@link TermData} for all {@link TermType} for a field. */ -class TermsDataStore { - private final TermData[] dataPerTermType; - - private TermsDataStore(TermData[] dataPerTermType) { - this.dataPerTermType = dataPerTermType; - } - - static class Builder { - private final TermData[] dataPerTermType; - - Builder() { - dataPerTermType = new TermData[TermType.NUM_TOTAL_TYPES]; - Arrays.fill(dataPerTermType, null); - } - - void add(TermData termData) { - assert dataPerTermType[termData.termType().getId()] == null; - - dataPerTermType[termData.termType().getId()] = termData; - } - - TermsDataStore build() { - return new TermsDataStore(dataPerTermType); - } - } -} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java index 29a9c4124e7b..edbf1141457f 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -25,7 +25,7 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.CompiledAutomaton; -class TermsImpl extends Terms { +final class TermsImpl extends Terms { private final FieldInfo fieldInfo; private final RandomAccessTermsDict termsDict; diff --git 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java index 917989b51b43..ce53493b8522 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java @@ -35,13 +35,13 @@ TypeAndOrd getTerm(BytesRef term) throws IOException { return new TypeAndOrd(termType, ord); } - public record TypeAndOrd(TermType termType, long ord) {} + record TypeAndOrd(TermType termType, long ord) {} - public void serialize(DataOutput metaOut, DataOutput dataOut) throws IOException { + void serialize(DataOutput metaOut, DataOutput dataOut) throws IOException { fst.save(metaOut, dataOut); } - public TermsIndex deserialize(DataInput metaIn, DataInput dataIn, boolean loadOffHeap) + static TermsIndex deserialize(DataInput metaIn, DataInput dataIn, boolean loadOffHeap) throws IOException { FST fst; if (loadOffHeap) { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java index f552adba433c..824803847b2d 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java @@ -31,11 +31,11 @@ * ordinals are scoped to type (not global). 
*/ final class TermsIndexBuilder { - private static long MAX_ORD = (1L << 60) - 1; + private static final long MAX_ORD = (1L << 60) - 1; private final long[] countPerType = new long[TermType.NUM_TOTAL_TYPES]; private final FSTCompiler fstCompiler = - new FSTCompiler(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()); + new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()); TermsIndexBuilder() { Arrays.fill(countPerType, -1); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java index af58932483c2..0c65f2e04d39 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java @@ -25,6 +25,7 @@ /** Data class that holds starts for term stats for a field */ record TermsStats( + int fieldNumber, long size, long sumTotalTermFreq, long sumDocFreq, @@ -33,6 +34,7 @@ record TermsStats( BytesRef maxTerm) { void serialize(DataOutput output) throws IOException { + output.writeVInt(fieldNumber); output.writeVLong(size); output.writeVLong(sumTotalTermFreq); output.writeVLong(sumDocFreq); @@ -43,6 +45,7 @@ void serialize(DataOutput output) throws IOException { static TermsStats deserialize(DataInput input) throws IOException { return new TermsStats( + input.readVInt(), input.readVLong(), input.readVLong(), input.readVLong(), diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java new file mode 100644 index 000000000000..179175f7398f --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java @@ -0,0 
+1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.RandomAccessTermsDict.TermDataInput; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.RandomAccessTermsDict.TermDataInputProvider; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.RandomAccessTermsDictWriter.TermDataOutput; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.RandomAccessTermsDictWriter.TermDataOutputProvider; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BitUtil; +import org.apache.lucene.util.BytesRef; + +public class TestRandomAccessTermsDictWriter extends LuceneTestCase { + + public void testBuildIndexAndRead() throws 
IOException { + try (Directory testDir = newDirectory()) { + IndexOutput metaOut = testDir.createOutput("segment_meta", IOContext.DEFAULT); + IndexOutput termIndexOut = testDir.createOutput("term_index", IOContext.DEFAULT); + HashMap termDataOutputsMap = new HashMap<>(); + TermDataOutputProvider outputProvider = + termType -> + termDataOutputsMap.computeIfAbsent( + termType, + t -> { + try { + return new TermDataOutput( + testDir.createOutput("term_meta_" + t.getId(), IOContext.DEFAULT), + testDir.createOutput("term_data_" + t.getId(), IOContext.DEFAULT)); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + + int fieldNumber = random().nextInt(0, 10); + IndexOptions indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; + RandomAccessTermsDictWriter randomAccessTermsDictWriter = + new RandomAccessTermsDictWriter( + fieldNumber, indexOptions, metaOut, termIndexOut, outputProvider); + + TermAndState[] expectedTermAndState = getRandoms(1000, 2000); + int expectedDocCount = random().nextInt(1, 2000); + + for (var x : expectedTermAndState) { + randomAccessTermsDictWriter.add(x.term, x.state); + } + randomAccessTermsDictWriter.finish(expectedDocCount); + + metaOut.close(); + termIndexOut.close(); + for (var e : termDataOutputsMap.values()) { + e.dataOutput().close(); + e.metadataOutput().close(); + } + + IndexInput metaInput = testDir.openInput("segment_meta", IOContext.READ); + IndexInput termIndexInput = testDir.openInput("term_index", IOContext.LOAD); + HashMap termDataInputsMap = new HashMap<>(); + TermDataInputProvider termDataInputProvider = + termType -> + termDataInputsMap.computeIfAbsent( + termType, + t -> { + try { + return new TermDataInput( + testDir.openInput("term_meta_" + t.getId(), IOContext.LOAD), + testDir.openInput("term_data_" + t.getId(), IOContext.LOAD)); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + RandomAccessTermsDict deserialized = + RandomAccessTermsDict.deserialize( + 
_fieldNumber -> indexOptions, metaInput, termIndexInput, termDataInputProvider); + + assertEquals(fieldNumber, deserialized.termsStats().fieldNumber()); + assertEquals(expectedDocCount, deserialized.termsStats().docCount()); + assertEquals(expectedTermAndState.length, deserialized.termsStats().size()); + assertEquals( + Arrays.stream(expectedTermAndState).mapToLong(x -> x.state.docFreq).sum(), + deserialized.termsStats().sumDocFreq()); + assertEquals( + Arrays.stream(expectedTermAndState).mapToLong(x -> x.state.totalTermFreq).sum(), + deserialized.termsStats().sumTotalTermFreq()); + assertEquals(expectedTermAndState.length, deserialized.termsStats().size()); + assertEquals(expectedTermAndState[0].term, deserialized.termsStats().minTerm()); + assertEquals(expectedTermAndState[expectedTermAndState.length - 1].term, deserialized.termsStats().maxTerm()); + + for (var x : expectedTermAndState) { + IntBlockTermState expectedState = x.state; + IntBlockTermState actualState = deserialized.getTermState(x.term); + if (expectedState.singletonDocID != -1) { + assertEquals(expectedState.singletonDocID, actualState.singletonDocID); + } else { + assertEquals(expectedState.docStartFP, actualState.docStartFP); + } + assertEquals(expectedState.docFreq, actualState.docFreq); + assertEquals(expectedState.totalTermFreq, actualState.totalTermFreq); + assertEquals(expectedState.skipOffset, actualState.skipOffset); + assertEquals(expectedState.posStartFP, actualState.posStartFP); + assertEquals(expectedState.payStartFP, actualState.payStartFP); + assertEquals(expectedState.lastPosBlockOffset, actualState.lastPosBlockOffset); + } + + metaInput.close(); + termIndexInput.close(); + for (var e : termDataInputsMap.values()) { + e.metadataInput().close(); + e.dataInput().close(); + } + } + } + + TermAndState[] getRandoms(int size, int maxDoc) { + IntBlockTermState lastTermState = null; + + ArrayList result = new ArrayList<>(size); + for (int i = 0; i < size; i++) { + byte[] termBytes = new 
byte[4]; + BitUtil.VH_BE_INT.set(termBytes, 0, i); + + IntBlockTermState termState = new IntBlockTermState(); + termState.docFreq = random().nextInt(1, 100); + if (termState.docFreq == 1) { + termState.singletonDocID = random().nextInt(0, maxDoc); + } else { + termState.singletonDocID = -1; + } + if (lastTermState == null) { + termState.docStartFP = 0; + termState.posStartFP = 0; + termState.payStartFP = 0; + } else { + termState.docStartFP = lastTermState.docStartFP; + termState.posStartFP = lastTermState.posStartFP; + termState.payStartFP = lastTermState.payStartFP; + termState.docStartFP += termState.docFreq == 1 ? 0 : random().nextLong(1, 256); + termState.posStartFP += random().nextLong(1, 256); + termState.payStartFP += random().nextLong(1, 256); + } + termState.totalTermFreq = random().nextLong(termState.docFreq, 1000); + if (termState.docFreq > 1 && random().nextBoolean()) { + termState.skipOffset = random().nextLong(1, 256); + } else { + termState.skipOffset = -1; + } + if (random().nextBoolean()) { + termState.lastPosBlockOffset = random().nextLong(1, 256); + } else { + termState.lastPosBlockOffset = -1; + } + lastTermState = termState; + result.add(new TermAndState(new BytesRef(termBytes), termState)); + } + + return result.toArray(TermAndState[]::new); + } + + record TermAndState(BytesRef term, IntBlockTermState state) {} +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java index aab496a41937..2ddc5b4ee67f 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java @@ -33,7 +33,6 @@ public class TestTermDataWriter extends LuceneTestCase { public void testWriterAndDeserialize() throws IOException { TermStateTestFixture 
testFixture = TestTermStateCodecImpl.getTermStateTestFixture(777); - TermType expectedTermType = TermType.fromId(7); try (Directory testDir = newDirectory()) { IndexOutput metaOut = testDir.createOutput("segment_meta", IOContext.DEFAULT); @@ -44,7 +43,6 @@ public void testWriterAndDeserialize() throws IOException { writer.addTermState(termState); } writer.finish(); - metaOut.writeByte((byte) expectedTermType.getId()); metaOut.writeVLong(writer.getTotalMetaDataBytesWritten()); metaOut.writeVLong(writer.getTotalDataBytesWritten()); metaOut.close(); @@ -75,7 +73,7 @@ public void testWriterAndDeserialize() throws IOException { } ByteSlice expectedDataSlice = new ByteArrayByteSlice(referenceBitPacker.getCompactBytes()); ByteSlice expectedMetadataSlice = new ByteArrayByteSlice(expectedMetadata); - TermData expected = new TermData(expectedTermType, expectedMetadataSlice, expectedDataSlice); + TermData expected = new TermData(expectedMetadataSlice, expectedDataSlice); IndexInput metaIn = testDir.openInput("segment_meta", IOContext.DEFAULT); IndexInput metadataIn = testDir.openInput("term_meta_1", IOContext.DEFAULT); @@ -83,13 +81,11 @@ public void testWriterAndDeserialize() throws IOException { TermData actual = TermData.deserializeOnHeap(metaIn.clone(), metadataIn.clone(), dataIn.clone()); - assertEquals(expected.termType().getId(), actual.termType().getId()); assertByteSlice(expected.metadata(), actual.metadata()); assertByteSlice(expected.data(), actual.data()); testDecodeTermState(testFixture, actual); actual = TermData.deserializeOffHeap(metaIn.clone(), metadataIn.clone(), dataIn.clone()); - assertEquals(expected.termType().getId(), actual.termType().getId()); assertByteSlice(expected.metadata(), actual.metadata()); assertByteSlice(expected.data(), actual.data()); testDecodeTermState(testFixture, actual); diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java 
b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java index a747b24a3144..db7630f1f35a 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java @@ -195,11 +195,11 @@ static TermStateCodecImpl getExpectedCodec(TermType termType, IndexOptions index && !termType.hasSkipData() && !termType.hasSingletonDoc(); components.add(TermStateCodecComponent.DocStartFP.INSTANCE); + components.add(TermStateCodecComponent.DocFreq.INSTANCE); if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS.ordinal()) { - components.add(TermStateCodecComponent.DocFreq.INSTANCE); + components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); } if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { - components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); } if (indexOptions.ordinal() @@ -233,12 +233,12 @@ static TermStateCodecImpl getExpectedCodec(TermType termType, IndexOptions index && termType.hasSkipData() && !termType.hasSingletonDoc(); components.add(TermStateCodecComponent.DocStartFP.INSTANCE); + components.add(TermStateCodecComponent.DocFreq.INSTANCE); components.add(TermStateCodecComponent.SkipOffset.INSTANCE); if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS.ordinal()) { - components.add(TermStateCodecComponent.DocFreq.INSTANCE); + components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); } if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { - components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); } if (indexOptions.ordinal() @@ -290,8 +290,8 @@ static TermStateCodecImpl getExpectedCodec(TermType termType, IndexOptions index && 
!termType.hasSingletonDoc(); assert indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal(); components.add(TermStateCodecComponent.DocStartFP.INSTANCE); - components.add(TermStateCodecComponent.SkipOffset.INSTANCE); components.add(TermStateCodecComponent.DocFreq.INSTANCE); + components.add(TermStateCodecComponent.SkipOffset.INSTANCE); components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); components.add(TermStateCodecComponent.LastPositionBlockOffset.INSTANCE); diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java index b7ca5f2efbe4..8937c5f9e319 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java @@ -17,6 +17,7 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +import java.io.IOException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; @@ -24,43 +25,40 @@ import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; -import java.io.IOException; - - public class TestTermsStats extends LuceneTestCase { - public void testRoundTrip() throws IOException { - TermsStats expected = makeRandom(); + public void testRoundTrip() throws IOException { + TermsStats expected = makeRandom(); - try (Directory dir = newDirectory()) { - IndexOutput output = dir.createOutput("terms_stats", IOContext.DEFAULT); - expected.serialize(output); - output.close(); + try (Directory dir = newDirectory()) { + IndexOutput output = dir.createOutput("terms_stats", IOContext.DEFAULT); + expected.serialize(output); + output.close(); - IndexInput input = 
dir.openInput("terms_stats", IOContext.DEFAULT); - TermsStats actual = TermsStats.deserialize(input); + IndexInput input = dir.openInput("terms_stats", IOContext.DEFAULT); + TermsStats actual = TermsStats.deserialize(input); - assertEquals(expected, actual); - input.close(); - } + assertEquals(expected, actual); + input.close(); } + } - private TermsStats makeRandom() { - byte[] minBytes = getRandomBytes(); - byte[] maxBytes = getRandomBytes(); - return new TermsStats( - random().nextLong(1, Long.MAX_VALUE), - random().nextLong(1, Long.MAX_VALUE), - random().nextLong(1, Long.MAX_VALUE), - random().nextInt(1, Integer.MAX_VALUE), - new BytesRef(minBytes), - new BytesRef(maxBytes) - ); - } + private TermsStats makeRandom() { + byte[] minBytes = getRandomBytes(); + byte[] maxBytes = getRandomBytes(); + return new TermsStats( + random().nextInt(1, Integer.MAX_VALUE), + random().nextLong(1, Long.MAX_VALUE), + random().nextLong(1, Long.MAX_VALUE), + random().nextLong(1, Long.MAX_VALUE), + random().nextInt(1, Integer.MAX_VALUE), + new BytesRef(minBytes), + new BytesRef(maxBytes)); + } - private static byte[] getRandomBytes() { - byte[] minBytes = new byte[random().nextInt(100)]; - random().nextBytes(minBytes); - return minBytes; - } -} \ No newline at end of file + private static byte[] getRandomBytes() { + byte[] minBytes = new byte[random().nextInt(100)]; + random().nextBytes(minBytes); + return minBytes; + } +} From 83968309e767a039b33b0b6742bbaa8fda089cd0 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Fri, 17 Nov 2023 16:55:34 -0800 Subject: [PATCH 29/57] Fix build after merging from apache:main --- .../randomaccess/RandomAccessTermsDictWriter.java | 1 - .../lucene99/randomaccess/TermsIndexBuilder.java | 2 +- .../TestRandomAccessTermsDictWriter.java | 12 +++++++----- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java index a89f6e94c6de..fab30774c665 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java @@ -18,7 +18,6 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; -import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat; import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.store.DataOutput; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java index 824803847b2d..d142420d4470 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java @@ -35,7 +35,7 @@ final class TermsIndexBuilder { private final long[] countPerType = new long[TermType.NUM_TOTAL_TYPES]; private final FSTCompiler fstCompiler = - new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()); + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()).build(); TermsIndexBuilder() { Arrays.fill(countPerType, -1); diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java index 179175f7398f..b9d2a444894d 100644 --- 
a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java @@ -101,14 +101,16 @@ public void testBuildIndexAndRead() throws IOException { assertEquals(expectedDocCount, deserialized.termsStats().docCount()); assertEquals(expectedTermAndState.length, deserialized.termsStats().size()); assertEquals( - Arrays.stream(expectedTermAndState).mapToLong(x -> x.state.docFreq).sum(), - deserialized.termsStats().sumDocFreq()); + Arrays.stream(expectedTermAndState).mapToLong(x -> x.state.docFreq).sum(), + deserialized.termsStats().sumDocFreq()); assertEquals( - Arrays.stream(expectedTermAndState).mapToLong(x -> x.state.totalTermFreq).sum(), - deserialized.termsStats().sumTotalTermFreq()); + Arrays.stream(expectedTermAndState).mapToLong(x -> x.state.totalTermFreq).sum(), + deserialized.termsStats().sumTotalTermFreq()); assertEquals(expectedTermAndState.length, deserialized.termsStats().size()); assertEquals(expectedTermAndState[0].term, deserialized.termsStats().minTerm()); - assertEquals(expectedTermAndState[expectedTermAndState.length - 1].term, deserialized.termsStats().maxTerm()); + assertEquals( + expectedTermAndState[expectedTermAndState.length - 1].term, + deserialized.termsStats().maxTerm()); for (var x : expectedTermAndState) { IntBlockTermState expectedState = x.state; From 96d6e3320b290aa654ea5dec53c22c51c25ca894 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Sat, 18 Nov 2023 16:03:26 -0800 Subject: [PATCH 30/57] Test serialize/deserialize multiple fields' term dictionary Also fix a bug in loading term index FST offheap. 
--- .../lucene99/randomaccess/TermsIndex.java | 4 +- .../TestRandomAccessTermsDictWriter.java | 135 ++++++++++++------ 2 files changed, 91 insertions(+), 48 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java index ce53493b8522..d0a4c0c4c56b 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java @@ -45,7 +45,9 @@ static TermsIndex deserialize(DataInput metaIn, DataInput dataIn, boolean loadOf throws IOException { FST fst; if (loadOffHeap) { - fst = new FST<>(metaIn, dataIn, PositiveIntOutputs.getSingleton(), new OffHeapFSTStore()); + var fstStore = new OffHeapFSTStore(); + fst = new FST<>(metaIn, dataIn, PositiveIntOutputs.getSingleton(), fstStore); + dataIn.skipBytes(fstStore.size()); } else { fst = new FST<>(metaIn, dataIn, PositiveIntOutputs.getSingleton()); } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java index b9d2a444894d..d4b1f94aab04 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java @@ -36,8 +36,9 @@ import org.apache.lucene.util.BytesRef; public class TestRandomAccessTermsDictWriter extends LuceneTestCase { + int nextFieldNumber; - public void testBuildIndexAndRead() throws IOException { + public void testBuildIndexAndReadMultipleFields() throws IOException { try (Directory testDir = newDirectory()) { IndexOutput metaOut = 
testDir.createOutput("segment_meta", IOContext.DEFAULT); IndexOutput termIndexOut = testDir.createOutput("term_index", IOContext.DEFAULT); @@ -56,19 +57,10 @@ public void testBuildIndexAndRead() throws IOException { } }); - int fieldNumber = random().nextInt(0, 10); - IndexOptions indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; - RandomAccessTermsDictWriter randomAccessTermsDictWriter = - new RandomAccessTermsDictWriter( - fieldNumber, indexOptions, metaOut, termIndexOut, outputProvider); - - TermAndState[] expectedTermAndState = getRandoms(1000, 2000); - int expectedDocCount = random().nextInt(1, 2000); - - for (var x : expectedTermAndState) { - randomAccessTermsDictWriter.add(x.term, x.state); + ExpectedResults[] manyExpectedResults = new ExpectedResults[random().nextInt(1, 20)]; + for (int i = 0; i < manyExpectedResults.length; i++) { + manyExpectedResults[i] = indexOneField(metaOut, termIndexOut, outputProvider); } - randomAccessTermsDictWriter.finish(expectedDocCount); metaOut.close(); termIndexOut.close(); @@ -93,39 +85,10 @@ public void testBuildIndexAndRead() throws IOException { throw new RuntimeException(e); } }); - RandomAccessTermsDict deserialized = - RandomAccessTermsDict.deserialize( - _fieldNumber -> indexOptions, metaInput, termIndexInput, termDataInputProvider); - - assertEquals(fieldNumber, deserialized.termsStats().fieldNumber()); - assertEquals(expectedDocCount, deserialized.termsStats().docCount()); - assertEquals(expectedTermAndState.length, deserialized.termsStats().size()); - assertEquals( - Arrays.stream(expectedTermAndState).mapToLong(x -> x.state.docFreq).sum(), - deserialized.termsStats().sumDocFreq()); - assertEquals( - Arrays.stream(expectedTermAndState).mapToLong(x -> x.state.totalTermFreq).sum(), - deserialized.termsStats().sumTotalTermFreq()); - assertEquals(expectedTermAndState.length, deserialized.termsStats().size()); - assertEquals(expectedTermAndState[0].term, deserialized.termsStats().minTerm()); - 
assertEquals( - expectedTermAndState[expectedTermAndState.length - 1].term, - deserialized.termsStats().maxTerm()); - - for (var x : expectedTermAndState) { - IntBlockTermState expectedState = x.state; - IntBlockTermState actualState = deserialized.getTermState(x.term); - if (expectedState.singletonDocID != -1) { - assertEquals(expectedState.singletonDocID, actualState.singletonDocID); - } else { - assertEquals(expectedState.docStartFP, actualState.docStartFP); - } - assertEquals(expectedState.docFreq, actualState.docFreq); - assertEquals(expectedState.totalTermFreq, actualState.totalTermFreq); - assertEquals(expectedState.skipOffset, actualState.skipOffset); - assertEquals(expectedState.posStartFP, actualState.posStartFP); - assertEquals(expectedState.payStartFP, actualState.payStartFP); - assertEquals(expectedState.lastPosBlockOffset, actualState.lastPosBlockOffset); + + for (var expectedResult : manyExpectedResults) { + assertDeserializedMatchingExpected( + expectedResult, metaInput, termIndexInput, termDataInputProvider); } metaInput.close(); @@ -137,7 +100,85 @@ public void testBuildIndexAndRead() throws IOException { } } - TermAndState[] getRandoms(int size, int maxDoc) { + private static void assertDeserializedMatchingExpected( + ExpectedResults result, + IndexInput metaInput, + IndexInput termIndexInput, + TermDataInputProvider termDataInputProvider) + throws IOException { + RandomAccessTermsDict deserialized = + RandomAccessTermsDict.deserialize( + _fieldNumber -> result.indexOptions(), + metaInput, + termIndexInput, + termDataInputProvider); + + assertEquals(result.fieldNumber(), deserialized.termsStats().fieldNumber()); + assertEquals(result.expectedDocCount(), deserialized.termsStats().docCount()); + assertEquals(result.expectedTermAndState().length, deserialized.termsStats().size()); + assertEquals( + Arrays.stream(result.expectedTermAndState()).mapToLong(x -> x.state.docFreq).sum(), + deserialized.termsStats().sumDocFreq()); + assertEquals( + 
Arrays.stream(result.expectedTermAndState()).mapToLong(x -> x.state.totalTermFreq).sum(), + deserialized.termsStats().sumTotalTermFreq()); + assertEquals(result.expectedTermAndState().length, deserialized.termsStats().size()); + assertEquals(result.expectedTermAndState()[0].term, deserialized.termsStats().minTerm()); + assertEquals( + result.expectedTermAndState()[result.expectedTermAndState().length - 1].term, + deserialized.termsStats().maxTerm()); + + for (var x : result.expectedTermAndState()) { + IntBlockTermState expectedState = x.state; + IntBlockTermState actualState = deserialized.getTermState(x.term); + if (expectedState.singletonDocID != -1) { + assertEquals(expectedState.singletonDocID, actualState.singletonDocID); + } else { + assertEquals(expectedState.docStartFP, actualState.docStartFP); + } + assertEquals(expectedState.docFreq, actualState.docFreq); + if (result.indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS.ordinal()) { + assertEquals(expectedState.totalTermFreq, actualState.totalTermFreq); + } + assertEquals(expectedState.skipOffset, actualState.skipOffset); + if (result.indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + assertEquals(expectedState.posStartFP, actualState.posStartFP); + assertEquals(expectedState.lastPosBlockOffset, actualState.lastPosBlockOffset); + } + if (result.indexOptions.ordinal() + >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { + assertEquals(expectedState.payStartFP, actualState.payStartFP); + } + } + } + + private ExpectedResults indexOneField( + IndexOutput metaOut, IndexOutput termIndexOut, TermDataOutputProvider outputProvider) + throws IOException { + int fieldNumber = nextFieldNumber++; + IndexOptions indexOptions = + IndexOptions.values()[random().nextInt(1, IndexOptions.values().length)]; + RandomAccessTermsDictWriter randomAccessTermsDictWriter = + new RandomAccessTermsDictWriter( + fieldNumber, indexOptions, metaOut, termIndexOut, outputProvider); + + 
TermAndState[] expectedTermAndState = getRandoms(1000, 2000); + int expectedDocCount = random().nextInt(1, 2000); + + for (var x : expectedTermAndState) { + randomAccessTermsDictWriter.add(x.term, x.state); + } + randomAccessTermsDictWriter.finish(expectedDocCount); + return new ExpectedResults(fieldNumber, indexOptions, expectedTermAndState, expectedDocCount); + } + + private record ExpectedResults( + int fieldNumber, + IndexOptions indexOptions, + TermAndState[] expectedTermAndState, + int expectedDocCount) {} + + static TermAndState[] getRandoms(int size, int maxDoc) { IntBlockTermState lastTermState = null; ArrayList result = new ArrayList<>(size); From 622e56fe72c6a1eef6b11d6fb89ee62c33fbe98d Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 20 Nov 2023 10:37:25 -0800 Subject: [PATCH 31/57] Remove unused member in RandomAccessTermsDictWriter --- .../lucene99/randomaccess/RandomAccessTermsDictWriter.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java index fab30774c665..030d144e60d2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java @@ -27,8 +27,6 @@ /** Class to write the index files for one field. 
*/ final class RandomAccessTermsDictWriter { /** externally provided * */ - private final int filedNumber; - private final IndexOptions indexOptions; private final DataOutput metaOutput; @@ -55,7 +53,6 @@ final class RandomAccessTermsDictWriter { DataOutput metaOutput, DataOutput indexOutput, TermDataOutputProvider termDataOutputProvider) { - this.filedNumber = filedNumber; this.indexOptions = indexOptions; this.metaOutput = metaOutput; this.indexOutput = indexOutput; From cf1104d675e3749614d0396b8b165c32ba9a064b Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 20 Nov 2023 14:38:25 -0800 Subject: [PATCH 32/57] Implement Lucene99RandomAccessTermsReader/Writer --- ...9RandomAccessDictionaryPostingsFormat.java | 21 +- .../Lucene99RandomAccessTermsReader.java | 170 ++++++++++++++- .../Lucene99RandomAccessTermsWriter.java | 202 +++++++++++++++++- .../RandomAccessTermsDictWriter.java | 1 + .../lucene99/randomaccess/TermsImpl.java | 2 +- .../randomaccess/TestTermDataWriter.java | 4 +- 6 files changed, 381 insertions(+), 19 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java index 59de10be73da..4b616486cad0 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java @@ -20,8 +20,6 @@ import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.PostingsReaderBase; -import org.apache.lucene.codecs.PostingsWriterBase; import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat; import 
org.apache.lucene.codecs.lucene99.Lucene99PostingsReader; import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter; @@ -35,6 +33,15 @@ * @lucene.experimental */ public final class Lucene99RandomAccessDictionaryPostingsFormat extends PostingsFormat { + static String TERM_DICT_META_HEADER_CODEC_NAME = "RandomAccessTermsDict"; + static String TERM_INDEX_HEADER_CODEC_NAME = "RandomAccessTermsDictIndex"; + static String TERM_DATA_META_HEADER_CODEC_NAME_PREFIX = "RandomAccessTermsDictTermDataMeta"; + static String TERM_DATA_HEADER_CODEC_NAME_PREFIX = "RandomAccessTermsDictTermData"; + + static String TERM_DICT_META_INFO_EXTENSION = "tmeta"; + static String TERM_INDEX_EXTENSION = "tidx"; + static String TERM_DATA_META_EXTENSION_PREFIX = "tdm"; + static String TERM_DATA_EXTENSION_PREFIX = "tdd"; // Increment version to change it static final int VERSION_START = 0; @@ -42,7 +49,7 @@ public final class Lucene99RandomAccessDictionaryPostingsFormat extends Postings /** Creates {@code Lucene90RandomAccessDictionaryPostingsFormat} */ public Lucene99RandomAccessDictionaryPostingsFormat() { - super("Lucene90RandomAccess"); + super("Lucene99RandomAccess"); } @Override @@ -52,10 +59,10 @@ public String toString() { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state); + Lucene99PostingsWriter postingsWriter = new Lucene99PostingsWriter(state); boolean success = false; try { - FieldsConsumer ret = new Lucene99RandomAccessTermsWriter(); + FieldsConsumer ret = new Lucene99RandomAccessTermsWriter(state, postingsWriter); success = true; return ret; } finally { @@ -67,10 +74,10 @@ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new Lucene99PostingsReader(state); + Lucene99PostingsReader postingsReader = new 
Lucene99PostingsReader(state); boolean success = false; try { - FieldsProducer ret = new Lucene99RandomAccessTermsReader(); + FieldsProducer ret = new Lucene99RandomAccessTermsReader(postingsReader, state); success = true; return ret; } finally { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java index 79a63dccf265..fac2f6e7675e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java @@ -17,30 +17,188 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +import static org.apache.lucene.sandbox.codecs.lucene99.randomaccess.Lucene99RandomAccessDictionaryPostingsFormat.*; +import static org.apache.lucene.sandbox.codecs.lucene99.randomaccess.Lucene99RandomAccessDictionaryPostingsFormat.TERM_DATA_HEADER_CODEC_NAME_PREFIX; + +import java.io.Closeable; import java.io.IOException; +import java.util.HashMap; import java.util.Iterator; +import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.Terms; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.IOUtils; + +final class Lucene99RandomAccessTermsReader extends FieldsProducer { + private final Lucene99PostingsReader postingsReader; + private final SegmentReadState segmentReadState; + + private final IndexFilesManager indexFilesManager; + + private final HashMap perFieldTermDict; + + 
Lucene99RandomAccessTermsReader( + Lucene99PostingsReader postingsReader, SegmentReadState segmentReadState) throws IOException { + this.postingsReader = postingsReader; + this.segmentReadState = segmentReadState; + this.indexFilesManager = new IndexFilesManager(); + this.perFieldTermDict = new HashMap<>(); + boolean success = false; + try { + int numFields = indexFilesManager.metaInfoIn.readVInt(); + assert numFields > 0; + for (int i = 0; i < numFields; i++) { + RandomAccessTermsDict termsDict = + RandomAccessTermsDict.deserialize( + fieldNumber -> segmentReadState.fieldInfos.fieldInfo(fieldNumber).getIndexOptions(), + indexFilesManager.metaInfoIn, + indexFilesManager.termIndexIn, + indexFilesManager); + FieldInfo fieldInfo = + segmentReadState.fieldInfos.fieldInfo(termsDict.termsStats().fieldNumber()); + String fieldName = fieldInfo.name; + perFieldTermDict.put(fieldName, new TermsImpl(fieldInfo, termsDict)); + } + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this); + } + } + } -class Lucene99RandomAccessTermsReader extends FieldsProducer { @Override - public void close() throws IOException {} + public void close() throws IOException { + try { + IOUtils.close(indexFilesManager); + } finally { + // The per-field term dictionary would be invalid once the underlying index files have been + // closed. 
+ perFieldTermDict.clear(); + } + } @Override - public void checkIntegrity() throws IOException {} + public void checkIntegrity() throws IOException { + // Integrity is already checked in indexFilesManager + } @Override public Iterator iterator() { - return null; + return perFieldTermDict.keySet().iterator(); } @Override public Terms terms(String field) throws IOException { - return null; + return perFieldTermDict.get(field); } @Override public int size() { - return 0; + return perFieldTermDict.size(); + } + + class IndexFilesManager implements RandomAccessTermsDict.TermDataInputProvider, Closeable { + private final IndexInput metaInfoIn; + + private final IndexInput termIndexIn; + + private final HashMap termDataInputPerType; + + public IndexFilesManager() throws IOException { + metaInfoIn = initMetaInfoInput(); + termIndexIn = initTermIndexInput(); + termDataInputPerType = new HashMap<>(); + } + + private IndexInput initMetaInfoInput() throws IOException { + final IndexInput tmp; + tmp = openAndChecksumIndexInputSafe(TERM_DICT_META_INFO_EXTENSION, false); + + checkHeader(tmp, TERM_DICT_META_HEADER_CODEC_NAME); + postingsReader.init(tmp, segmentReadState); + postingsReader.checkIntegrity(); + return tmp; + } + + private IndexInput initTermIndexInput() throws IOException { + final IndexInput tmp = openAndChecksumIndexInputSafe(TERM_INDEX_EXTENSION, true); + checkHeader(tmp, TERM_INDEX_HEADER_CODEC_NAME); + return tmp; + } + + private RandomAccessTermsDict.TermDataInput openTermDataInput(TermType termType) + throws IOException { + final IndexInput metaTmp; + final IndexInput dataTmp; + metaTmp = + openAndChecksumIndexInputSafe(TERM_DATA_META_EXTENSION_PREFIX + termType.getId(), true); + checkHeader(metaTmp, TERM_DATA_META_HEADER_CODEC_NAME_PREFIX + termType.getId()); + + dataTmp = openAndChecksumIndexInputSafe(TERM_DATA_EXTENSION_PREFIX + termType.getId(), true); + checkHeader(dataTmp, TERM_DATA_HEADER_CODEC_NAME_PREFIX + termType.getId()); + + return new 
RandomAccessTermsDict.TermDataInput(metaTmp, dataTmp); + } + + /** + * Open an IndexInput for a segment local name. The IndexInput will be closed if there was any + * error happened during open and verification. + */ + private IndexInput openAndChecksumIndexInputSafe( + String segmentLocalName, boolean needRandomAcees) throws IOException { + String name = + IndexFileNames.segmentFileName( + segmentReadState.segmentInfo.name, segmentReadState.segmentSuffix, segmentLocalName); + + boolean success = false; + IndexInput input = null; + try { + input = + segmentReadState.directory.openInput( + name, needRandomAcees ? IOContext.LOAD : IOContext.READ); + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(input); + } + } + CodecUtil.checksumEntireFile(input); + return input; + } + + private void checkHeader(IndexInput input, String headerName) throws IOException { + CodecUtil.checkIndexHeader( + input, + headerName, + Lucene99RandomAccessDictionaryPostingsFormat.VERSION_START, + Lucene99RandomAccessDictionaryPostingsFormat.VERSION_CURRENT, + segmentReadState.segmentInfo.getId(), + segmentReadState.segmentSuffix); + } + + @Override + public RandomAccessTermsDict.TermDataInput getTermDataInputForType(TermType termType) + throws IOException { + RandomAccessTermsDict.TermDataInput current = termDataInputPerType.get(termType); + if (current == null) { + current = openTermDataInput(termType); + termDataInputPerType.put(termType, current); + } + return current; + } + + @Override + public void close() throws IOException { + IOUtils.close(metaInfoIn, termIndexIn); + for (var x : termDataInputPerType.values()) { + IOUtils.close(x.metadataInput(), x.dataInput()); + } + } } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java index 87b68d2b9c63..b38c724839fa 
100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java @@ -16,15 +16,211 @@ */ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; +import static org.apache.lucene.sandbox.codecs.lucene99.randomaccess.Lucene99RandomAccessDictionaryPostingsFormat.*; + import java.io.IOException; +import java.util.HashMap; +import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.NormsProducer; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter; +import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.Fields; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.RandomAccessTermsDictWriter.TermDataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IOUtils; + +final class Lucene99RandomAccessTermsWriter extends FieldsConsumer { + + private final SegmentWriteState segmentWriteState; + + private final Lucene99PostingsWriter postingsWriter; + + private final IndexFilesManager indexFilesManager; + + private boolean closed; + + public Lucene99RandomAccessTermsWriter( + SegmentWriteState segmentWriteState, Lucene99PostingsWriter postingsWriter) + throws IOException { + this.segmentWriteState = segmentWriteState; + this.postingsWriter = postingsWriter; + this.indexFilesManager = new IndexFilesManager(); + } -class Lucene99RandomAccessTermsWriter extends FieldsConsumer { @Override - public void write(Fields 
fields, NormsProducer norms) throws IOException {} + public void write(Fields fields, NormsProducer norms) throws IOException { + HashMap nonEmptyFields = new HashMap<>(); + for (String field : fields) { + Terms terms = fields.terms(field); + if (terms != null) { + nonEmptyFields.put(field, terms); + } + } + indexFilesManager.metaInfoOut.writeVInt(nonEmptyFields.size()); + + FixedBitSet docSeen = new FixedBitSet(segmentWriteState.segmentInfo.maxDoc()); + for (var entry : nonEmptyFields.entrySet()) { + TermsEnum termsEnum = entry.getValue().iterator(); + FieldInfo fieldInfo = segmentWriteState.fieldInfos.fieldInfo(entry.getKey()); + RandomAccessTermsDictWriter termsDictWriter = + new RandomAccessTermsDictWriter( + fieldInfo.number, + fieldInfo.getIndexOptions(), + indexFilesManager.metaInfoOut, + indexFilesManager.termIndexOut, + indexFilesManager); + postingsWriter.setField(fieldInfo); + + docSeen.clear(); + while (true) { + BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + + IntBlockTermState termState = + (IntBlockTermState) postingsWriter.writeTerm(term, termsEnum, docSeen, norms); + termsDictWriter.add(term, termState); + } + termsDictWriter.finish(docSeen.cardinality()); + } + } @Override - public void close() throws IOException {} + public void close() throws IOException { + if (closed) { + return; + } + indexFilesManager.close(); + closed = true; + } + + /** + * Manages the output index files needed. It handles adding indexing header on creation and footer + * upon closing. + */ + class IndexFilesManager implements RandomAccessTermsDictWriter.TermDataOutputProvider { + + private final IndexOutput metaInfoOut; + + private final IndexOutput termIndexOut; + + private final HashMap termDataOutputPerType; + + public IndexFilesManager() throws IOException { + metaInfoOut = initMetaInfoOutput(); + termIndexOut = initTermIndexOutput(); + // populate the per-TermType term data outputs on-demand. 
+ termDataOutputPerType = new HashMap<>(); + } + + private IndexOutput initMetaInfoOutput() throws IOException { + final IndexOutput tmp; + tmp = getIndexOutputSafe(TERM_DICT_META_INFO_EXTENSION); + writeHeader(tmp, TERM_DICT_META_HEADER_CODEC_NAME); + postingsWriter.init(tmp, segmentWriteState); + return tmp; + } + + private IndexOutput initTermIndexOutput() throws IOException { + final IndexOutput tmp = getIndexOutputSafe(TERM_INDEX_EXTENSION); + writeHeader(tmp, TERM_INDEX_HEADER_CODEC_NAME); + return tmp; + } + + private TermDataOutput initTermDataOutput(TermType termType) throws IOException { + final IndexOutput metaTmp; + final IndexOutput dataTmp; + metaTmp = getIndexOutputSafe(TERM_DATA_META_EXTENSION_PREFIX + termType.getId()); + writeHeader(metaTmp, TERM_DATA_META_HEADER_CODEC_NAME_PREFIX + termType.getId()); + + dataTmp = getIndexOutputSafe(TERM_DATA_EXTENSION_PREFIX + termType.getId()); + writeHeader(dataTmp, TERM_DATA_HEADER_CODEC_NAME_PREFIX + termType.getId()); + + return new TermDataOutput(metaTmp, dataTmp); + } + + /** + * Get an IndexOutput for a segment local name. The output will be closed if there was any error + * happened during creation. 
+ */ + private IndexOutput getIndexOutputSafe(String segmentLocalName) throws IOException { + String name = + IndexFileNames.segmentFileName( + segmentWriteState.segmentInfo.name, + segmentWriteState.segmentSuffix, + segmentLocalName); + + boolean success = false; + IndexOutput output = null; + try { + output = segmentWriteState.directory.createOutput(name, segmentWriteState.context); + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(output); + } + } + return output; + } + + private void writeHeader(IndexOutput output, String headerName) throws IOException { + CodecUtil.writeIndexHeader( + output, + headerName, + Lucene99RandomAccessDictionaryPostingsFormat.VERSION_CURRENT, + segmentWriteState.segmentInfo.getId(), + segmentWriteState.segmentSuffix); + } + + @Override + public TermDataOutput getTermDataOutputForType(TermType termType) throws IOException { + TermDataOutput current = termDataOutputPerType.get(termType); + if (current == null) { + current = initTermDataOutput(termType); + termDataOutputPerType.put(termType, current); + } + return current; + } + + /** + * Write footers for all created index files and close them. + * + *

Assume all index files are valid upto time of calling. + */ + void close() throws IOException { + boolean success = false; + try { + CodecUtil.writeFooter(metaInfoOut); + CodecUtil.writeFooter(termIndexOut); + for (var termDataOutput : termDataOutputPerType.values()) { + CodecUtil.writeFooter(termDataOutput.metadataOutput()); + CodecUtil.writeFooter(termDataOutput.dataOutput()); + } + success = true; + } finally { + if (success) { + IOUtils.close(metaInfoOut, termIndexOut); + for (var termDataOutput : termDataOutputPerType.values()) { + IOUtils.close(termDataOutput.metadataOutput()); + IOUtils.close(termDataOutput.dataOutput()); + } + } else { + IOUtils.closeWhileHandlingException(metaInfoOut, termIndexOut); + for (var termDataOutput : termDataOutputPerType.values()) { + IOUtils.closeWhileHandlingException( + termDataOutput.metadataOutput(), termDataOutput.dataOutput()); + } + } + } + } + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java index 030d144e60d2..e039eb8b7a49 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java @@ -28,6 +28,7 @@ final class RandomAccessTermsDictWriter { /** externally provided * */ private final IndexOptions indexOptions; + private final DataOutput metaOutput; private final DataOutput indexOutput; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java index edbf1141457f..8a91ce7fd2c6 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java +++ 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -30,7 +30,7 @@ final class TermsImpl extends Terms { private final RandomAccessTermsDict termsDict; - public TermsImpl(TermsStats stats, FieldInfo fieldInfo, RandomAccessTermsDict termsDict) { + public TermsImpl(FieldInfo fieldInfo, RandomAccessTermsDict termsDict) { this.fieldInfo = fieldInfo; this.termsDict = termsDict; } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java index 2ddc5b4ee67f..6b316330ecad 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java @@ -18,7 +18,6 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; -import java.util.Arrays; import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TestTermStateCodecImpl.TermStateTestFixture; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPerBytePacker; @@ -28,6 +27,7 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.ArrayUtil; public class TestTermDataWriter extends LuceneTestCase { @@ -62,7 +62,7 @@ public void testWriterAndDeserialize() throws IOException { testFixture .codec() .encodeBlock( - Arrays.copyOfRange( + ArrayUtil.copyOfSubArray( testFixture.termStatesArray(), start, Math.min( From 4df3ad1fca0a5c1269ad126a1c22c742add05d66 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 20 Nov 2023 15:04:09 -0800 Subject: [PATCH 33/57] Move the package from sandbox to codecs Currently the tests can 
exercise PostingsFormats or other SPI provided implementation from sandbox. --- lucene/codecs/src/java/module-info.java | 4 +++- .../codecs/lucene99/randomaccess/ByteArrayByteSlice.java | 0 .../sandbox/codecs/lucene99/randomaccess/ByteSlice.java | 0 .../Lucene99RandomAccessDictionaryPostingsFormat.java | 0 .../randomaccess/Lucene99RandomAccessTermsReader.java | 0 .../randomaccess/Lucene99RandomAccessTermsWriter.java | 0 .../lucene99/randomaccess/RandomAccessInputByteSlice.java | 0 .../codecs/lucene99/randomaccess/RandomAccessTermsDict.java | 0 .../lucene99/randomaccess/RandomAccessTermsDictWriter.java | 0 .../lucene/sandbox/codecs/lucene99/randomaccess/TermData.java | 0 .../sandbox/codecs/lucene99/randomaccess/TermDataReader.java | 0 .../sandbox/codecs/lucene99/randomaccess/TermDataWriter.java | 0 .../sandbox/codecs/lucene99/randomaccess/TermStateCodec.java | 0 .../codecs/lucene99/randomaccess/TermStateCodecComponent.java | 0 .../codecs/lucene99/randomaccess/TermStateCodecImpl.java | 0 .../lucene/sandbox/codecs/lucene99/randomaccess/TermType.java | 0 .../sandbox/codecs/lucene99/randomaccess/TermsImpl.java | 0 .../sandbox/codecs/lucene99/randomaccess/TermsIndex.java | 0 .../codecs/lucene99/randomaccess/TermsIndexBuilder.java | 0 .../sandbox/codecs/lucene99/randomaccess/TermsStats.java | 0 .../codecs/lucene99/randomaccess/bitpacking/BitPacker.java | 0 .../lucene99/randomaccess/bitpacking/BitPackerImplBase.java | 0 .../codecs/lucene99/randomaccess/bitpacking/BitUnpacker.java | 0 .../lucene99/randomaccess/bitpacking/BitUnpackerImpl.java | 0 .../lucene99/randomaccess/bitpacking/DataOutputBitPacker.java | 0 .../randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java | 0 .../codecs/lucene99/randomaccess/bitpacking/package-info.java | 0 .../sandbox/codecs/lucene99/randomaccess/package-info.java | 0 .../META-INF/services/org.apache.lucene.codecs.PostingsFormat | 1 + .../randomaccess/TestRandomAccessTermsDictWriter.java | 0 
.../codecs/lucene99/randomaccess/TestTermDataWriter.java | 0 .../lucene99/randomaccess/TestTermStateCodecComponent.java | 0 .../codecs/lucene99/randomaccess/TestTermStateCodecImpl.java | 0 .../codecs/lucene99/randomaccess/TestTermsIndexBuilder.java | 0 .../sandbox/codecs/lucene99/randomaccess/TestTermsStats.java | 0 .../lucene99/randomaccess/bitpacking/BitPerBytePacker.java | 0 .../lucene99/randomaccess/bitpacking/TestBitPackerImpl.java | 0 .../lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java | 0 .../lucene99/randomaccess/bitpacking/ValueAndBitWidth.java | 0 lucene/sandbox/src/java/module-info.java | 2 -- 40 files changed, 4 insertions(+), 3 deletions(-) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java (100%) rename lucene/{sandbox => 
codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpacker.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/DataOutputBitPacker.java (100%) rename lucene/{sandbox => 
codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/package-info.java (100%) rename lucene/{sandbox => codecs}/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/package-info.java (100%) rename lucene/{sandbox => codecs}/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java (100%) rename lucene/{sandbox => codecs}/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java (100%) rename lucene/{sandbox => codecs}/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java (100%) rename lucene/{sandbox => codecs}/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java (100%) rename lucene/{sandbox => codecs}/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java (100%) rename lucene/{sandbox => codecs}/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java (100%) rename lucene/{sandbox => codecs}/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java (100%) rename lucene/{sandbox => codecs}/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java (100%) rename lucene/{sandbox => codecs}/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java (100%) rename lucene/{sandbox => codecs}/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/ValueAndBitWidth.java (100%) diff --git a/lucene/codecs/src/java/module-info.java b/lucene/codecs/src/java/module-info.java index 73f53fbf96b9..bc327c8debbd 100644 --- a/lucene/codecs/src/java/module-info.java +++ b/lucene/codecs/src/java/module-info.java @@ -33,7 +33,9 @@ 
org.apache.lucene.codecs.memory.DirectPostingsFormat, org.apache.lucene.codecs.memory.FSTPostingsFormat, org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat, - org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat; + org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat, + org.apache.lucene.sandbox.codecs.lucene99.randomaccess + .Lucene99RandomAccessDictionaryPostingsFormat; provides org.apache.lucene.codecs.Codec with org.apache.lucene.codecs.simpletext.SimpleTextCodec; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java rename to 
lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessDictionaryPostingsFormat.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java similarity index 100% rename from 
lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java rename to 
lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataWriter.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecComponent.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermType.java diff --git 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java similarity 
index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPacker.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPackerImplBase.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpacker.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpacker.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpacker.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpacker.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/DataOutputBitPacker.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/DataOutputBitPacker.java similarity index 
100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/DataOutputBitPacker.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/DataOutputBitPacker.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/FixedSizeByteArrayBitPacker.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/package-info.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/package-info.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/package-info.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/package-info.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/package-info.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/package-info.java similarity index 100% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/package-info.java rename to lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/package-info.java diff --git a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat index 09f2491c8012..e060907b8032 100644 --- 
a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat +++ b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat @@ -19,3 +19,4 @@ org.apache.lucene.codecs.memory.DirectPostingsFormat org.apache.lucene.codecs.memory.FSTPostingsFormat org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat +org.apache.lucene.sandbox.codecs.lucene99.randomaccess.Lucene99RandomAccessDictionaryPostingsFormat diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java similarity index 100% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java rename to lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java similarity index 100% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java rename to lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java similarity index 100% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java rename to 
lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecComponent.java diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java similarity index 100% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java rename to lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java similarity index 100% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java rename to lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java similarity index 100% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java rename to lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsStats.java diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java similarity index 100% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java rename to 
lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitPerBytePacker.java diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java similarity index 100% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java rename to lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitPackerImpl.java diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java similarity index 100% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java rename to lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/TestBitUnpackerImpl.java diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/ValueAndBitWidth.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/ValueAndBitWidth.java similarity index 100% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/ValueAndBitWidth.java rename to lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/ValueAndBitWidth.java diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index 45b66e7c353e..c51a25691ef2 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -22,8 +22,6 @@ exports org.apache.lucene.payloads; exports org.apache.lucene.sandbox.codecs.idversion; - exports 
org.apache.lucene.sandbox.codecs.lucene99.randomaccess; - exports org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; exports org.apache.lucene.sandbox.document; exports org.apache.lucene.sandbox.queries; exports org.apache.lucene.sandbox.search; From f57ddbb7ad1336acef87dc95ffe5582ff1a95a07 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 20 Nov 2023 15:06:34 -0800 Subject: [PATCH 34/57] Fix bugs in writing Lucene99RandomAccessTermsW bug 1: negative sumTotalTermFeqs. bug 2: not closing the postings reader/writer. --- .../randomaccess/Lucene99RandomAccessTermsReader.java | 2 +- .../randomaccess/Lucene99RandomAccessTermsWriter.java | 8 +++++--- .../randomaccess/RandomAccessTermsDictWriter.java | 4 +++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java index fac2f6e7675e..ed185751475e 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java @@ -75,7 +75,7 @@ final class Lucene99RandomAccessTermsReader extends FieldsProducer { @Override public void close() throws IOException { try { - IOUtils.close(indexFilesManager); + IOUtils.close(indexFilesManager, postingsReader); } finally { // The per-field term dictionary would be invalid once the underlying index files have been // closed. 
diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java index b38c724839fa..192702b7580d 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java @@ -18,6 +18,7 @@ import static org.apache.lucene.sandbox.codecs.lucene99.randomaccess.Lucene99RandomAccessDictionaryPostingsFormat.*; +import java.io.Closeable; import java.io.IOException; import java.util.HashMap; import org.apache.lucene.codecs.CodecUtil; @@ -99,7 +100,8 @@ public void close() throws IOException { if (closed) { return; } - indexFilesManager.close(); + IOUtils.close(indexFilesManager, postingsWriter); + closed = true; } @@ -107,7 +109,7 @@ public void close() throws IOException { * Manages the output index files needed. It handles adding indexing header on creation and footer * upon closing. */ - class IndexFilesManager implements RandomAccessTermsDictWriter.TermDataOutputProvider { + class IndexFilesManager implements RandomAccessTermsDictWriter.TermDataOutputProvider, Closeable { private final IndexOutput metaInfoOut; @@ -196,7 +198,7 @@ public TermDataOutput getTermDataOutputForType(TermType termType) throws IOExcep * *

Assume all index files are valid upto time of calling. */ - void close() throws IOException { + public void close() throws IOException { boolean success = false; try { CodecUtil.writeFooter(metaInfoOut); diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java index e039eb8b7a49..6ea363bb2a0d 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java @@ -154,7 +154,9 @@ static final class TermStatsTracker { void recordTerm(IntBlockTermState termState) { size += 1; sumDocFreq += termState.docFreq; - sumTotalTermFreq += termState.totalTermFreq; + if (termState.totalTermFreq > 0) { + sumTotalTermFreq += termState.totalTermFreq; + } } void setDocCount(int docCount) { From c66808dcde15e2fa3e1447e313d9eb7396ce0ccf Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Tue, 21 Nov 2023 15:53:50 -0800 Subject: [PATCH 35/57] Implement TermsEnum for Lucene99RandomAccess --- lucene/codecs/src/java/module-info.java | 2 + .../Lucene99RandomAccessTermsReader.java | 20 ++- .../Lucene99RandomAccessTermsWriter.java | 6 +- .../randomaccess/RandomAccessTermsDict.java | 23 ++- .../RandomAccessTermsDictWriter.java | 34 +++- .../lucene99/randomaccess/TermData.java | 29 +++- .../lucene99/randomaccess/TermDataReader.java | 20 ++- .../randomaccess/TermStateCodecImpl.java | 10 +- .../lucene99/randomaccess/TermsImpl.java | 164 +++++++++++++++++- .../lucene99/randomaccess/TermsIndex.java | 16 +- .../lucene99/randomaccess/TermsStats.java | 26 ++- .../bitpacking/BitUnpackerImpl.java | 4 + .../TestRandomAccessTermsDictWriter.java | 22 ++- .../randomaccess/TestTermDataWriter.java | 12 +- .../randomaccess/TestTermStateCodecImpl.java | 36 +++- 15 files 
changed, 355 insertions(+), 69 deletions(-) diff --git a/lucene/codecs/src/java/module-info.java b/lucene/codecs/src/java/module-info.java index bc327c8debbd..a128950ddb56 100644 --- a/lucene/codecs/src/java/module-info.java +++ b/lucene/codecs/src/java/module-info.java @@ -26,6 +26,8 @@ exports org.apache.lucene.codecs.simpletext; exports org.apache.lucene.codecs.uniformsplit; exports org.apache.lucene.codecs.uniformsplit.sharedterms; + exports org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + exports org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking; provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat, diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java index ed185751475e..ab3285af297f 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java @@ -29,6 +29,7 @@ import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.Terms; import org.apache.lucene.store.IOContext; @@ -56,14 +57,25 @@ final class Lucene99RandomAccessTermsReader extends FieldsProducer { for (int i = 0; i < numFields; i++) { RandomAccessTermsDict termsDict = RandomAccessTermsDict.deserialize( - fieldNumber -> segmentReadState.fieldInfos.fieldInfo(fieldNumber).getIndexOptions(), + new RandomAccessTermsDict.IndexOptionsProvider() { + @Override + public IndexOptions getIndexOptions(int fieldNumber) { + return 
segmentReadState.fieldInfos.fieldInfo(fieldNumber).getIndexOptions(); + } + + @Override + public boolean hasPayloads(int fieldNumber) { + return segmentReadState.fieldInfos.fieldInfo(fieldNumber).hasPayloads(); + } + }, indexFilesManager.metaInfoIn, indexFilesManager.termIndexIn, indexFilesManager); FieldInfo fieldInfo = segmentReadState.fieldInfos.fieldInfo(termsDict.termsStats().fieldNumber()); String fieldName = fieldInfo.name; - perFieldTermDict.put(fieldName, new TermsImpl(fieldInfo, termsDict)); + perFieldTermDict.put(fieldName, new TermsImpl(fieldInfo, termsDict, postingsReader)); + success = true; } } finally { if (!success) { @@ -151,7 +163,7 @@ private RandomAccessTermsDict.TermDataInput openTermDataInput(TermType termType) * error happened during open and verification. */ private IndexInput openAndChecksumIndexInputSafe( - String segmentLocalName, boolean needRandomAcees) throws IOException { + String segmentLocalName, boolean needRandomAccess) throws IOException { String name = IndexFileNames.segmentFileName( segmentReadState.segmentInfo.name, segmentReadState.segmentSuffix, segmentLocalName); @@ -161,7 +173,7 @@ private IndexInput openAndChecksumIndexInputSafe( try { input = segmentReadState.directory.openInput( - name, needRandomAcees ? IOContext.LOAD : IOContext.READ); + name, needRandomAccess ? 
IOContext.LOAD : IOContext.READ); success = true; } finally { if (!success) { diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java index 192702b7580d..bc6aebf1a8de 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java @@ -75,6 +75,7 @@ public void write(Fields fields, NormsProducer norms) throws IOException { new RandomAccessTermsDictWriter( fieldInfo.number, fieldInfo.getIndexOptions(), + fieldInfo.hasPayloads(), indexFilesManager.metaInfoOut, indexFilesManager.termIndexOut, indexFilesManager); @@ -89,7 +90,10 @@ public void write(Fields fields, NormsProducer norms) throws IOException { IntBlockTermState termState = (IntBlockTermState) postingsWriter.writeTerm(term, termsEnum, docSeen, norms); - termsDictWriter.add(term, termState); + // TermState can be null + if (termState != null) { + termsDictWriter.add(term, termState); + } } termsDictWriter.finish(docSeen.cardinality()); } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java index 39947f9ff78c..1d1c3e194f40 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java @@ -26,11 +26,14 @@ /** A term dictionary that offer random-access to read a specific term */ record RandomAccessTermsDict( - TermsStats termsStats, TermsIndex termsIndex, TermDataReader termDataReader) { + TermsStats termsStats, + 
TermsIndex termsIndex, + TermDataReader termDataReader, + IndexOptions indexOptions) { IntBlockTermState getTermState(BytesRef term) throws IOException { TermsIndex.TypeAndOrd typeAndOrd = termsIndex.getTerm(term); - return termDataReader.getTermState(typeAndOrd.termType(), typeAndOrd.ord()); + return termDataReader.getTermState(typeAndOrd.termType(), typeAndOrd.ord(), indexOptions); } static RandomAccessTermsDict deserialize( @@ -43,17 +46,21 @@ static RandomAccessTermsDict deserialize( // (1) deserialize field stats TermsStats termsStats = TermsStats.deserialize(metaInput); IndexOptions indexOptions = indexOptionsProvider.getIndexOptions(termsStats.fieldNumber()); + boolean hasPayloads = indexOptionsProvider.hasPayloads(termsStats.fieldNumber()); // (2) deserialize terms index - TermsIndex termsIndex = - TermsIndex.deserialize(metaInput, termIndexInput, /* load off heap */ true); + TermsIndex termsIndex = null; + if (termsStats.size() > 0) { + termsIndex = TermsIndex.deserialize(metaInput, termIndexInput, /* load off heap */ true); + } // (3) deserialize all the term data by each TermType // (3.1) number of unique TermType this field has int numTermTypes = metaInput.readByte(); // (3.2) read per TermType - TermDataReader.Builder termDataReaderBuilder = new TermDataReader.Builder(indexOptions); + TermDataReader.Builder termDataReaderBuilder = + new TermDataReader.Builder(indexOptions, hasPayloads); for (int i = 0; i < numTermTypes; i++) { TermType termType = TermType.fromId(metaInput.readByte()); TermDataInput termDataInput = termDataInputProvider.getTermDataInputForType(termType); @@ -61,13 +68,15 @@ static RandomAccessTermsDict deserialize( termType, metaInput, termDataInput.metadataInput, termDataInput.dataInput); } - return new RandomAccessTermsDict(termsStats, termsIndex, termDataReaderBuilder.build()); + return new RandomAccessTermsDict( + termsStats, termsIndex, termDataReaderBuilder.build(), indexOptions); } - @FunctionalInterface interface 
IndexOptionsProvider { IndexOptions getIndexOptions(int fieldNumber); + + boolean hasPayloads(int fieldNumber); } record TermDataInput(IndexInput metadataInput, IndexInput dataInput) {} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java index 6ea363bb2a0d..6a8a4a6a5f74 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java @@ -23,14 +23,15 @@ import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; /** Class to write the index files for one field. */ final class RandomAccessTermsDictWriter { /** externally provided * */ private final IndexOptions indexOptions; + private final boolean hasPayloads; private final DataOutput metaOutput; - private final DataOutput indexOutput; private final TermDataOutputProvider termDataOutputProvider; @@ -46,15 +47,17 @@ final class RandomAccessTermsDictWriter { private final TermStatsTracker termStatsTracker; - private BytesRef previousTerm; + private BytesRefBuilder previousTerm; RandomAccessTermsDictWriter( int filedNumber, IndexOptions indexOptions, + boolean hasPayloads, DataOutput metaOutput, DataOutput indexOutput, TermDataOutputProvider termDataOutputProvider) { this.indexOptions = indexOptions; + this.hasPayloads = hasPayloads; this.metaOutput = metaOutput; this.indexOutput = indexOutput; this.termDataOutputProvider = termDataOutputProvider; @@ -66,9 +69,22 @@ void add(BytesRef term, IntBlockTermState termState) throws IOException { if (previousTerm == null) { // first term, which is also the minimum term termStatsTracker.setMinTerm(term); + 
previousTerm = new BytesRefBuilder(); + } + + /* There is interesting conventions to follow... + *

+     *     org.apache.lucene.index.CheckIndex$CheckIndexException:
+     *     field "id" hasFreqs is false, but TermsEnum.totalTermFreq()=0 (should be 1)
+     * 
+ */ + // for field that do not have freq enabled, as if each posting only has one occurrence. + if (indexOptions.ordinal() < IndexOptions.DOCS_AND_FREQS.ordinal()) { + termState.totalTermFreq = termState.docFreq; } + termStatsTracker.recordTerm(termState); - previousTerm = term; + previousTerm.copyBytes(term); termsIndexBuilder.addTerm(term, termType); TermDataWriter termDataWriter = getTermDataWriterForType(termType); termDataWriter.addTermState(termState); @@ -82,7 +98,7 @@ private TermDataWriter getTermDataWriterForType(TermType termType) throws IOExce TermDataOutput termDataOutput = getTermDataOutput(termType); TermDataWriter termDataWriter = new TermDataWriter( - TermStateCodecImpl.getCodec(termType, indexOptions), + TermStateCodecImpl.getCodec(termType, indexOptions, hasPayloads), termDataOutput.metadataOutput(), termDataOutput.dataOutput()); termDataWriterPerType[termType.getId()] = termDataWriter; @@ -99,7 +115,9 @@ private TermDataOutput getTermDataOutput(TermType termType) throws IOException { void finish(int docCount) throws IOException { // finish up TermsStats for this field - termStatsTracker.setMaxTerm(previousTerm); + if (previousTerm != null) { + termStatsTracker.setMaxTerm(previousTerm.toBytesRef()); + } termStatsTracker.setDocCount(docCount); TermsStats termsStats = termStatsTracker.finish(); // (1) Write field metadata @@ -164,16 +182,14 @@ void setDocCount(int docCount) { } void setMinTerm(BytesRef minTerm) { - this.minTerm = minTerm; + this.minTerm = BytesRef.deepCopyOf(minTerm); } void setMaxTerm(BytesRef maxTerm) { - this.maxTerm = maxTerm; + this.maxTerm = BytesRef.deepCopyOf(maxTerm); } TermsStats finish() { - assert docCount > 0 && minTerm != null && maxTerm != null; - return new TermsStats( fieldNumber, size, sumTotalTermFreq, sumDocFreq, docCount, minTerm, maxTerm); } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java 
b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java index 9c74ffc83835..3860ba1a3f4b 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java @@ -22,16 +22,18 @@ import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpackerImpl; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.RandomAccessInput; import org.apache.lucene.util.BytesRef; /** * Holds the bit-packed {@link IntBlockTermState} for a given {@link * org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermType} */ -record TermData(ByteSlice metadata, ByteSlice data) { +record TermData(ByteSliceProvider metadataProvider, ByteSliceProvider dataProvider) { IntBlockTermState getTermState(TermStateCodec codec, long ord) throws IOException { + var metadata = metadataProvider.newByteSlice(); + var data = dataProvider.newByteSlice(); + long blockId = ord / TermDataWriter.NUM_TERMS_PER_BLOCK; long metadataStartPos = blockId * (codec.getMetadataBytesLength() + 8); long dataStartPos = metadata.getLong(metadataStartPos); @@ -71,21 +73,30 @@ static TermData deserializeOnHeap( metadataInput.readBytes(metadataBytes, 0, metadataBytes.length); dataInput.readBytes(dataBytes, 0, dataBytes.length); - return new TermData(new ByteArrayByteSlice(metadataBytes), new ByteArrayByteSlice(dataBytes)); + return new TermData( + () -> new ByteArrayByteSlice(metadataBytes), () -> new ByteArrayByteSlice(dataBytes)); } static TermData deserializeOffHeap( DataInput metaInput, IndexInput metadataInput, IndexInput dataInput) throws IOException { - long metadataSize = metaInput.readVLong(); - long dataSize = metaInput.readVLong(); + final long metadataSize = metaInput.readVLong(); + final long dataSize = metaInput.readVLong(); + + final long metadataStart = 
metadataInput.getFilePointer(); + final long dataStart = dataInput.getFilePointer(); - RandomAccessInput metadata = - metadataInput.randomAccessSlice(metadataInput.getFilePointer(), metadataSize); metadataInput.skipBytes(metadataSize); - RandomAccessInput data = dataInput.randomAccessSlice(dataInput.getFilePointer(), dataSize); dataInput.skipBytes(dataSize); return new TermData( - new RandomAccessInputByteSlice(metadata), new RandomAccessInputByteSlice(data)); + () -> + new RandomAccessInputByteSlice( + metadataInput.randomAccessSlice(metadataStart, metadataSize)), + () -> new RandomAccessInputByteSlice(dataInput.randomAccessSlice(dataStart, dataSize))); + } + + @FunctionalInterface + interface ByteSliceProvider { + ByteSlice newByteSlice() throws IOException; } } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java index 3a7ebd1e8a7c..fd5a44fc76b1 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java @@ -29,7 +29,8 @@ */ record TermDataReader(TermDataAndCodec[] termDataAndCodecs) { - IntBlockTermState getTermState(TermType termType, long ord) throws IOException { + IntBlockTermState getTermState(TermType termType, long ord, IndexOptions indexOptions) + throws IOException { assert termDataAndCodecs[termType.getId()] != null; var dataAndCodec = termDataAndCodecs[termType.getId()]; IntBlockTermState termState = dataAndCodec.termData.getTermState(dataAndCodec.codec, ord); @@ -46,22 +47,35 @@ IntBlockTermState getTermState(TermType termType, long ord) throws IOException { termState.lastPosBlockOffset = -1; } + /* There is interesting conventions to follow... + *
+     *     org.apache.lucene.index.CheckIndex$CheckIndexException:
+     *     field "id" hasFreqs is false, but TermsEnum.totalTermFreq()=0 (should be 1)
+     * 
+ */ + // for field that do not have freq enabled, as if each posting only has one occurrence. + if (indexOptions.ordinal() < IndexOptions.DOCS_AND_FREQS.ordinal()) { + termState.totalTermFreq = termState.docFreq; + } + return termState; } static class Builder { final IndexOptions indexOptions; + final boolean hasPayloads; final TermDataAndCodec[] termDataAndCodecs = new TermDataAndCodec[TermType.NUM_TOTAL_TYPES]; - Builder(IndexOptions indexOptions) { + Builder(IndexOptions indexOptions, boolean hasPayloads) { this.indexOptions = indexOptions; + this.hasPayloads = hasPayloads; } void readOne( TermType termType, DataInput metaIn, IndexInput termMetadataIn, IndexInput termDataIn) throws IOException { TermData termData = TermData.deserializeOffHeap(metaIn, termMetadataIn, termDataIn); - TermStateCodec codec = TermStateCodecImpl.getCodec(termType, indexOptions); + TermStateCodec codec = TermStateCodecImpl.getCodec(termType, indexOptions, hasPayloads); termDataAndCodecs[termType.getId()] = new TermDataAndCodec(termData, codec); } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java index 734e24a7a057..d1a8392a37a9 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java @@ -66,12 +66,17 @@ private static int getMetadataLength(TermStateCodecComponent component) { return 1 + (component.isMonotonicallyIncreasing() ? 
8 : 0); } - public static TermStateCodecImpl getCodec(TermType termType, IndexOptions indexOptions) { + public static TermStateCodecImpl getCodec( + TermType termType, IndexOptions indexOptions, boolean hasPayloads) { assert indexOptions.ordinal() > IndexOptions.NONE.ordinal(); // A term can't have skip data (has more than one block's worth of doc), // while having a singleton doc at the same time! assert !(termType.hasSkipData() && termType.hasSingletonDoc()); + // Can't have payload for index options that is less than POSITIONS + assert indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal() + || !hasPayloads; + ArrayList components = new ArrayList<>(); // handle docs and docFreq if (termType.hasSingletonDoc()) { @@ -92,6 +97,9 @@ public static TermStateCodecImpl getCodec(TermType termType, IndexOptions indexO // handle positions if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { components.add(PositionStartFP.INSTANCE); + if (hasPayloads) { + components.add(PayloadStartFP.INSTANCE); + } if (termType.hasLastPositionBlockOffset()) { components.add(LastPositionBlockOffset.INSTANCE); } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java index 8a91ce7fd2c6..36e861aaf6f8 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -18,21 +18,33 @@ package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; import java.io.IOException; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader; import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.IndexOptions; +import 
org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.TermState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.fst.BytesRefFSTEnum; final class TermsImpl extends Terms { private final FieldInfo fieldInfo; private final RandomAccessTermsDict termsDict; - public TermsImpl(FieldInfo fieldInfo, RandomAccessTermsDict termsDict) { + private final Lucene99PostingsReader lucene99PostingsReader; + + public TermsImpl( + FieldInfo fieldInfo, + RandomAccessTermsDict termsDict, + Lucene99PostingsReader lucene99PostingsReader) { this.fieldInfo = fieldInfo; this.termsDict = termsDict; + this.lucene99PostingsReader = lucene99PostingsReader; } @Override @@ -89,13 +101,149 @@ public BytesRef getMax() throws IOException { @Override public TermsEnum iterator() throws IOException { - // TODO: implement me - return null; + if (size() == 0) { + return TermsEnum.EMPTY; + } + return new RandomAccessTermsEnum(); } - @Override - public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { - // TODO: implement me - return null; + // TODO: implement a more efficient version via FST + // @Override + // public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException + // { + // return null; + // } + + final class RandomAccessTermsEnum extends TermsEnum { + private AttributeSource attrs; + + private BytesRef term; + + private boolean isTermStateCurrent; + + private IntBlockTermState termState; + + private final BytesRefFSTEnum fstEnum; + + private BytesRefFSTEnum.InputOutput fstSeekState; + + // Only set when seekExact(term, state) is called, because that will update + // the termState but leave the fstSeekState out of sync. + // We need to re-seek in next() calls to catch up to that term. 
+ private boolean needReSeekInNext; + + RandomAccessTermsEnum() { + termState = (IntBlockTermState) lucene99PostingsReader.newTermState(); + fstEnum = new BytesRefFSTEnum<>(termsDict.termsIndex().fst()); + } + + void updateTermStateIfNeeded() throws IOException { + if (!isTermStateCurrent && !needReSeekInNext) { + TermsIndex.TypeAndOrd typeAndOrd = TermsIndex.decodeLong(fstSeekState.output); + termState = + termsDict + .termDataReader() + .getTermState(typeAndOrd.termType(), typeAndOrd.ord(), fieldInfo.getIndexOptions()); + isTermStateCurrent = true; + } + } + + @Override + public AttributeSource attributes() { + if (attrs == null) { + attrs = new AttributeSource(); + } + return attrs; + } + + @Override + public boolean seekExact(BytesRef text) throws IOException { + fstSeekState = fstEnum.seekExact(text); + term = fstSeekState == null ? null : fstSeekState.input; + isTermStateCurrent = false; + needReSeekInNext = false; + return term != null; + } + + @Override + public SeekStatus seekCeil(BytesRef text) throws IOException { + fstSeekState = fstEnum.seekCeil(text); + term = fstSeekState == null ? null : fstSeekState.input; + isTermStateCurrent = false; + needReSeekInNext = false; + if (term == null) { + return SeekStatus.END; + } + return text.equals(term) ? 
SeekStatus.FOUND : SeekStatus.NOT_FOUND; + } + + @Override + public void seekExact(BytesRef target, TermState state) throws IOException { + if (!target.equals(term)) { + assert state instanceof IntBlockTermState; + termState.copyFrom(state); + term = BytesRef.deepCopyOf(target); + isTermStateCurrent = true; + needReSeekInNext = true; + } + } + + @Override + public BytesRef term() throws IOException { + return term; + } + + @Override + public int docFreq() throws IOException { + updateTermStateIfNeeded(); + return termState.docFreq; + } + + @Override + public long totalTermFreq() throws IOException { + updateTermStateIfNeeded(); + return termState.totalTermFreq; + } + + @Override + public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { + updateTermStateIfNeeded(); + return lucene99PostingsReader.postings(fieldInfo, termState, reuse, flags); + } + + @Override + public ImpactsEnum impacts(int flags) throws IOException { + updateTermStateIfNeeded(); + return lucene99PostingsReader.impacts(fieldInfo, termState, flags); + } + + @Override + public TermState termState() throws IOException { + updateTermStateIfNeeded(); + return termState.clone(); + } + + @Override + public BytesRef next() throws IOException { + if (needReSeekInNext) { + fstSeekState = fstEnum.seekExact(term); + assert fstSeekState != null; + } + fstSeekState = fstEnum.next(); + term = fstSeekState == null ? 
null : fstSeekState.input; + isTermStateCurrent = false; + needReSeekInNext = false; + return term; + } + + @Override + public long ord() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void seekExact(long ord) throws IOException { + throw new UnsupportedOperationException("By ord lookup not supported."); + } } } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java index d0a4c0c4c56b..9474a82bef78 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java @@ -30,15 +30,21 @@ record TermsIndex(FST fst) { TypeAndOrd getTerm(BytesRef term) throws IOException { long encoded = Util.get(fst, term); - TermType termType = TermType.fromId((int) ((encoded & 0b1110L) >>> 1)); - long ord = encoded >>> 4; - return new TypeAndOrd(termType, ord); + return decodeLong(encoded); } record TypeAndOrd(TermType termType, long ord) {} void serialize(DataOutput metaOut, DataOutput dataOut) throws IOException { - fst.save(metaOut, dataOut); + if (fst != null) { + fst.save(metaOut, dataOut); + } + } + + static TypeAndOrd decodeLong(long encoded) { + TermType termType = TermType.fromId((int) ((encoded & 0b1110L) >>> 1)); + long ord = encoded >>> 4; + return new TypeAndOrd(termType, ord); } static TermsIndex deserialize(DataInput metaIn, DataInput dataIn, boolean loadOffHeap) @@ -46,7 +52,7 @@ static TermsIndex deserialize(DataInput metaIn, DataInput dataIn, boolean loadOf FST fst; if (loadOffHeap) { var fstStore = new OffHeapFSTStore(); - fst = new FST<>(metaIn, dataIn, PositiveIntOutputs.getSingleton(), fstStore); + fst = new FST<>(metaIn, dataIn.clone(), PositiveIntOutputs.getSingleton(), fstStore); dataIn.skipBytes(fstStore.size()); } else { fst = new 
FST<>(metaIn, dataIn, PositiveIntOutputs.getSingleton()); diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java index 0c65f2e04d39..b1881475f74e 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsStats.java @@ -39,19 +39,27 @@ void serialize(DataOutput output) throws IOException { output.writeVLong(sumTotalTermFreq); output.writeVLong(sumDocFreq); output.writeVInt(docCount); - writeBytesRef(output, minTerm); - writeBytesRef(output, maxTerm); + if (minTerm != null) { + writeBytesRef(output, minTerm); + } + if (maxTerm != null) { + writeBytesRef(output, maxTerm); + } } static TermsStats deserialize(DataInput input) throws IOException { + int fieldNumber = input.readVInt(); + long size = input.readVLong(); + long sumTotalTermFreq = input.readVLong(); + long sumDocFreq = input.readVLong(); + int docCount = input.readVInt(); + BytesRef minTerm = null, maxTerm = null; + if (size > 0) { + minTerm = readBytesRef(input); + maxTerm = readBytesRef(input); + } return new TermsStats( - input.readVInt(), - input.readVLong(), - input.readVLong(), - input.readVLong(), - input.readVInt(), - readBytesRef(input), - readBytesRef(input)); + fieldNumber, size, sumTotalTermFreq, sumDocFreq, docCount, minTerm, maxTerm); } static void writeBytesRef(DataOutput output, BytesRef bytes) throws IOException { diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java index 84704c0b8787..d3a5ab210776 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java +++ 
b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/bitpacking/BitUnpackerImpl.java @@ -30,6 +30,10 @@ public long unpack(BytesRef bytesRef, int startBitIndex, int bitWidth) { assert (startBitIndex + bitWidth) <= bytesRef.length * 8; assert bitWidth < 64; + if (bitWidth == 0) { + return 0; + } + int firstByteIndex = startBitIndex / 8; int numBitsToExcludeInFirstByte = startBitIndex % 8; int lastByteIndex = (startBitIndex + bitWidth) / 8; diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java index d4b1f94aab04..226a4700813c 100644 --- a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java @@ -108,7 +108,17 @@ private static void assertDeserializedMatchingExpected( throws IOException { RandomAccessTermsDict deserialized = RandomAccessTermsDict.deserialize( - _fieldNumber -> result.indexOptions(), + new RandomAccessTermsDict.IndexOptionsProvider() { + @Override + public IndexOptions getIndexOptions(int fieldNumber) { + return result.indexOptions; + } + + @Override + public boolean hasPayloads(int fieldNumber) { + return result.hasPayloads(); + } + }, metaInput, termIndexInput, termDataInputProvider); @@ -158,9 +168,13 @@ private ExpectedResults indexOneField( int fieldNumber = nextFieldNumber++; IndexOptions indexOptions = IndexOptions.values()[random().nextInt(1, IndexOptions.values().length)]; + boolean hasPayloads = random().nextBoolean(); + if (indexOptions.ordinal() < IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + hasPayloads = false; + } RandomAccessTermsDictWriter randomAccessTermsDictWriter = new RandomAccessTermsDictWriter( - fieldNumber, indexOptions, 
metaOut, termIndexOut, outputProvider); + fieldNumber, indexOptions, hasPayloads, metaOut, termIndexOut, outputProvider); TermAndState[] expectedTermAndState = getRandoms(1000, 2000); int expectedDocCount = random().nextInt(1, 2000); @@ -169,12 +183,14 @@ private ExpectedResults indexOneField( randomAccessTermsDictWriter.add(x.term, x.state); } randomAccessTermsDictWriter.finish(expectedDocCount); - return new ExpectedResults(fieldNumber, indexOptions, expectedTermAndState, expectedDocCount); + return new ExpectedResults( + fieldNumber, indexOptions, hasPayloads, expectedTermAndState, expectedDocCount); } private record ExpectedResults( int fieldNumber, IndexOptions indexOptions, + boolean hasPayloads, TermAndState[] expectedTermAndState, int expectedDocCount) {} diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java index 6b316330ecad..e0cd887c10c6 100644 --- a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java @@ -73,7 +73,7 @@ public void testWriterAndDeserialize() throws IOException { } ByteSlice expectedDataSlice = new ByteArrayByteSlice(referenceBitPacker.getCompactBytes()); ByteSlice expectedMetadataSlice = new ByteArrayByteSlice(expectedMetadata); - TermData expected = new TermData(expectedMetadataSlice, expectedDataSlice); + TermData expected = new TermData(() -> expectedMetadataSlice, () -> expectedDataSlice); IndexInput metaIn = testDir.openInput("segment_meta", IOContext.DEFAULT); IndexInput metadataIn = testDir.openInput("term_meta_1", IOContext.DEFAULT); @@ -81,13 +81,15 @@ public void testWriterAndDeserialize() throws IOException { TermData actual = TermData.deserializeOnHeap(metaIn.clone(), metadataIn.clone(), dataIn.clone()); - 
assertByteSlice(expected.metadata(), actual.metadata()); - assertByteSlice(expected.data(), actual.data()); + assertByteSlice( + expected.metadataProvider().newByteSlice(), actual.metadataProvider().newByteSlice()); + assertByteSlice(expected.dataProvider().newByteSlice(), actual.dataProvider().newByteSlice()); testDecodeTermState(testFixture, actual); actual = TermData.deserializeOffHeap(metaIn.clone(), metadataIn.clone(), dataIn.clone()); - assertByteSlice(expected.metadata(), actual.metadata()); - assertByteSlice(expected.data(), actual.data()); + assertByteSlice( + expected.metadataProvider().newByteSlice(), actual.metadataProvider().newByteSlice()); + assertByteSlice(expected.dataProvider().newByteSlice(), actual.dataProvider().newByteSlice()); testDecodeTermState(testFixture, actual); metaIn.close(); diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java index db7630f1f35a..f9d1c416cda7 100644 --- a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermStateCodecImpl.java @@ -176,16 +176,24 @@ public void testGetCodec() { && indexOptions.ordinal() < IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { continue; } - TermType termType = TermType.fromId(i); - var expected = getExpectedCodec(termType, indexOptions); - var got = TermStateCodecImpl.getCodec(termType, indexOptions); - assertEquals(expected, got); + for (int dice = 0; dice < 2; dice++) { + boolean hasPayloads = dice == 0; + if (hasPayloads + && indexOptions.ordinal() < IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { + continue; + } + TermType termType = TermType.fromId(i); + var expected = getExpectedCodec(termType, indexOptions, hasPayloads); + var got = 
TermStateCodecImpl.getCodec(termType, indexOptions, hasPayloads); + assertEquals(expected, got); + } } } } // Enumerate the expected Codec we get for (TermType, IndexOptions) pairs. - static TermStateCodecImpl getExpectedCodec(TermType termType, IndexOptions indexOptions) { + static TermStateCodecImpl getExpectedCodec( + TermType termType, IndexOptions indexOptions, boolean hasPayloads) { ArrayList components = new ArrayList<>(); // Wish I can code this better in java... switch (termType.getId()) { @@ -201,6 +209,9 @@ static TermStateCodecImpl getExpectedCodec(TermType termType, IndexOptions index } if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + if (hasPayloads) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } } if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { @@ -220,6 +231,9 @@ static TermStateCodecImpl getExpectedCodec(TermType termType, IndexOptions index } if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + if (hasPayloads) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } } if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { @@ -240,6 +254,9 @@ static TermStateCodecImpl getExpectedCodec(TermType termType, IndexOptions index } if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.ordinal()) { components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + if (hasPayloads) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } } if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { @@ -262,6 +279,9 @@ static TermStateCodecImpl getExpectedCodec(TermType termType, IndexOptions index components.add(TermStateCodecComponent.DocFreq.INSTANCE); 
components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + if (hasPayloads) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } components.add(TermStateCodecComponent.LastPositionBlockOffset.INSTANCE); if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { @@ -277,6 +297,9 @@ static TermStateCodecImpl getExpectedCodec(TermType termType, IndexOptions index components.add(TermStateCodecComponent.SingletonDocId.INSTANCE); components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + if (hasPayloads) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } components.add(TermStateCodecComponent.LastPositionBlockOffset.INSTANCE); if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { @@ -294,6 +317,9 @@ static TermStateCodecImpl getExpectedCodec(TermType termType, IndexOptions index components.add(TermStateCodecComponent.SkipOffset.INSTANCE); components.add(TermStateCodecComponent.TotalTermFreq.INSTANCE); components.add(TermStateCodecComponent.PositionStartFP.INSTANCE); + if (hasPayloads) { + components.add(TermStateCodecComponent.PayloadStartFP.INSTANCE); + } components.add(TermStateCodecComponent.LastPositionBlockOffset.INSTANCE); if (indexOptions.ordinal() >= IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.ordinal()) { From ea572a31bd42c5f3a8901540e33afe63f9f71ba9 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Tue, 21 Nov 2023 23:54:34 -0800 Subject: [PATCH 36/57] Fix bugs found in tests 1. handle terms with no docs (due to deletes) 2. carefully handle create/open index files to make sure not files left unclosed even in case of errors. 
--- .../Lucene99RandomAccessTermsReader.java | 57 +++++-- .../Lucene99RandomAccessTermsWriter.java | 144 ++++++++++-------- .../lucene99/randomaccess/TermsImpl.java | 3 - 3 files changed, 118 insertions(+), 86 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java index ab3285af297f..4079b0e5d779 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsReader.java @@ -22,6 +22,7 @@ import java.io.Closeable; import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import org.apache.lucene.codecs.CodecUtil; @@ -44,14 +45,26 @@ final class Lucene99RandomAccessTermsReader extends FieldsProducer { private final HashMap perFieldTermDict; + private boolean closed; + Lucene99RandomAccessTermsReader( Lucene99PostingsReader postingsReader, SegmentReadState segmentReadState) throws IOException { this.postingsReader = postingsReader; this.segmentReadState = segmentReadState; - this.indexFilesManager = new IndexFilesManager(); this.perFieldTermDict = new HashMap<>(); boolean success = false; + IndexFilesManager tmpIndexFilesManager = null; try { + boolean indexManagerInitSuccess = false; + try { + tmpIndexFilesManager = new IndexFilesManager(); + this.indexFilesManager = tmpIndexFilesManager; + indexManagerInitSuccess = true; + } finally { + if (!indexManagerInitSuccess) { + IOUtils.closeWhileHandlingException(tmpIndexFilesManager); + } + } int numFields = indexFilesManager.metaInfoIn.readVInt(); assert numFields > 0; for (int i = 0; i < numFields; i++) { @@ -71,12 +84,15 @@ public boolean hasPayloads(int fieldNumber) { indexFilesManager.metaInfoIn, 
indexFilesManager.termIndexIn, indexFilesManager); - FieldInfo fieldInfo = - segmentReadState.fieldInfos.fieldInfo(termsDict.termsStats().fieldNumber()); - String fieldName = fieldInfo.name; - perFieldTermDict.put(fieldName, new TermsImpl(fieldInfo, termsDict, postingsReader)); - success = true; + + if (termsDict.termsStats().size() > 0) { + FieldInfo fieldInfo = + segmentReadState.fieldInfos.fieldInfo(termsDict.termsStats().fieldNumber()); + String fieldName = fieldInfo.name; + perFieldTermDict.put(fieldName, new TermsImpl(fieldInfo, termsDict, postingsReader)); + } } + success = true; } finally { if (!success) { IOUtils.closeWhileHandlingException(this); @@ -86,11 +102,15 @@ public boolean hasPayloads(int fieldNumber) { @Override public void close() throws IOException { + if (closed) { + return; + } try { IOUtils.close(indexFilesManager, postingsReader); } finally { // The per-field term dictionary would be invalid once the underlying index files have been // closed. + closed = true; perFieldTermDict.clear(); } } @@ -116,22 +136,25 @@ public int size() { } class IndexFilesManager implements RandomAccessTermsDict.TermDataInputProvider, Closeable { - private final IndexInput metaInfoIn; + private IndexInput metaInfoIn; - private final IndexInput termIndexIn; + private IndexInput termIndexIn; private final HashMap termDataInputPerType; + private boolean closed; + + private final ArrayList openedInputs; + public IndexFilesManager() throws IOException { + termDataInputPerType = new HashMap<>(); + openedInputs = new ArrayList<>(); metaInfoIn = initMetaInfoInput(); termIndexIn = initTermIndexInput(); - termDataInputPerType = new HashMap<>(); } private IndexInput initMetaInfoInput() throws IOException { - final IndexInput tmp; - tmp = openAndChecksumIndexInputSafe(TERM_DICT_META_INFO_EXTENSION, false); - + final IndexInput tmp = openAndChecksumIndexInputSafe(TERM_DICT_META_INFO_EXTENSION, false); checkHeader(tmp, TERM_DICT_META_HEADER_CODEC_NAME); 
postingsReader.init(tmp, segmentReadState); postingsReader.checkIntegrity(); @@ -174,10 +197,11 @@ private IndexInput openAndChecksumIndexInputSafe( input = segmentReadState.directory.openInput( name, needRandomAccess ? IOContext.LOAD : IOContext.READ); + openedInputs.add(input); success = true; } finally { if (!success) { - IOUtils.closeWhileHandlingException(input); + IOUtils.closeWhileHandlingException(input, this); } } CodecUtil.checksumEntireFile(input); @@ -207,10 +231,11 @@ public RandomAccessTermsDict.TermDataInput getTermDataInputForType(TermType term @Override public void close() throws IOException { - IOUtils.close(metaInfoIn, termIndexIn); - for (var x : termDataInputPerType.values()) { - IOUtils.close(x.metadataInput(), x.dataInput()); + if (this.closed) { + return; } + this.closed = true; + IOUtils.close(openedInputs); } } } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java index bc6aebf1a8de..3fd7fdcf111c 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermsWriter.java @@ -20,6 +20,7 @@ import java.io.Closeable; import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.FieldsConsumer; @@ -53,7 +54,17 @@ public Lucene99RandomAccessTermsWriter( throws IOException { this.segmentWriteState = segmentWriteState; this.postingsWriter = postingsWriter; - this.indexFilesManager = new IndexFilesManager(); + IndexFilesManager tmpIndexFilesManager = null; + boolean indexManagerInitSuccess = false; + try { + tmpIndexFilesManager = new IndexFilesManager(); + this.indexFilesManager = 
tmpIndexFilesManager; + indexManagerInitSuccess = true; + } finally { + if (!indexManagerInitSuccess) { + IOUtils.closeWhileHandlingException(tmpIndexFilesManager, this); + } + } } @Override @@ -65,37 +76,48 @@ public void write(Fields fields, NormsProducer norms) throws IOException { nonEmptyFields.put(field, terms); } } - indexFilesManager.metaInfoOut.writeVInt(nonEmptyFields.size()); - - FixedBitSet docSeen = new FixedBitSet(segmentWriteState.segmentInfo.maxDoc()); - for (var entry : nonEmptyFields.entrySet()) { - TermsEnum termsEnum = entry.getValue().iterator(); - FieldInfo fieldInfo = segmentWriteState.fieldInfos.fieldInfo(entry.getKey()); - RandomAccessTermsDictWriter termsDictWriter = - new RandomAccessTermsDictWriter( - fieldInfo.number, - fieldInfo.getIndexOptions(), - fieldInfo.hasPayloads(), - indexFilesManager.metaInfoOut, - indexFilesManager.termIndexOut, - indexFilesManager); - postingsWriter.setField(fieldInfo); - - docSeen.clear(); - while (true) { - BytesRef term = termsEnum.next(); - if (term == null) { - break; - } + boolean success = false; + try { + indexFilesManager.writeAllHeaders(); + postingsWriter.init(indexFilesManager.metaInfoOut, segmentWriteState); + indexFilesManager.metaInfoOut.writeVInt(nonEmptyFields.size()); + + FixedBitSet docSeen = new FixedBitSet(segmentWriteState.segmentInfo.maxDoc()); + for (var entry : nonEmptyFields.entrySet()) { + TermsEnum termsEnum = entry.getValue().iterator(); + FieldInfo fieldInfo = segmentWriteState.fieldInfos.fieldInfo(entry.getKey()); + RandomAccessTermsDictWriter termsDictWriter = + new RandomAccessTermsDictWriter( + fieldInfo.number, + fieldInfo.getIndexOptions(), + fieldInfo.hasPayloads(), + indexFilesManager.metaInfoOut, + indexFilesManager.termIndexOut, + indexFilesManager); + postingsWriter.setField(fieldInfo); + + docSeen.clear(); + while (true) { + BytesRef term = termsEnum.next(); + if (term == null) { + break; + } - IntBlockTermState termState = - (IntBlockTermState) 
postingsWriter.writeTerm(term, termsEnum, docSeen, norms); - // TermState can be null - if (termState != null) { - termsDictWriter.add(term, termState); + IntBlockTermState termState = + (IntBlockTermState) postingsWriter.writeTerm(term, termsEnum, docSeen, norms); + // TermState can be null + if (termState != null) { + termsDictWriter.add(term, termState); + } } + termsDictWriter.finish(docSeen.cardinality()); + } + indexFilesManager.writeAllFooters(); + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this); } - termsDictWriter.finish(docSeen.cardinality()); } } @@ -121,25 +143,24 @@ class IndexFilesManager implements RandomAccessTermsDictWriter.TermDataOutputPro private final HashMap termDataOutputPerType; + private boolean closed; + + private final ArrayList openedOutputs; + public IndexFilesManager() throws IOException { - metaInfoOut = initMetaInfoOutput(); - termIndexOut = initTermIndexOutput(); // populate the per-TermType term data outputs on-demand. 
termDataOutputPerType = new HashMap<>(); + openedOutputs = new ArrayList<>(); + metaInfoOut = initMetaInfoOutput(); + termIndexOut = initTermIndexOutput(); } private IndexOutput initMetaInfoOutput() throws IOException { - final IndexOutput tmp; - tmp = getIndexOutputSafe(TERM_DICT_META_INFO_EXTENSION); - writeHeader(tmp, TERM_DICT_META_HEADER_CODEC_NAME); - postingsWriter.init(tmp, segmentWriteState); - return tmp; + return getIndexOutputSafe(TERM_DICT_META_INFO_EXTENSION); } private IndexOutput initTermIndexOutput() throws IOException { - final IndexOutput tmp = getIndexOutputSafe(TERM_INDEX_EXTENSION); - writeHeader(tmp, TERM_INDEX_HEADER_CODEC_NAME); - return tmp; + return getIndexOutputSafe(TERM_INDEX_EXTENSION); } private TermDataOutput initTermDataOutput(TermType termType) throws IOException { @@ -169,6 +190,7 @@ private IndexOutput getIndexOutputSafe(String segmentLocalName) throws IOExcepti IndexOutput output = null; try { output = segmentWriteState.directory.createOutput(name, segmentWriteState.context); + openedOutputs.add(output); success = true; } finally { if (!success) { @@ -187,6 +209,17 @@ private void writeHeader(IndexOutput output, String headerName) throws IOExcepti segmentWriteState.segmentSuffix); } + private void writeAllHeaders() throws IOException { + writeHeader(metaInfoOut, TERM_DICT_META_HEADER_CODEC_NAME); + writeHeader(termIndexOut, TERM_INDEX_HEADER_CODEC_NAME); + } + + private void writeAllFooters() throws IOException { + for (var x : openedOutputs) { + CodecUtil.writeFooter(x); + } + } + @Override public TermDataOutput getTermDataOutputForType(TermType termType) throws IOException { TermDataOutput current = termDataOutputPerType.get(termType); @@ -197,36 +230,13 @@ public TermDataOutput getTermDataOutputForType(TermType termType) throws IOExcep return current; } - /** - * Write footers for all created index files and close them. - * - *

Assume all index files are valid upto time of calling. - */ + @Override public void close() throws IOException { - boolean success = false; - try { - CodecUtil.writeFooter(metaInfoOut); - CodecUtil.writeFooter(termIndexOut); - for (var termDataOutput : termDataOutputPerType.values()) { - CodecUtil.writeFooter(termDataOutput.metadataOutput()); - CodecUtil.writeFooter(termDataOutput.dataOutput()); - } - success = true; - } finally { - if (success) { - IOUtils.close(metaInfoOut, termIndexOut); - for (var termDataOutput : termDataOutputPerType.values()) { - IOUtils.close(termDataOutput.metadataOutput()); - IOUtils.close(termDataOutput.dataOutput()); - } - } else { - IOUtils.closeWhileHandlingException(metaInfoOut, termIndexOut); - for (var termDataOutput : termDataOutputPerType.values()) { - IOUtils.closeWhileHandlingException( - termDataOutput.metadataOutput(), termDataOutput.dataOutput()); - } - } + if (this.closed) { + return; } + this.closed = true; + IOUtils.close(openedOutputs); } } } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java index 36e861aaf6f8..c4a7aff819c7 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -101,9 +101,6 @@ public BytesRef getMax() throws IOException { @Override public TermsEnum iterator() throws IOException { - if (size() == 0) { - return TermsEnum.EMPTY; - } return new RandomAccessTermsEnum(); } From 5a8efd34de3a235ba02f85561247816fa2eaa6cf Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Wed, 22 Nov 2023 00:49:57 -0800 Subject: [PATCH 37/57] Reduce index cloning calls when looking up terms Only clone per-TermEnum. 
Note: I had to increase the threshold as this PostingsFormat has more index files per segment Before: java.lang.AssertionError: too many calls to IndexInput.clone during TermRangeQuery: 2878 After: 70 --- .../randomaccess/ByteSliceProvider.java | 26 +++++++ .../randomaccess/RandomAccessTermsDict.java | 5 +- .../lucene99/randomaccess/TermData.java | 55 +-------------- .../randomaccess/TermDataProvider.java | 67 +++++++++++++++++++ .../lucene99/randomaccess/TermDataReader.java | 42 +++++++++--- .../lucene99/randomaccess/TermsImpl.java | 11 ++- .../TestRandomAccessTermsDictWriter.java | 3 +- .../randomaccess/TestTermDataWriter.java | 31 +++++---- .../lucene/index/TestForTooMuchCloning.java | 2 +- 9 files changed, 158 insertions(+), 84 deletions(-) create mode 100644 lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSliceProvider.java create mode 100644 lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataProvider.java diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSliceProvider.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSliceProvider.java new file mode 100644 index 000000000000..7d18abc5e0a4 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSliceProvider.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; + +/** Factory of {@link ByteSlice} */ +@FunctionalInterface +interface ByteSliceProvider { + ByteSlice newByteSlice() throws IOException; +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java index 1d1c3e194f40..712c832d93c5 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java @@ -31,9 +31,10 @@ record RandomAccessTermsDict( TermDataReader termDataReader, IndexOptions indexOptions) { - IntBlockTermState getTermState(BytesRef term) throws IOException { + IntBlockTermState getTermState(BytesRef term, TermData[] termDataPerType) throws IOException { TermsIndex.TypeAndOrd typeAndOrd = termsIndex.getTerm(term); - return termDataReader.getTermState(typeAndOrd.termType(), typeAndOrd.ord(), indexOptions); + return termDataReader.getTermState( + typeAndOrd.termType(), typeAndOrd.ord(), indexOptions, termDataPerType); } static RandomAccessTermsDict deserialize( diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java index 3860ba1a3f4b..6eba1a0cd51c 100644 --- 
a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java @@ -20,20 +20,14 @@ import java.io.IOException; import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpackerImpl; -import org.apache.lucene.store.DataInput; -import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.BytesRef; /** * Holds the bit-packed {@link IntBlockTermState} for a given {@link * org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermType} */ -record TermData(ByteSliceProvider metadataProvider, ByteSliceProvider dataProvider) { - +record TermData(ByteSlice metadata, ByteSlice data) { IntBlockTermState getTermState(TermStateCodec codec, long ord) throws IOException { - var metadata = metadataProvider.newByteSlice(); - var data = dataProvider.newByteSlice(); - long blockId = ord / TermDataWriter.NUM_TERMS_PER_BLOCK; long metadataStartPos = blockId * (codec.getMetadataBytesLength() + 8); long dataStartPos = metadata.getLong(metadataStartPos); @@ -52,51 +46,4 @@ IntBlockTermState getTermState(TermStateCodec codec, long ord) throws IOExceptio return codec.decodeAt(metadataBytesRef, dataBytesRef, BitUnpackerImpl.INSTANCE, startBitIndex); } - - static TermData deserializeOnHeap( - DataInput metaInput, DataInput metadataInput, DataInput dataInput) throws IOException { - long metadataSize = metaInput.readVLong(); - long dataSize = metaInput.readVLong(); - - if (metadataSize > Integer.MAX_VALUE) { - throw new IllegalArgumentException( - "Metadata size it too large to store on heap. 
Must be less than " + Integer.MAX_VALUE); - } - if (dataSize > Integer.MAX_VALUE) { - throw new IllegalArgumentException( - "Data size it too large to store on heap.Must be less than " + Integer.MAX_VALUE); - } - - byte[] metadataBytes = new byte[(int) metadataSize]; - byte[] dataBytes = new byte[(int) dataSize]; - - metadataInput.readBytes(metadataBytes, 0, metadataBytes.length); - dataInput.readBytes(dataBytes, 0, dataBytes.length); - - return new TermData( - () -> new ByteArrayByteSlice(metadataBytes), () -> new ByteArrayByteSlice(dataBytes)); - } - - static TermData deserializeOffHeap( - DataInput metaInput, IndexInput metadataInput, IndexInput dataInput) throws IOException { - final long metadataSize = metaInput.readVLong(); - final long dataSize = metaInput.readVLong(); - - final long metadataStart = metadataInput.getFilePointer(); - final long dataStart = dataInput.getFilePointer(); - - metadataInput.skipBytes(metadataSize); - dataInput.skipBytes(dataSize); - - return new TermData( - () -> - new RandomAccessInputByteSlice( - metadataInput.randomAccessSlice(metadataStart, metadataSize)), - () -> new RandomAccessInputByteSlice(dataInput.randomAccessSlice(dataStart, dataSize))); - } - - @FunctionalInterface - interface ByteSliceProvider { - ByteSlice newByteSlice() throws IOException; - } } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataProvider.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataProvider.java new file mode 100644 index 000000000000..130094016c5d --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataProvider.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.IndexInput; + +/** Factory class to produce instances of TermData */ +record TermDataProvider(ByteSliceProvider metadataProvider, ByteSliceProvider dataProvider) { + static TermDataProvider deserializeOnHeap( + DataInput metaInput, DataInput metadataInput, DataInput dataInput) throws IOException { + long metadataSize = metaInput.readVLong(); + long dataSize = metaInput.readVLong(); + + if (metadataSize > Integer.MAX_VALUE) { + throw new IllegalArgumentException( + "Metadata size it too large to store on heap. 
Must be less than " + Integer.MAX_VALUE); + } + if (dataSize > Integer.MAX_VALUE) { + throw new IllegalArgumentException( + "Data size it too large to store on heap.Must be less than " + Integer.MAX_VALUE); + } + + byte[] metadataBytes = new byte[(int) metadataSize]; + byte[] dataBytes = new byte[(int) dataSize]; + + metadataInput.readBytes(metadataBytes, 0, metadataBytes.length); + dataInput.readBytes(dataBytes, 0, dataBytes.length); + + return new TermDataProvider( + () -> new ByteArrayByteSlice(metadataBytes), () -> new ByteArrayByteSlice(dataBytes)); + } + + static TermDataProvider deserializeOffHeap( + DataInput metaInput, IndexInput metadataInput, IndexInput dataInput) throws IOException { + final long metadataSize = metaInput.readVLong(); + final long dataSize = metaInput.readVLong(); + + final long metadataStart = metadataInput.getFilePointer(); + final long dataStart = dataInput.getFilePointer(); + + metadataInput.skipBytes(metadataSize); + dataInput.skipBytes(dataSize); + + return new TermDataProvider( + () -> + new RandomAccessInputByteSlice( + metadataInput.randomAccessSlice(metadataStart, metadataSize)), + () -> new RandomAccessInputByteSlice(dataInput.randomAccessSlice(dataStart, dataSize))); + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java index fd5a44fc76b1..7d9b701f9ab1 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java @@ -27,13 +27,17 @@ * Holds all {@link TermData} per {@link TermType} for a field. Also manages the proper codec needed * per TermType. 
*/ -record TermDataReader(TermDataAndCodec[] termDataAndCodecs) { +record TermDataReader(TermDataProviderAndCodec[] termDataProviderAndCodecs) { - IntBlockTermState getTermState(TermType termType, long ord, IndexOptions indexOptions) + IntBlockTermState getTermState( + TermType termType, long ord, IndexOptions indexOptions, TermData[] termDataPerType) throws IOException { - assert termDataAndCodecs[termType.getId()] != null; - var dataAndCodec = termDataAndCodecs[termType.getId()]; - IntBlockTermState termState = dataAndCodec.termData.getTermState(dataAndCodec.codec, ord); + assert termDataProviderAndCodecs[termType.getId()] != null; + assert termDataPerType.length == termDataProviderAndCodecs.length; + assert termDataPerType[termType.getId()] != null; + + var codec = termDataProviderAndCodecs[termType.getId()].codec; + IntBlockTermState termState = termDataPerType[termType.getId()].getTermState(codec, ord); // need to filling some default values for the term state // in order to meet the expectations of the postings reader @@ -61,10 +65,26 @@ IntBlockTermState getTermState(TermType termType, long ord, IndexOptions indexOp return termState; } + TermData[] newPerTypeTermDataReference() throws IOException { + TermData[] result = new TermData[termDataProviderAndCodecs.length]; + for (int i = 0; i < result.length; i++) { + if (termDataProviderAndCodecs[i] == null) { + continue; + } + TermDataProvider termDataProvider = termDataProviderAndCodecs[i].termDataProvider; + result[i] = + new TermData( + termDataProvider.metadataProvider().newByteSlice(), + termDataProvider.dataProvider().newByteSlice()); + } + return result; + } + static class Builder { final IndexOptions indexOptions; final boolean hasPayloads; - final TermDataAndCodec[] termDataAndCodecs = new TermDataAndCodec[TermType.NUM_TOTAL_TYPES]; + final TermDataProviderAndCodec[] termDataProviderAndCodecs = + new TermDataProviderAndCodec[TermType.NUM_TOTAL_TYPES]; Builder(IndexOptions indexOptions, boolean 
hasPayloads) { this.indexOptions = indexOptions; @@ -74,15 +94,17 @@ static class Builder { void readOne( TermType termType, DataInput metaIn, IndexInput termMetadataIn, IndexInput termDataIn) throws IOException { - TermData termData = TermData.deserializeOffHeap(metaIn, termMetadataIn, termDataIn); + TermDataProvider termDataProvider = + TermDataProvider.deserializeOffHeap(metaIn, termMetadataIn, termDataIn); TermStateCodec codec = TermStateCodecImpl.getCodec(termType, indexOptions, hasPayloads); - termDataAndCodecs[termType.getId()] = new TermDataAndCodec(termData, codec); + termDataProviderAndCodecs[termType.getId()] = + new TermDataProviderAndCodec(termDataProvider, codec); } TermDataReader build() { - return new TermDataReader(termDataAndCodecs); + return new TermDataReader(termDataProviderAndCodecs); } } - record TermDataAndCodec(TermData termData, TermStateCodec codec) {} + record TermDataProviderAndCodec(TermDataProvider termDataProvider, TermStateCodec codec) {} } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java index c4a7aff819c7..c521cba8a8ce 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -129,9 +129,12 @@ final class RandomAccessTermsEnum extends TermsEnum { // We need to re-seek in next() calls to catch up to that term. 
private boolean needReSeekInNext; - RandomAccessTermsEnum() { + private TermData[] perTypeTermData; + + RandomAccessTermsEnum() throws IOException { termState = (IntBlockTermState) lucene99PostingsReader.newTermState(); fstEnum = new BytesRefFSTEnum<>(termsDict.termsIndex().fst()); + perTypeTermData = termsDict.termDataReader().newPerTypeTermDataReference(); } void updateTermStateIfNeeded() throws IOException { @@ -140,7 +143,11 @@ void updateTermStateIfNeeded() throws IOException { termState = termsDict .termDataReader() - .getTermState(typeAndOrd.termType(), typeAndOrd.ord(), fieldInfo.getIndexOptions()); + .getTermState( + typeAndOrd.termType(), + typeAndOrd.ord(), + fieldInfo.getIndexOptions(), + perTypeTermData); isTermStateCurrent = true; } } diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java index 226a4700813c..b02d4de0cebf 100644 --- a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java @@ -138,9 +138,10 @@ public boolean hasPayloads(int fieldNumber) { result.expectedTermAndState()[result.expectedTermAndState().length - 1].term, deserialized.termsStats().maxTerm()); + TermData[] perTypeTermData = deserialized.termDataReader().newPerTypeTermDataReference(); for (var x : result.expectedTermAndState()) { IntBlockTermState expectedState = x.state; - IntBlockTermState actualState = deserialized.getTermState(x.term); + IntBlockTermState actualState = deserialized.getTermState(x.term, perTypeTermData); if (expectedState.singletonDocID != -1) { assertEquals(expectedState.singletonDocID, actualState.singletonDocID); } else { diff --git 
a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java index e0cd887c10c6..fc1b7b0f269b 100644 --- a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermDataWriter.java @@ -73,24 +73,23 @@ public void testWriterAndDeserialize() throws IOException { } ByteSlice expectedDataSlice = new ByteArrayByteSlice(referenceBitPacker.getCompactBytes()); ByteSlice expectedMetadataSlice = new ByteArrayByteSlice(expectedMetadata); - TermData expected = new TermData(() -> expectedMetadataSlice, () -> expectedDataSlice); + TermData expected = new TermData(expectedMetadataSlice, expectedDataSlice); IndexInput metaIn = testDir.openInput("segment_meta", IOContext.DEFAULT); IndexInput metadataIn = testDir.openInput("term_meta_1", IOContext.DEFAULT); IndexInput dataIn = testDir.openInput("term_data_11", IOContext.DEFAULT); - TermData actual = - TermData.deserializeOnHeap(metaIn.clone(), metadataIn.clone(), dataIn.clone()); - assertByteSlice( - expected.metadataProvider().newByteSlice(), actual.metadataProvider().newByteSlice()); - assertByteSlice(expected.dataProvider().newByteSlice(), actual.dataProvider().newByteSlice()); - testDecodeTermState(testFixture, actual); + TermDataProvider actualProvider = + TermDataProvider.deserializeOnHeap(metaIn.clone(), metadataIn.clone(), dataIn.clone()); + assertByteSlice(expected.metadata(), actualProvider.metadataProvider().newByteSlice()); + assertByteSlice(expected.data(), actualProvider.dataProvider().newByteSlice()); + testDecodeTermState(testFixture, actualProvider); - actual = TermData.deserializeOffHeap(metaIn.clone(), metadataIn.clone(), dataIn.clone()); - assertByteSlice( - expected.metadataProvider().newByteSlice(), 
actual.metadataProvider().newByteSlice()); - assertByteSlice(expected.dataProvider().newByteSlice(), actual.dataProvider().newByteSlice()); - testDecodeTermState(testFixture, actual); + actualProvider = + TermDataProvider.deserializeOnHeap(metaIn.clone(), metadataIn.clone(), dataIn.clone()); + assertByteSlice(expected.metadata(), actualProvider.metadataProvider().newByteSlice()); + assertByteSlice(expected.data(), actualProvider.dataProvider().newByteSlice()); + testDecodeTermState(testFixture, actualProvider); metaIn.close(); metadataIn.close(); @@ -98,8 +97,12 @@ public void testWriterAndDeserialize() throws IOException { } } - private static void testDecodeTermState(TermStateTestFixture testFixture, TermData actual) - throws IOException { + private static void testDecodeTermState( + TermStateTestFixture testFixture, TermDataProvider actualProvider) throws IOException { + TermData actual = + new TermData( + actualProvider.metadataProvider().newByteSlice(), + actualProvider.dataProvider().newByteSlice()); for (int i = 0; i < testFixture.termStatesArray().length; i++) { IntBlockTermState expectedTermState = testFixture.termStatesArray()[i]; IntBlockTermState decoded = actual.getTermState(testFixture.codec(), i); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java b/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java index 7c72b3d2e76a..97454969be90 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java @@ -80,7 +80,7 @@ public void test() throws Exception { // System.out.println("query clone count=" + queryCloneCount); assertTrue( "too many calls to IndexInput.clone during TermRangeQuery: " + queryCloneCount, - queryCloneCount < 50); + queryCloneCount < 100); r.close(); dir.close(); } From 8ab91393fe7e20919ed5bb3c5536eecfe161c5ba Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Wed, 22 Nov 2023 10:19:17 -0800 
Subject: [PATCH 38/57] Add Lucene99RandomAccessTermDictCodec --- lucene/codecs/src/java/module-info.java | 3 +- .../Lucene99RandomAccessTermDictCodec.java | 42 +++++++++++++++++++ .../services/org.apache.lucene.codecs.Codec | 1 + 3 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermDictCodec.java diff --git a/lucene/codecs/src/java/module-info.java b/lucene/codecs/src/java/module-info.java index a128950ddb56..b7e75f5917c0 100644 --- a/lucene/codecs/src/java/module-info.java +++ b/lucene/codecs/src/java/module-info.java @@ -39,5 +39,6 @@ org.apache.lucene.sandbox.codecs.lucene99.randomaccess .Lucene99RandomAccessDictionaryPostingsFormat; provides org.apache.lucene.codecs.Codec with - org.apache.lucene.codecs.simpletext.SimpleTextCodec; + org.apache.lucene.codecs.simpletext.SimpleTextCodec, + org.apache.lucene.sandbox.codecs.lucene99.randomaccess.Lucene99RandomAccessTermDictCodec; } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermDictCodec.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermDictCodec.java new file mode 100644 index 000000000000..edb6265c974a --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermDictCodec.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99Codec; +import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; + +public class Lucene99RandomAccessTermDictCodec extends FilterCodec { + private final Lucene99RandomAccessDictionaryPostingsFormat lucene99RandomAccessPostingsFormat = + new Lucene99RandomAccessDictionaryPostingsFormat(); + + public Lucene99RandomAccessTermDictCodec() { + super("Lucene99RandomAccessTermDict", new Lucene99Codec()); + } + + @Override + public PostingsFormat postingsFormat() { + return new PerFieldPostingsFormat() { + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return lucene99RandomAccessPostingsFormat; + } + }; + } +} diff --git a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index fcd5ded3605c..bf0e25322963 100644 --- a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -14,3 +14,4 @@ # limitations under the License. 
org.apache.lucene.codecs.simpletext.SimpleTextCodec +org.apache.lucene.sandbox.codecs.lucene99.randomaccess.Lucene99RandomAccessTermDictCodec From 10d4181cd837afe3fcd8e837cc16bc90e70ae25b Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Wed, 22 Nov 2023 10:35:04 -0800 Subject: [PATCH 39/57] Fix build after merging from apache:main --- .../randomaccess/RandomAccessTermsDictWriter.java | 6 ++++-- .../sandbox/codecs/lucene99/randomaccess/TermsIndex.java | 8 ++++++-- .../codecs/lucene99/randomaccess/TermsIndexBuilder.java | 7 ++++--- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java index 6a8a4a6a5f74..5002f81c03ea 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDictWriter.java @@ -40,7 +40,7 @@ final class RandomAccessTermsDictWriter { private final TermDataOutput[] termDataOutputPerType = new TermDataOutput[TermType.NUM_TOTAL_TYPES]; - private final TermsIndexBuilder termsIndexBuilder = new TermsIndexBuilder(); + private final TermsIndexBuilder termsIndexBuilder; private final TermDataWriter[] termDataWriterPerType = new TermDataWriter[TermType.NUM_TOTAL_TYPES]; @@ -55,13 +55,15 @@ final class RandomAccessTermsDictWriter { boolean hasPayloads, DataOutput metaOutput, DataOutput indexOutput, - TermDataOutputProvider termDataOutputProvider) { + TermDataOutputProvider termDataOutputProvider) + throws IOException { this.indexOptions = indexOptions; this.hasPayloads = hasPayloads; this.metaOutput = metaOutput; this.indexOutput = indexOutput; this.termDataOutputProvider = termDataOutputProvider; this.termStatsTracker = new TermStatsTracker(filedNumber); + 
this.termsIndexBuilder = new TermsIndexBuilder(); } void add(BytesRef term, IntBlockTermState termState) throws IOException { diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java index 9474a82bef78..a802026f9cb2 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndex.java @@ -52,10 +52,14 @@ static TermsIndex deserialize(DataInput metaIn, DataInput dataIn, boolean loadOf FST fst; if (loadOffHeap) { var fstStore = new OffHeapFSTStore(); - fst = new FST<>(metaIn, dataIn.clone(), PositiveIntOutputs.getSingleton(), fstStore); + fst = + new FST<>( + FST.readMetadata(metaIn, PositiveIntOutputs.getSingleton()), + dataIn.clone(), + fstStore); dataIn.skipBytes(fstStore.size()); } else { - fst = new FST<>(metaIn, dataIn, PositiveIntOutputs.getSingleton()); + fst = new FST<>(FST.readMetadata(metaIn, PositiveIntOutputs.getSingleton()), dataIn); } return new TermsIndex(fst); } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java index d142420d4470..35dd42e81cd5 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java @@ -34,10 +34,11 @@ final class TermsIndexBuilder { private static final long MAX_ORD = (1L << 60) - 1; private final long[] countPerType = new long[TermType.NUM_TOTAL_TYPES]; - private final FSTCompiler fstCompiler = - new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()).build(); + private final FSTCompiler fstCompiler; - 
TermsIndexBuilder() { + TermsIndexBuilder() throws IOException { + fstCompiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()).build(); Arrays.fill(countPerType, -1); } From ac1b77ff167bbb3e7967c5f78bbab3a8e1123917 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Wed, 22 Nov 2023 11:15:09 -0800 Subject: [PATCH 40/57] Add missing javadoc --- .../randomaccess/Lucene99RandomAccessTermDictCodec.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermDictCodec.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermDictCodec.java index edb6265c974a..255da4ed80cb 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermDictCodec.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/Lucene99RandomAccessTermDictCodec.java @@ -22,6 +22,10 @@ import org.apache.lucene.codecs.lucene99.Lucene99Codec; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; +/** + * A Codec that uses {@link Lucene99RandomAccessDictionaryPostingsFormat} on top of {@link + * Lucene99Codec} + */ public class Lucene99RandomAccessTermDictCodec extends FilterCodec { private final Lucene99RandomAccessDictionaryPostingsFormat lucene99RandomAccessPostingsFormat = new Lucene99RandomAccessDictionaryPostingsFormat(); From aa0074d9139114179e6d2e271d5382eb3d856d20 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Fri, 24 Nov 2023 20:17:32 -0800 Subject: [PATCH 41/57] Optimize for less allocation --- .../randomaccess/ByteArrayByteSlice.java | 8 ++ .../lucene99/randomaccess/ByteSlice.java | 2 + .../RandomAccessInputByteSlice.java | 8 ++ .../randomaccess/RandomAccessTermsDict.java | 14 +- .../lucene99/randomaccess/TermData.java | 22 +++ .../lucene99/randomaccess/TermDataReader.java | 110 --------------- 
.../randomaccess/TermDataReaderProvider.java | 125 ++++++++++++++++++ .../lucene99/randomaccess/TermStateCodec.java | 3 + .../randomaccess/TermStateCodecImpl.java | 20 ++- .../lucene99/randomaccess/TermsImpl.java | 13 +- .../TestRandomAccessTermsDictWriter.java | 3 +- 11 files changed, 200 insertions(+), 128 deletions(-) delete mode 100644 lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java create mode 100644 lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java index 55139ebf3a32..269d1e4753ec 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteArrayByteSlice.java @@ -52,4 +52,12 @@ public byte[] getBytes(long pos, int length) { System.arraycopy(bytes, (int) pos, result, 0, length); return result; } + + @Override + public void readBytesTo(byte[] destination, long pos, int length) { + if (length == 0) { + return; + } + System.arraycopy(bytes, (int) pos, destination, 0, length); + } } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java index 937e915e3325..1a3a8a8f0f96 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/ByteSlice.java @@ -29,4 +29,6 @@ interface ByteSlice { long getLong(long pos) throws IOException; byte[] getBytes(long pos, int length) throws IOException; + + void readBytesTo(byte[] destination, long pos, int 
length) throws IOException; } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java index 3d80e50dd383..845b0f22aed4 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessInputByteSlice.java @@ -55,4 +55,12 @@ public byte[] getBytes(long pos, int length) throws IOException { randomAccessInput.readBytes(pos, result, 0, length); return result; } + + @Override + public void readBytesTo(byte[] destination, long pos, int length) throws IOException { + if (length == 0) { + return; + } + randomAccessInput.readBytes(pos, destination, 0, length); + } } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java index 712c832d93c5..f767c2d4ed99 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java @@ -28,13 +28,15 @@ record RandomAccessTermsDict( TermsStats termsStats, TermsIndex termsIndex, - TermDataReader termDataReader, + TermDataReaderProvider termDataReaderProvider, IndexOptions indexOptions) { - IntBlockTermState getTermState(BytesRef term, TermData[] termDataPerType) throws IOException { + /** test only * */ + IntBlockTermState getTermState(BytesRef term) throws IOException { TermsIndex.TypeAndOrd typeAndOrd = termsIndex.getTerm(term); - return termDataReader.getTermState( - typeAndOrd.termType(), typeAndOrd.ord(), indexOptions, termDataPerType); + return termDataReaderProvider + .newReader() + 
.getTermState(typeAndOrd.termType(), typeAndOrd.ord(), indexOptions); } static RandomAccessTermsDict deserialize( @@ -60,8 +62,8 @@ static RandomAccessTermsDict deserialize( int numTermTypes = metaInput.readByte(); // (3.2) read per TermType - TermDataReader.Builder termDataReaderBuilder = - new TermDataReader.Builder(indexOptions, hasPayloads); + TermDataReaderProvider.Builder termDataReaderBuilder = + new TermDataReaderProvider.Builder(indexOptions, hasPayloads); for (int i = 0; i < numTermTypes; i++) { TermType termType = TermType.fromId(metaInput.readByte()); TermDataInput termDataInput = termDataInputProvider.getTermDataInputForType(termType); diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java index 6eba1a0cd51c..c72bef50451e 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java @@ -46,4 +46,26 @@ IntBlockTermState getTermState(TermStateCodec codec, long ord) throws IOExceptio return codec.decodeAt(metadataBytesRef, dataBytesRef, BitUnpackerImpl.INSTANCE, startBitIndex); } + + IntBlockTermState getTermStateWithBuffer( + TermStateCodec codec, long ord, byte[] metaDataBuffer, byte[] dataBuffer) throws IOException { + long blockId = ord / TermDataWriter.NUM_TERMS_PER_BLOCK; + long metadataStartPos = blockId * (codec.getMetadataBytesLength() + 8); + long dataStartPos = metadata.getLong(metadataStartPos); + + metadata.readBytesTo(metaDataBuffer, metadataStartPos + 8, codec.getMetadataBytesLength()); + BytesRef metadataBytesRef = new BytesRef(metaDataBuffer); + + int numBitsPerRecord = codec.getNumBitsPerRecord(metadataBytesRef); + int dataBitIndex = numBitsPerRecord * ((int) (ord % TermDataWriter.NUM_TERMS_PER_BLOCK)); + int startBitIndex = dataBitIndex % 8; + int 
numBytesToRead = (startBitIndex + numBitsPerRecord) / 8; + if ((startBitIndex + numBitsPerRecord) % 8 > 0) { + numBytesToRead += 1; + } + data.readBytesTo(dataBuffer, dataStartPos + dataBitIndex / 8, numBytesToRead); + BytesRef dataBytesRef = new BytesRef(dataBuffer, 0, numBytesToRead); + + return codec.decodeAt(metadataBytesRef, dataBytesRef, BitUnpackerImpl.INSTANCE, startBitIndex); + } } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java deleted file mode 100644 index 7d9b701f9ab1..000000000000 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReader.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; - -import java.io.IOException; -import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; -import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.store.DataInput; -import org.apache.lucene.store.IndexInput; - -/** - * Holds all {@link TermData} per {@link TermType} for a field. 
Also manages the proper codec needed - * per TermType. - */ -record TermDataReader(TermDataProviderAndCodec[] termDataProviderAndCodecs) { - - IntBlockTermState getTermState( - TermType termType, long ord, IndexOptions indexOptions, TermData[] termDataPerType) - throws IOException { - assert termDataProviderAndCodecs[termType.getId()] != null; - assert termDataPerType.length == termDataProviderAndCodecs.length; - assert termDataPerType[termType.getId()] != null; - - var codec = termDataProviderAndCodecs[termType.getId()].codec; - IntBlockTermState termState = termDataPerType[termType.getId()].getTermState(codec, ord); - - // need to filling some default values for the term state - // in order to meet the expectations of the postings reader - if (termType.hasSingletonDoc()) { - termState.docFreq = 1; - } - if (termType.hasSkipData() == false) { - termState.skipOffset = -1; - } - if (termType.hasLastPositionBlockOffset() == false) { - termState.lastPosBlockOffset = -1; - } - - /* There is interesting conventions to follow... - *

-     *     org.apache.lucene.index.CheckIndex$CheckIndexException:
-     *     field "id" hasFreqs is false, but TermsEnum.totalTermFreq()=0 (should be 1)
-     * 
- */ - // for field that do not have freq enabled, as if each posting only has one occurrence. - if (indexOptions.ordinal() < IndexOptions.DOCS_AND_FREQS.ordinal()) { - termState.totalTermFreq = termState.docFreq; - } - - return termState; - } - - TermData[] newPerTypeTermDataReference() throws IOException { - TermData[] result = new TermData[termDataProviderAndCodecs.length]; - for (int i = 0; i < result.length; i++) { - if (termDataProviderAndCodecs[i] == null) { - continue; - } - TermDataProvider termDataProvider = termDataProviderAndCodecs[i].termDataProvider; - result[i] = - new TermData( - termDataProvider.metadataProvider().newByteSlice(), - termDataProvider.dataProvider().newByteSlice()); - } - return result; - } - - static class Builder { - final IndexOptions indexOptions; - final boolean hasPayloads; - final TermDataProviderAndCodec[] termDataProviderAndCodecs = - new TermDataProviderAndCodec[TermType.NUM_TOTAL_TYPES]; - - Builder(IndexOptions indexOptions, boolean hasPayloads) { - this.indexOptions = indexOptions; - this.hasPayloads = hasPayloads; - } - - void readOne( - TermType termType, DataInput metaIn, IndexInput termMetadataIn, IndexInput termDataIn) - throws IOException { - TermDataProvider termDataProvider = - TermDataProvider.deserializeOffHeap(metaIn, termMetadataIn, termDataIn); - TermStateCodec codec = TermStateCodecImpl.getCodec(termType, indexOptions, hasPayloads); - termDataProviderAndCodecs[termType.getId()] = - new TermDataProviderAndCodec(termDataProvider, codec); - } - - TermDataReader build() { - return new TermDataReader(termDataProviderAndCodecs); - } - } - - record TermDataProviderAndCodec(TermDataProvider termDataProvider, TermStateCodec codec) {} -} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java new file mode 100644 index 000000000000..3572cc90773e --- 
/dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import java.io.IOException; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.IndexInput; + +/** Factory class for {@link TermDataReader} which supports term lookup */ +record TermDataReaderProvider(TermDataProviderAndCodec[] termDataProviderAndCodecs) { + + TermDataReader newReader() throws IOException { + return new TermDataReader(); + } + + static class Builder { + final IndexOptions indexOptions; + final boolean hasPayloads; + final TermDataProviderAndCodec[] termDataProviderAndCodecs = + new TermDataProviderAndCodec[TermType.NUM_TOTAL_TYPES]; + + Builder(IndexOptions indexOptions, boolean hasPayloads) { + this.indexOptions = indexOptions; + this.hasPayloads = hasPayloads; + } + + void readOne( + TermType termType, DataInput metaIn, IndexInput termMetadataIn, IndexInput termDataIn) + throws IOException { + 
TermDataProvider termDataProvider = + TermDataProvider.deserializeOffHeap(metaIn, termMetadataIn, termDataIn); + TermStateCodec codec = TermStateCodecImpl.getCodec(termType, indexOptions, hasPayloads); + termDataProviderAndCodecs[termType.getId()] = + new TermDataProviderAndCodec(termDataProvider, codec); + } + + TermDataReaderProvider build() { + return new TermDataReaderProvider(termDataProviderAndCodecs); + } + } + + record TermDataProviderAndCodec(TermDataProvider termDataProvider, TermStateCodec codec) {} + + public class TermDataReader { + private final TermData[] termDataPerType; + + private final byte[][] metaDataBufferPerType; + + private final byte[][] dataBufferPerType; + + TermDataReader() throws IOException { + termDataPerType = new TermData[termDataProviderAndCodecs.length]; + metaDataBufferPerType = new byte[termDataProviderAndCodecs.length][]; + dataBufferPerType = new byte[termDataProviderAndCodecs.length][]; + + for (int i = 0; i < termDataProviderAndCodecs.length; i++) { + if (termDataProviderAndCodecs[i] == null) { + continue; + } + var codec = termDataProviderAndCodecs[i].codec; + TermDataProvider termDataProvider = termDataProviderAndCodecs[i].termDataProvider; + termDataPerType[i] = + new TermData( + termDataProvider.metadataProvider().newByteSlice(), + termDataProvider.dataProvider().newByteSlice()); + metaDataBufferPerType[i] = new byte[codec.getMetadataBytesLength()]; + dataBufferPerType[i] = new byte[codec.getMaximumRecordSizeInBytes()]; + } + } + + IntBlockTermState getTermState(TermType termType, long ord, IndexOptions indexOptions) + throws IOException { + assert termDataProviderAndCodecs[termType.getId()] != null; + assert termDataPerType[termType.getId()] != null; + + int typeId = termType.getId(); + var codec = termDataProviderAndCodecs[termType.getId()].codec; + IntBlockTermState termState = + termDataPerType[typeId].getTermStateWithBuffer( + codec, ord, metaDataBufferPerType[typeId], dataBufferPerType[typeId]); + + // need to 
filling some default values for the term state + // in order to meet the expectations of the postings reader + if (termType.hasSingletonDoc()) { + termState.docFreq = 1; + } + if (termType.hasSkipData() == false) { + termState.skipOffset = -1; + } + if (termType.hasLastPositionBlockOffset() == false) { + termState.lastPosBlockOffset = -1; + } + + /* There is interesting conventions to follow... + *
+       *     org.apache.lucene.index.CheckIndex$CheckIndexException:
+       *     field "id" hasFreqs is false, but TermsEnum.totalTermFreq()=0 (should be 1)
+       * 
+ */ + // for field that do not have freq enabled, as if each posting only has one occurrence. + if (indexOptions.ordinal() < IndexOptions.DOCS_AND_FREQS.ordinal()) { + termState.totalTermFreq = termState.docFreq; + } + + return termState; + } + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java index 283512c7ae6a..1ef79ab7f158 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java @@ -28,6 +28,9 @@ interface TermStateCodec { /** Get the number of bytes that the metadata per block needs. */ int getMetadataBytesLength(); + /** Get the maximum span of a record in terms of bytes */ + int getMaximumRecordSizeInBytes(); + /** Get the number of bits per data record within the block, based on the provided metadata. */ int getNumBitsPerRecord(BytesRef metadataBytes); diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java index d1a8392a37a9..319c14faafba 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java @@ -51,6 +51,13 @@ public TermStateCodecImpl(TermStateCodecComponent[] components) { this.metadataBytesLength = metadataBytesLength; } + @Override + public int getMaximumRecordSizeInBytes() { + // worst case: no compression at all, so each component taks 8 byte. + // two extra bytes when the record takes partial byte at the start and end. 
+ return components.length * 8 + 2; + } + @Override public int getMetadataBytesLength() { return metadataBytesLength; @@ -58,7 +65,18 @@ public int getMetadataBytesLength() { @Override public int getNumBitsPerRecord(BytesRef metadataBytes) { - return deserializedMetadata(metadataBytes).totalBitsPerTermState; + int upto = metadataBytes.offset; + int totalBitsPerTermState = 0; + + for (var component : components) { + byte bitWidth = metadataBytes.bytes[upto++]; + if (component.isMonotonicallyIncreasing()) { + upto += 8; + } + totalBitsPerTermState += bitWidth; + } + + return totalBitsPerTermState; } private static int getMetadataLength(TermStateCodecComponent component) { diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java index c521cba8a8ce..d3977e4d5252 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -129,25 +129,20 @@ final class RandomAccessTermsEnum extends TermsEnum { // We need to re-seek in next() calls to catch up to that term. 
private boolean needReSeekInNext; - private TermData[] perTypeTermData; + private final TermDataReaderProvider.TermDataReader termDataReader; RandomAccessTermsEnum() throws IOException { termState = (IntBlockTermState) lucene99PostingsReader.newTermState(); fstEnum = new BytesRefFSTEnum<>(termsDict.termsIndex().fst()); - perTypeTermData = termsDict.termDataReader().newPerTypeTermDataReference(); + termDataReader = termsDict.termDataReaderProvider().newReader(); } void updateTermStateIfNeeded() throws IOException { if (!isTermStateCurrent && !needReSeekInNext) { TermsIndex.TypeAndOrd typeAndOrd = TermsIndex.decodeLong(fstSeekState.output); termState = - termsDict - .termDataReader() - .getTermState( - typeAndOrd.termType(), - typeAndOrd.ord(), - fieldInfo.getIndexOptions(), - perTypeTermData); + termDataReader.getTermState( + typeAndOrd.termType(), typeAndOrd.ord(), fieldInfo.getIndexOptions()); isTermStateCurrent = true; } } diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java index b02d4de0cebf..226a4700813c 100644 --- a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestRandomAccessTermsDictWriter.java @@ -138,10 +138,9 @@ public boolean hasPayloads(int fieldNumber) { result.expectedTermAndState()[result.expectedTermAndState().length - 1].term, deserialized.termsStats().maxTerm()); - TermData[] perTypeTermData = deserialized.termDataReader().newPerTypeTermDataReference(); for (var x : result.expectedTermAndState()) { IntBlockTermState expectedState = x.state; - IntBlockTermState actualState = deserialized.getTermState(x.term, perTypeTermData); + IntBlockTermState actualState = deserialized.getTermState(x.term); if 
(expectedState.singletonDocID != -1) { assertEquals(expectedState.singletonDocID, actualState.singletonDocID); } else { From 46b46e64588b1e3a095db51bcc804816876f3c36 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Sat, 25 Nov 2023 10:39:50 -0800 Subject: [PATCH 42/57] Make decode TermState allocation-free --- .../randomaccess/TermStateCodecImpl.java | 52 ++++--------------- 1 file changed, 11 insertions(+), 41 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java index 319c14faafba..adef80cba696 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java @@ -32,8 +32,8 @@ import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermStateCodecComponent.TotalTermFreq; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitPacker; import org.apache.lucene.sandbox.codecs.lucene99.randomaccess.bitpacking.BitUnpacker; -import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.util.BitUtil; import org.apache.lucene.util.BytesRef; final class TermStateCodecImpl implements TermStateCodec { @@ -205,10 +205,8 @@ public IntBlockTermState decodeWithinBlock( BytesRef metadataBytes, BytesRef dataBytes, BitUnpacker bitUnpacker, int index) { assert metadataBytes.length == this.metadataBytesLength; - var metadata = deserializedMetadata(metadataBytes); - - int startBitIndex = index * metadata.totalBitsPerTermState; - return extract(dataBytes, bitUnpacker, startBitIndex, metadata.metadataPerComponent); + int startBitIndex = index * getNumBitsPerRecord(metadataBytes); + return decodeAt(metadataBytes, dataBytes, bitUnpacker, startBitIndex); } @Override @@ 
-216,51 +214,23 @@ public IntBlockTermState decodeAt( BytesRef metadataBytes, BytesRef dataBytes, BitUnpacker bitUnpacker, int startBitIndex) { assert metadataBytes.length == this.metadataBytesLength; - var metadata = deserializedMetadata(metadataBytes); - return extract(dataBytes, bitUnpacker, startBitIndex, metadata.metadataPerComponent); - } + int upto = metadataBytes.offset; + IntBlockTermState decoded = new IntBlockTermState(); - private MetadataAndTotalBitsPerTermState deserializedMetadata(BytesRef metadataBytes) { - Metadata[] metadataPerComponent = new Metadata[components.length]; - ByteArrayDataInput byteArrayDataInput = - new ByteArrayDataInput(metadataBytes.bytes, metadataBytes.offset, metadataBytes.length); - int totalBitsPerTermState = 0; for (int i = 0; i < components.length; i++) { var component = components[i]; - byte bitWidth = byteArrayDataInput.readByte(); - long referenceValue = -1; + int bitWidth = metadataBytes.bytes[upto++]; + long val = bitUnpacker.unpack(dataBytes, startBitIndex, bitWidth); if (component.isMonotonicallyIncreasing()) { - referenceValue = byteArrayDataInput.readLong(); - } - metadataPerComponent[i] = new Metadata(bitWidth, referenceValue); - - totalBitsPerTermState += bitWidth; - } - - return new MetadataAndTotalBitsPerTermState(metadataPerComponent, totalBitsPerTermState); - } - - private IntBlockTermState extract( - BytesRef dataBytes, - BitUnpacker bitUnpacker, - int startBitIndex, - Metadata[] metadataPerComponent) { - IntBlockTermState decoded = new IntBlockTermState(); - for (int i = 0; i < components.length; i++) { - var component = components[i]; - var metadata = metadataPerComponent[i]; - long val = bitUnpacker.unpack(dataBytes, startBitIndex, metadata.bitWidth); - if (metadata.referenceValue > 0) { - val += metadata.referenceValue; + val += (long) BitUtil.VH_LE_LONG.get(metadataBytes.bytes, upto); + upto += 8; } component.setTargetValue(decoded, val); - startBitIndex += metadata.bitWidth; + startBitIndex += 
bitWidth; } + return decoded; } private record Metadata(byte bitWidth, long referenceValue) {} - - private record MetadataAndTotalBitsPerTermState( - Metadata[] metadataPerComponent, int totalBitsPerTermState) {} } From 2c875e7710df1c5a2db1d9da3f8e9f90266ac429 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Sat, 25 Nov 2023 17:23:13 -0800 Subject: [PATCH 43/57] Use ThreadLocal to reuse TermDataReader data objects --- .../randomaccess/TermDataReaderProvider.java | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java index 3572cc90773e..a65e9b1304c5 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java @@ -24,10 +24,25 @@ import org.apache.lucene.store.IndexInput; /** Factory class for {@link TermDataReader} which supports term lookup */ -record TermDataReaderProvider(TermDataProviderAndCodec[] termDataProviderAndCodecs) { +final class TermDataReaderProvider { + private final TermDataProviderAndCodec[] termDataProviderAndCodecs; + + /** TermDataReader can be reused by the same thread */ + private final ThreadLocal termDataReaderReuse; + + TermDataReaderProvider(TermDataProviderAndCodec[] termDataProviderAndCodecs) { + this.termDataProviderAndCodecs = termDataProviderAndCodecs; + termDataReaderReuse = new ThreadLocal<>(); + } TermDataReader newReader() throws IOException { - return new TermDataReader(); + var existingReader = termDataReaderReuse.get(); + if (existingReader != null) { + return existingReader; + } + var newReader = new TermDataReader(); + termDataReaderReuse.set(newReader); + return newReader; } static class Builder { From 
6a71a8124ad8aa8709467c134dbb12bf79343b76 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Sun, 26 Nov 2023 10:37:08 -0800 Subject: [PATCH 44/57] Forked FST.java to work with primtive long Try it with TermsIndexPrimitive and verify basic functionality --- .../randomaccess/TermsIndexPrimitive.java | 56 + .../randomaccess/TestTermsIndexBuilder.java | 16 +- .../lucene/util/fst/PrimitiveLongFST.java | 1329 +++++++++++++++++ 3 files changed, 1400 insertions(+), 1 deletion(-) create mode 100644 lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexPrimitive.java create mode 100644 lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFST.java diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexPrimitive.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexPrimitive.java new file mode 100644 index 000000000000..95e307d786d1 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexPrimitive.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.lucene99.randomaccess; + +import static org.apache.lucene.sandbox.codecs.lucene99.randomaccess.TermsIndex.decodeLong; + +import java.io.IOException; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.fst.OffHeapFSTStore; +import org.apache.lucene.util.fst.PrimitiveLongFST; + +record TermsIndexPrimitive(PrimitiveLongFST primitiveLongFST) { + + TermsIndex.TypeAndOrd getTerm(BytesRef term) throws IOException { + long encoded = PrimitiveLongFST.get(primitiveLongFST, term); + return decodeLong(encoded); + } + + static TermsIndexPrimitive deserialize(DataInput metaIn, DataInput dataIn, boolean loadOffHeap) + throws IOException { + PrimitiveLongFST fst; + if (loadOffHeap) { + var fstStore = new OffHeapFSTStore(); + fst = + new PrimitiveLongFST( + PrimitiveLongFST.readMetadata( + metaIn, PrimitiveLongFST.PrimitiveLongFSTOutputs.getSingleton()), + dataIn.clone(), + fstStore); + dataIn.skipBytes(fstStore.size()); + } else { + fst = + new PrimitiveLongFST( + PrimitiveLongFST.readMetadata( + metaIn, PrimitiveLongFST.PrimitiveLongFSTOutputs.getSingleton()), + dataIn); + } + return new TermsIndexPrimitive(fst); + } +} diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java index 7179c23d1d7e..9528dcd69b0d 100644 --- a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java @@ -20,6 +20,9 @@ import java.io.IOException; import java.util.HashMap; import java.util.Map; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.DataOutput; import 
org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; @@ -49,9 +52,20 @@ public void testBasics() throws IOException { } TermsIndex termsIndex = builder.build(); + byte[] metaBytes = new byte[4096]; + byte[] dataBytes = new byte[4096]; + DataOutput metaOut = new ByteArrayDataOutput(metaBytes); + DataOutput dataOutput = new ByteArrayDataOutput(dataBytes); + + termsIndex.serialize(metaOut, dataOutput); + + TermsIndexPrimitive termsIndexPrimitive = + TermsIndexPrimitive.deserialize( + new ByteArrayDataInput(metaBytes), new ByteArrayDataInput(dataBytes), false); + for (String term : test_terms) { BytesRef termBytes = new BytesRef(term); - TermsIndex.TypeAndOrd typeAndOrd = termsIndex.getTerm(termBytes); + TermsIndex.TypeAndOrd typeAndOrd = termsIndexPrimitive.getTerm(termBytes); assertEquals(termsToType.get(term).intValue(), typeAndOrd.termType().getId()); assertEquals((long) termsToOrd.get(term), typeAndOrd.ord()); diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFST.java b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFST.java new file mode 100644 index 000000000000..c4a188fc58e6 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFST.java @@ -0,0 +1,1329 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util.fst; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.store.ByteBuffersDataOutput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.InputStreamDataInput; +import org.apache.lucene.store.OutputStreamDataOutput; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.Constants; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.fst.FST.BytesReader; +import org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongArc.BitTable; + +/** + * HACK! + * + *

A copy of {@link FST} but remove generics to work with primitive types and avoid + * boxing-unboxing. + * + * @lucene.experimental + */ +public final class PrimitiveLongFST implements Accountable { + + final PrimitiveLongFSTMetadata metadata; + + /** Specifies allowed range of each int input label for this FST. */ + public enum INPUT_TYPE { + BYTE1, + BYTE2, + BYTE4 + } + + private static final long BASE_RAM_BYTES_USED = + RamUsageEstimator.shallowSizeOfInstance(PrimitiveLongFST.class); + + static final int BIT_FINAL_ARC = 1 << 0; + static final int BIT_LAST_ARC = 1 << 1; + static final int BIT_TARGET_NEXT = 1 << 2; + + // TODO: we can free up a bit if we can nuke this: + static final int BIT_STOP_NODE = 1 << 3; + + /** This flag is set if the arc has an output. */ + public static final int BIT_ARC_HAS_OUTPUT = 1 << 4; + + static final int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5; + + /** + * Value of the arc flags to declare a node with fixed length (sparse) arcs designed for binary + * search. + */ + // We use this as a marker because this one flag is illegal by itself. + public static final byte ARCS_FOR_BINARY_SEARCH = BIT_ARC_HAS_FINAL_OUTPUT; + + /** + * Value of the arc flags to declare a node with fixed length dense arcs and bit table designed + * for direct addressing. + */ + static final byte ARCS_FOR_DIRECT_ADDRESSING = 1 << 6; + + /** + * Value of the arc flags to declare a node with continuous arcs designed for pos the arc directly + * with labelToPos - firstLabel. like {@link #ARCS_FOR_BINARY_SEARCH} we use flag combinations + * that will not occur at the same time. 
+ */ + static final byte ARCS_FOR_CONTINUOUS = ARCS_FOR_DIRECT_ADDRESSING + ARCS_FOR_BINARY_SEARCH; + + // Increment version to change it + private static final String FILE_FORMAT_NAME = "FST"; + private static final int VERSION_START = 6; + private static final int VERSION_LITTLE_ENDIAN = 8; + private static final int VERSION_CONTINUOUS_ARCS = 9; + static final int VERSION_CURRENT = VERSION_CONTINUOUS_ARCS; + + // Never serialized; just used to represent the virtual + // final node w/ no arcs: + static final long FINAL_END_NODE = -1; + + // Never serialized; just used to represent the virtual + // non-final node w/ no arcs: + static final long NON_FINAL_END_NODE = 0; + + /** If arc has this label then that arc is final/accepted */ + public static final int END_LABEL = -1; + + /** + * A {@link BytesStore}, used during building, or during reading when the FST is very large (more + * than 1 GB). If the FST is less than 1 GB then bytesArray is set instead. + */ + private final FSTReader fstReader; + + public final PrimitiveLongFSTOutputs outputs; + + /** Represents a single arc. */ + public static final class PrimitiveLongArc { + + // *** Arc fields. + + private int label; + + private long output; + + private long target; + + private byte flags; + + private long nextFinalOutput; + + private long nextArc; + + private byte nodeFlags; + + // *** Fields for arcs belonging to a node with fixed length arcs. + // So only valid when bytesPerArc != 0. + // nodeFlags == ARCS_FOR_BINARY_SEARCH || nodeFlags == ARCS_FOR_DIRECT_ADDRESSING. + + private int bytesPerArc; + + private long posArcsStart; + + private int arcIdx; + + private int numArcs; + + // *** Fields for a direct addressing node. nodeFlags == ARCS_FOR_DIRECT_ADDRESSING. + + /** + * Start position in the {@link BytesReader} of the presence bits for a direct addressing node, + * aka the bit-table + */ + private long bitTableStart; + + /** First label of a direct addressing node. 
*/ + private int firstLabel; + + /** + * Index of the current label of a direct addressing node. While {@link #arcIdx} is the current + * index in the label range, {@link #presenceIndex} is its corresponding index in the list of + * actually present labels. It is equal to the number of bits set before the bit at {@link + * #arcIdx} in the bit-table. This field is a cache to avoid to count bits set repeatedly when + * iterating the next arcs. + */ + private int presenceIndex; + + /** Returns this */ + public PrimitiveLongArc copyFrom(PrimitiveLongArc other) { + label = other.label(); + target = other.target(); + flags = other.flags(); + output = other.output(); + nextFinalOutput = other.nextFinalOutput(); + nextArc = other.nextArc(); + nodeFlags = other.nodeFlags(); + bytesPerArc = other.bytesPerArc(); + + // Fields for arcs belonging to a node with fixed length arcs. + // We could avoid copying them if bytesPerArc() == 0 (this was the case with previous code, + // and the current code + // still supports that), but it may actually help external uses of FST to have consistent arc + // state, and debugging + // is easier. 
+ posArcsStart = other.posArcsStart(); + arcIdx = other.arcIdx(); + numArcs = other.numArcs(); + bitTableStart = other.bitTableStart; + firstLabel = other.firstLabel(); + presenceIndex = other.presenceIndex; + + return this; + } + + boolean flag(int flag) { + return PrimitiveLongFST.flag(flags, flag); + } + + public boolean isLast() { + return flag(BIT_LAST_ARC); + } + + public boolean isFinal() { + return flag(BIT_FINAL_ARC); + } + + @Override + public String toString() { + StringBuilder b = new StringBuilder(); + b.append(" target=").append(target()); + b.append(" label=0x").append(Integer.toHexString(label())); + if (flag(BIT_FINAL_ARC)) { + b.append(" final"); + } + if (flag(BIT_LAST_ARC)) { + b.append(" last"); + } + if (flag(BIT_TARGET_NEXT)) { + b.append(" targetNext"); + } + if (flag(BIT_STOP_NODE)) { + b.append(" stop"); + } + if (flag(BIT_ARC_HAS_OUTPUT)) { + b.append(" output=").append(output()); + } + if (flag(BIT_ARC_HAS_FINAL_OUTPUT)) { + b.append(" nextFinalOutput=").append(nextFinalOutput()); + } + if (bytesPerArc() != 0) { + b.append(" arcArray(idx=") + .append(arcIdx()) + .append(" of ") + .append(numArcs()) + .append(")") + .append("(") + .append( + nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING + ? "da" + : nodeFlags() == ARCS_FOR_CONTINUOUS ? "cs" : "bs") + .append(")"); + } + return b.toString(); + } + + public int label() { + return label; + } + + public long output() { + return output; + } + + /** Ord/address to target node. */ + public long target() { + return target; + } + + public byte flags() { + return flags; + } + + public long nextFinalOutput() { + return nextFinalOutput; + } + + /** + * Address (into the byte[]) of the next arc - only for list of variable length arc. Or + * ord/address to the next node if label == {@link #END_LABEL}. + */ + long nextArc() { + return nextArc; + } + + /** Where we are in the array; only valid if bytesPerArc != 0. */ + public int arcIdx() { + return arcIdx; + } + + /** + * Node header flags. 
Only meaningful to check if the value is either {@link + * #ARCS_FOR_BINARY_SEARCH} or {@link #ARCS_FOR_DIRECT_ADDRESSING} or {@link + * #ARCS_FOR_CONTINUOUS} (other value when bytesPerArc == 0). + */ + public byte nodeFlags() { + return nodeFlags; + } + + /** Where the first arc in the array starts; only valid if bytesPerArc != 0 */ + public long posArcsStart() { + return posArcsStart; + } + + /** + * Non-zero if this arc is part of a node with fixed length arcs, which means all arcs for the + * node are encoded with a fixed number of bytes so that we binary search or direct address. We + * do when there are enough arcs leaving one node. It wastes some bytes but gives faster + * lookups. + */ + public int bytesPerArc() { + return bytesPerArc; + } + + /** + * How many arcs; only valid if bytesPerArc != 0 (fixed length arcs). For a node designed for + * binary search this is the array size. For a node designed for direct addressing, this is the + * label range. + */ + public int numArcs() { + return numArcs; + } + + /** + * First label of a direct addressing node. Only valid if nodeFlags == {@link + * #ARCS_FOR_DIRECT_ADDRESSING} or {@link #ARCS_FOR_CONTINUOUS}. + */ + int firstLabel() { + return firstLabel; + } + + /** + * Helper methods to read the bit-table of a direct addressing node. Only valid for {@link + * PrimitiveLongArc} with {@link PrimitiveLongArc#nodeFlags()} == {@code + * ARCS_FOR_DIRECT_ADDRESSING}. + */ + static class BitTable { + + /** See {@link BitTableUtil#isBitSet(int, BytesReader)}. */ + static boolean isBitSet(int bitIndex, PrimitiveLongArc arc, BytesReader in) + throws IOException { + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + in.setPosition(arc.bitTableStart); + return BitTableUtil.isBitSet(bitIndex, in); + } + + /** + * See {@link BitTableUtil#countBits(int, BytesReader)}. The count of bit set is the number of + * arcs of a direct addressing node. 
+ */ + static int countBits(PrimitiveLongArc arc, BytesReader in) throws IOException { + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + in.setPosition(arc.bitTableStart); + return BitTableUtil.countBits(getNumPresenceBytes(arc.numArcs()), in); + } + + /** See {@link BitTableUtil#countBitsUpTo(int, BytesReader)}. */ + static int countBitsUpTo(int bitIndex, PrimitiveLongArc arc, BytesReader in) + throws IOException { + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + in.setPosition(arc.bitTableStart); + return BitTableUtil.countBitsUpTo(bitIndex, in); + } + + /** See {@link BitTableUtil#nextBitSet(int, int, BytesReader)}. */ + static int nextBitSet(int bitIndex, PrimitiveLongArc arc, BytesReader in) throws IOException { + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + in.setPosition(arc.bitTableStart); + return BitTableUtil.nextBitSet(bitIndex, getNumPresenceBytes(arc.numArcs()), in); + } + + /** See {@link BitTableUtil#previousBitSet(int, BytesReader)}. */ + static int previousBitSet(int bitIndex, PrimitiveLongArc arc, BytesReader in) + throws IOException { + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + in.setPosition(arc.bitTableStart); + return BitTableUtil.previousBitSet(bitIndex, in); + } + + /** Asserts the bit-table of the provided {@link PrimitiveLongArc} is valid. */ + static boolean assertIsValid(PrimitiveLongArc arc, BytesReader in) throws IOException { + assert arc.bytesPerArc() > 0; + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + // First bit must be set. + assert isBitSet(0, arc, in); + // Last bit must be set. + assert isBitSet(arc.numArcs() - 1, arc, in); + // No bit set after the last arc. + assert nextBitSet(arc.numArcs() - 1, arc, in) == -1; + return true; + } + } + } + + private static boolean flag(int flags, int bit) { + return (flags & bit) != 0; + } + + private static final int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 
30 : 28; + + /** + * Load a previously saved FST with a DataInput for metdata using an {@link OnHeapFSTStore} with + * maxBlockBits set to {@link #DEFAULT_MAX_BLOCK_BITS} + */ + public PrimitiveLongFST(PrimitiveLongFSTMetadata metadata, DataInput in) throws IOException { + this(metadata, in, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS)); + } + + /** + * Load a previously saved FST with a metdata object and a FSTStore. If using {@link + * OnHeapFSTStore}, setting maxBlockBits allows you to control the size of the byte[] pages used + * to hold the FST bytes. + */ + public PrimitiveLongFST(PrimitiveLongFSTMetadata metadata, DataInput in, FSTStore fstStore) + throws IOException { + this(metadata, fstStore.init(in, metadata.numBytes)); + } + + /** Create the FST with a metadata object and a FSTReader. */ + PrimitiveLongFST(PrimitiveLongFSTMetadata metadata, FSTReader fstReader) { + this.metadata = metadata; + this.outputs = metadata.outputs; + this.fstReader = fstReader; + } + + /** + * Read the FST metadata from DataInput + * + * @param metaIn the DataInput of the metadata + * @param outputs the FST outputs + * @return the FST metadata + * @throws IOException if exception occurred during parsing + */ + public static PrimitiveLongFSTMetadata readMetadata( + DataInput metaIn, PrimitiveLongFSTOutputs outputs) throws IOException { + // NOTE: only reads formats VERSION_START up to VERSION_CURRENT; we don't have + // back-compat promise for FSTs (they are experimental), but we are sometimes able to offer it + int version = CodecUtil.checkHeader(metaIn, FILE_FORMAT_NAME, VERSION_START, VERSION_CURRENT); + Long emptyOutput; + if (metaIn.readByte() == 1) { + // accepts empty string + // 1 KB blocks: + BytesStore emptyBytes = new BytesStore(10); + int numBytes = metaIn.readVInt(); + emptyBytes.copyBytes(metaIn, numBytes); + + // De-serialize empty-string output: + BytesReader reader = emptyBytes.getReverseBytesReader(); + // NoOutputs uses 0 bytes when writing its output, + // so 
we have to check here else BytesStore gets + // angry: + if (numBytes > 0) { + reader.setPosition(numBytes - 1); + } + emptyOutput = outputs.readFinalOutput(reader); + } else { + emptyOutput = null; + } + INPUT_TYPE inputType; + final byte t = metaIn.readByte(); + switch (t) { + case 0: + inputType = INPUT_TYPE.BYTE1; + break; + case 1: + inputType = INPUT_TYPE.BYTE2; + break; + case 2: + inputType = INPUT_TYPE.BYTE4; + break; + default: + throw new CorruptIndexException("invalid input type " + t, metaIn); + } + long startNode = metaIn.readVLong(); + long numBytes = metaIn.readVLong(); + return new PrimitiveLongFSTMetadata( + inputType, outputs, emptyOutput, startNode, version, numBytes); + } + + @Override + public long ramBytesUsed() { + return BASE_RAM_BYTES_USED + fstReader.ramBytesUsed(); + } + + @Override + public String toString() { + return getClass().getSimpleName() + "(input=" + metadata.inputType + ",output=" + outputs; + } + + public long numBytes() { + return metadata.numBytes; + } + + public long getEmptyOutput() { + return metadata.emptyOutput.longValue(); + } + + public PrimitiveLongFSTMetadata getMetadata() { + return metadata; + } + + public void save(DataOutput metaOut, DataOutput out) throws IOException { + saveMetadata(metaOut); + fstReader.writeTo(out); + } + + /** + * Save the metadata to a DataOutput + * + * @param metaOut the DataOutput to save + */ + public void saveMetadata(DataOutput metaOut) throws IOException { + CodecUtil.writeHeader(metaOut, FILE_FORMAT_NAME, VERSION_CURRENT); + + // Accepts empty string + metaOut.writeByte((byte) 1); + + if (metadata.emptyOutput != null) { + // Serialize empty-string output: + ByteBuffersDataOutput ros = new ByteBuffersDataOutput(); + outputs.writeFinalOutput(metadata.emptyOutput.longValue(), ros); + byte[] emptyOutputBytes = ros.toArrayCopy(); + int emptyLen = emptyOutputBytes.length; + + // reverse + final int stopAt = emptyLen / 2; + int upto = 0; + while (upto < stopAt) { + final byte b = 
emptyOutputBytes[upto]; + emptyOutputBytes[upto] = emptyOutputBytes[emptyLen - upto - 1]; + emptyOutputBytes[emptyLen - upto - 1] = b; + upto++; + } + metaOut.writeVInt(emptyLen); + metaOut.writeBytes(emptyOutputBytes, 0, emptyLen); + } else { + metaOut.writeByte((byte) 0); + } + + final byte t; + if (metadata.inputType == INPUT_TYPE.BYTE1) { + t = 0; + } else if (metadata.inputType == INPUT_TYPE.BYTE2) { + t = 1; + } else { + t = 2; + } + metaOut.writeByte(t); + metaOut.writeVLong(metadata.startNode); + metaOut.writeVLong(numBytes()); + } + + /** Writes an automaton to a file. */ + public void save(final Path path) throws IOException { + try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(path))) { + DataOutput out = new OutputStreamDataOutput(os); + save(out, out); + } + } + + /** Reads an automaton from a file. */ + public static PrimitiveLongFST read(Path path, PrimitiveLongFSTOutputs outputs) + throws IOException { + try (InputStream is = Files.newInputStream(path)) { + DataInput in = new InputStreamDataInput(new BufferedInputStream(is)); + return new PrimitiveLongFST(readMetadata(in, outputs), in); + } + } + + /** Reads one BYTE1/2/4 label from the provided {@link DataInput}. */ + public int readLabel(DataInput in) throws IOException { + final int v; + if (metadata.inputType == INPUT_TYPE.BYTE1) { + // Unsigned byte: + v = in.readByte() & 0xFF; + } else if (metadata.inputType == INPUT_TYPE.BYTE2) { + // Unsigned short: + if (metadata.version < VERSION_LITTLE_ENDIAN) { + v = Short.reverseBytes(in.readShort()) & 0xFFFF; + } else { + v = in.readShort() & 0xFFFF; + } + } else { + v = in.readVInt(); + } + return v; + } + + /** returns true if the node at this address has any outgoing arcs */ + public static boolean targetHasArcs(PrimitiveLongArc arc) { + return arc.target() > 0; + } + + /** + * Gets the number of bytes required to flag the presence of each arc in the given label range, + * one bit per arc. 
+ */ + static int getNumPresenceBytes(int labelRange) { + assert labelRange >= 0; + return (labelRange + 7) >> 3; + } + + /** + * Reads the presence bits of a direct-addressing node. Actually we don't read them here, we just + * keep the pointer to the bit-table start and we skip them. + */ + private void readPresenceBytes(PrimitiveLongArc arc, BytesReader in) throws IOException { + assert arc.bytesPerArc() > 0; + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + arc.bitTableStart = in.getPosition(); + in.skipBytes(getNumPresenceBytes(arc.numArcs())); + } + + /** Fills virtual 'start' arc, ie, an empty incoming arc to the FST's start node */ + public PrimitiveLongArc getFirstArc(PrimitiveLongArc arc) { + long NO_OUTPUT = outputs.getNoOutput(); + + arc.flags = BIT_FINAL_ARC | BIT_LAST_ARC; + if (metadata.emptyOutput != null) { + arc.nextFinalOutput = metadata.emptyOutput.longValue(); + } + if (metadata.emptyOutput != null && metadata.emptyOutput.longValue() != NO_OUTPUT) { + arc.flags = (byte) (arc.flags() | BIT_ARC_HAS_FINAL_OUTPUT); + } + + arc.output = NO_OUTPUT; + + // If there are no nodes, ie, the FST only accepts the + // empty string, then startNode is 0 + arc.target = metadata.startNode; + return arc; + } + + /** + * Follows the follow arc and reads the last arc of its target; this changes the + * provided arc (2nd arg) in-place and returns it. + * + * @return Returns the second argument (arc). 
+ */ + PrimitiveLongArc readLastTargetArc(PrimitiveLongArc follow, PrimitiveLongArc arc, BytesReader in) + throws IOException { + // System.out.println("readLast"); + if (!targetHasArcs(follow)) { + // System.out.println(" end node"); + assert follow.isFinal(); + arc.label = END_LABEL; + arc.target = FINAL_END_NODE; + arc.output = follow.nextFinalOutput(); + arc.flags = BIT_LAST_ARC; + arc.nodeFlags = arc.flags; + return arc; + } else { + in.setPosition(follow.target()); + byte flags = arc.nodeFlags = in.readByte(); + if (flags == ARCS_FOR_BINARY_SEARCH + || flags == ARCS_FOR_DIRECT_ADDRESSING + || flags == ARCS_FOR_CONTINUOUS) { + // Special arc which is actually a node header for fixed length arcs. + // Jump straight to end to find the last arc. + arc.numArcs = in.readVInt(); + arc.bytesPerArc = in.readVInt(); + // System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc); + if (flags == ARCS_FOR_DIRECT_ADDRESSING) { + readPresenceBytes(arc, in); + arc.firstLabel = readLabel(in); + arc.posArcsStart = in.getPosition(); + readLastArcByDirectAddressing(arc, in); + } else if (flags == ARCS_FOR_BINARY_SEARCH) { + arc.arcIdx = arc.numArcs() - 2; + arc.posArcsStart = in.getPosition(); + readNextRealArc(arc, in); + } else { + arc.firstLabel = readLabel(in); + arc.posArcsStart = in.getPosition(); + readLastArcByContinuous(arc, in); + } + } else { + arc.flags = flags; + // non-array: linear scan + arc.bytesPerArc = 0; + // System.out.println(" scan"); + while (!arc.isLast()) { + // skip this arc: + readLabel(in); + if (arc.flag(BIT_ARC_HAS_OUTPUT)) { + outputs.skipOutput(in); + } + if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { + outputs.skipFinalOutput(in); + } + if (arc.flag(BIT_STOP_NODE)) { + } else if (arc.flag(BIT_TARGET_NEXT)) { + } else { + readUnpackedNodeTarget(in); + } + arc.flags = in.readByte(); + } + // Undo the byte flags we read: + in.skipBytes(-1); + arc.nextArc = in.getPosition(); + readNextRealArc(arc, in); + } + assert arc.isLast(); + 
return arc; + } + } + + private long readUnpackedNodeTarget(BytesReader in) throws IOException { + return in.readVLong(); + } + + /** + * Follow the follow arc and read the first arc of its target; this changes the + * provided arc (2nd arg) in-place and returns it. + * + * @return Returns the second argument (arc). + */ + public PrimitiveLongArc readFirstTargetArc( + PrimitiveLongArc follow, PrimitiveLongArc arc, BytesReader in) throws IOException { + // int pos = address; + // System.out.println(" readFirstTarget follow.target=" + follow.target + " isFinal=" + + // follow.isFinal()); + if (follow.isFinal()) { + // Insert "fake" final first arc: + arc.label = END_LABEL; + arc.output = follow.nextFinalOutput(); + arc.flags = BIT_FINAL_ARC; + if (follow.target() <= 0) { + arc.flags |= BIT_LAST_ARC; + } else { + // NOTE: nextArc is a node (not an address!) in this case: + arc.nextArc = follow.target(); + } + arc.target = FINAL_END_NODE; + arc.nodeFlags = arc.flags; + // System.out.println(" insert isFinal; nextArc=" + follow.target + " isLast=" + + // arc.isLast() + " output=" + outputs.outputToString(arc.output)); + return arc; + } else { + return readFirstRealTargetArc(follow.target(), arc, in); + } + } + + private void readFirstArcInfo(long nodeAddress, PrimitiveLongArc arc, final BytesReader in) + throws IOException { + in.setPosition(nodeAddress); + + byte flags = arc.nodeFlags = in.readByte(); + if (flags == ARCS_FOR_BINARY_SEARCH + || flags == ARCS_FOR_DIRECT_ADDRESSING + || flags == ARCS_FOR_CONTINUOUS) { + // Special arc which is actually a node header for fixed length arcs. 
+ arc.numArcs = in.readVInt(); + arc.bytesPerArc = in.readVInt(); + arc.arcIdx = -1; + if (flags == ARCS_FOR_DIRECT_ADDRESSING) { + readPresenceBytes(arc, in); + arc.firstLabel = readLabel(in); + arc.presenceIndex = -1; + } else if (flags == ARCS_FOR_CONTINUOUS) { + arc.firstLabel = readLabel(in); + } + arc.posArcsStart = in.getPosition(); + } else { + arc.nextArc = nodeAddress; + arc.bytesPerArc = 0; + } + } + + public PrimitiveLongArc readFirstRealTargetArc( + long nodeAddress, PrimitiveLongArc arc, final BytesReader in) throws IOException { + readFirstArcInfo(nodeAddress, arc, in); + return readNextRealArc(arc, in); + } + + /** + * Returns whether arc's target points to a node in expanded format (fixed length + * arcs). + */ + boolean isExpandedTarget(PrimitiveLongArc follow, BytesReader in) throws IOException { + if (!targetHasArcs(follow)) { + return false; + } else { + in.setPosition(follow.target()); + byte flags = in.readByte(); + return flags == ARCS_FOR_BINARY_SEARCH + || flags == ARCS_FOR_DIRECT_ADDRESSING + || flags == ARCS_FOR_CONTINUOUS; + } + } + + /** In-place read; returns the arc. */ + public PrimitiveLongArc readNextArc(PrimitiveLongArc arc, BytesReader in) throws IOException { + if (arc.label() == END_LABEL) { + // This was a fake inserted "final" arc + if (arc.nextArc() <= 0) { + throw new IllegalArgumentException("cannot readNextArc when arc.isLast()=true"); + } + return readFirstRealTargetArc(arc.nextArc(), arc, in); + } else { + return readNextRealArc(arc, in); + } + } + + /** Peeks at next arc's label; does not alter arc. Do not call this if arc.isLast()! */ + int readNextArcLabel(PrimitiveLongArc arc, BytesReader in) throws IOException { + assert !arc.isLast(); + + if (arc.label() == END_LABEL) { + // System.out.println(" nextArc fake " + arc.nextArc); + // Next arc is the first arc of a node. + // Position to read the first arc label. 
+ + in.setPosition(arc.nextArc()); + byte flags = in.readByte(); + if (flags == ARCS_FOR_BINARY_SEARCH + || flags == ARCS_FOR_DIRECT_ADDRESSING + || flags == ARCS_FOR_CONTINUOUS) { + // System.out.println(" nextArc fixed length arc"); + // Special arc which is actually a node header for fixed length arcs. + int numArcs = in.readVInt(); + in.readVInt(); // Skip bytesPerArc. + if (flags == ARCS_FOR_BINARY_SEARCH) { + in.readByte(); // Skip arc flags. + } else if (flags == ARCS_FOR_DIRECT_ADDRESSING) { + in.skipBytes(getNumPresenceBytes(numArcs)); + } // Nothing to do for ARCS_FOR_CONTINUOUS + } + } else { + switch (arc.nodeFlags()) { + case ARCS_FOR_BINARY_SEARCH: + // Point to next arc, -1 to skip arc flags. + in.setPosition(arc.posArcsStart() - (1 + arc.arcIdx()) * (long) arc.bytesPerArc() - 1); + break; + case ARCS_FOR_DIRECT_ADDRESSING: + // Direct addressing node. The label is not stored but rather inferred + // based on first label and arc index in the range. + assert BitTable.assertIsValid(arc, in); + assert BitTable.isBitSet(arc.arcIdx(), arc, in); + int nextIndex = BitTable.nextBitSet(arc.arcIdx(), arc, in); + assert nextIndex != -1; + return arc.firstLabel() + nextIndex; + case ARCS_FOR_CONTINUOUS: + return arc.firstLabel() + arc.arcIdx() + 1; + default: + // Variable length arcs - linear search. + assert arc.bytesPerArc() == 0; + // Arcs have variable length. + // System.out.println(" nextArc real list"); + // Position to next arc, -1 to skip flags. 
+ in.setPosition(arc.nextArc() - 1); + break; + } + } + return readLabel(in); + } + + public PrimitiveLongArc readArcByIndex(PrimitiveLongArc arc, final BytesReader in, int idx) + throws IOException { + assert arc.bytesPerArc() > 0; + assert arc.nodeFlags() == ARCS_FOR_BINARY_SEARCH; + assert idx >= 0 && idx < arc.numArcs(); + in.setPosition(arc.posArcsStart() - idx * (long) arc.bytesPerArc()); + arc.arcIdx = idx; + arc.flags = in.readByte(); + return readArc(arc, in); + } + + /** + * Reads a Continuous node arc, with the provided index in the label range. + * + * @param rangeIndex The index of the arc in the label range. It must be within the label range. + */ + public PrimitiveLongArc readArcByContinuous( + PrimitiveLongArc arc, final BytesReader in, int rangeIndex) throws IOException { + assert rangeIndex >= 0 && rangeIndex < arc.numArcs(); + in.setPosition(arc.posArcsStart() - rangeIndex * (long) arc.bytesPerArc()); + arc.arcIdx = rangeIndex; + arc.flags = in.readByte(); + return readArc(arc, in); + } + + /** + * Reads a present direct addressing node arc, with the provided index in the label range. + * + * @param rangeIndex The index of the arc in the label range. It must be present. The real arc + * offset is computed based on the presence bits of the direct addressing node. + */ + public PrimitiveLongArc readArcByDirectAddressing( + PrimitiveLongArc arc, final BytesReader in, int rangeIndex) throws IOException { + assert BitTable.assertIsValid(arc, in); + assert rangeIndex >= 0 && rangeIndex < arc.numArcs(); + assert BitTable.isBitSet(rangeIndex, arc, in); + int presenceIndex = BitTable.countBitsUpTo(rangeIndex, arc, in); + return readArcByDirectAddressing(arc, in, rangeIndex, presenceIndex); + } + + /** + * Reads a present direct addressing node arc, with the provided index in the label range and its + * corresponding presence index (which is the count of presence bits before it). 
+ */ + private PrimitiveLongArc readArcByDirectAddressing( + PrimitiveLongArc arc, final BytesReader in, int rangeIndex, int presenceIndex) + throws IOException { + in.setPosition(arc.posArcsStart() - presenceIndex * (long) arc.bytesPerArc()); + arc.arcIdx = rangeIndex; + arc.presenceIndex = presenceIndex; + arc.flags = in.readByte(); + return readArc(arc, in); + } + + /** + * Reads the last arc of a direct addressing node. This method is equivalent to call {@link + * #readArcByDirectAddressing(PrimitiveLongArc, BytesReader, int)} with {@code rangeIndex} equal + * to {@code arc.numArcs() - 1}, but it is faster. + */ + public PrimitiveLongArc readLastArcByDirectAddressing(PrimitiveLongArc arc, final BytesReader in) + throws IOException { + assert BitTable.assertIsValid(arc, in); + int presenceIndex = BitTable.countBits(arc, in) - 1; + return readArcByDirectAddressing(arc, in, arc.numArcs() - 1, presenceIndex); + } + + /** Reads the last arc of a continuous node. */ + public PrimitiveLongArc readLastArcByContinuous(PrimitiveLongArc arc, final BytesReader in) + throws IOException { + return readArcByContinuous(arc, in, arc.numArcs() - 1); + } + + /** Never returns null, but you should never call this if arc.isLast() is true. 
*/ + public PrimitiveLongArc readNextRealArc(PrimitiveLongArc arc, final BytesReader in) + throws IOException { + + // TODO: can't assert this because we call from readFirstArc + // assert !flag(arc.flags, BIT_LAST_ARC); + + switch (arc.nodeFlags()) { + case ARCS_FOR_BINARY_SEARCH: + case ARCS_FOR_CONTINUOUS: + assert arc.bytesPerArc() > 0; + arc.arcIdx++; + assert arc.arcIdx() >= 0 && arc.arcIdx() < arc.numArcs(); + in.setPosition(arc.posArcsStart() - arc.arcIdx() * (long) arc.bytesPerArc()); + arc.flags = in.readByte(); + break; + + case ARCS_FOR_DIRECT_ADDRESSING: + assert BitTable.assertIsValid(arc, in); + assert arc.arcIdx() == -1 || BitTable.isBitSet(arc.arcIdx(), arc, in); + int nextIndex = BitTable.nextBitSet(arc.arcIdx(), arc, in); + return readArcByDirectAddressing(arc, in, nextIndex, arc.presenceIndex + 1); + + default: + // Variable length arcs - linear search. + assert arc.bytesPerArc() == 0; + in.setPosition(arc.nextArc()); + arc.flags = in.readByte(); + } + return readArc(arc, in); + } + + /** + * Reads an arc.
+ * Precondition: The arc flags byte has already been read and set; the given BytesReader is + * positioned just after the arc flags byte. + */ + private PrimitiveLongArc readArc(PrimitiveLongArc arc, BytesReader in) throws IOException { + if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING || arc.nodeFlags() == ARCS_FOR_CONTINUOUS) { + arc.label = arc.firstLabel() + arc.arcIdx(); + } else { + arc.label = readLabel(in); + } + + if (arc.flag(BIT_ARC_HAS_OUTPUT)) { + arc.output = outputs.read(in); + } else { + arc.output = outputs.getNoOutput(); + } + + if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { + arc.nextFinalOutput = outputs.readFinalOutput(in); + } else { + arc.nextFinalOutput = outputs.getNoOutput(); + } + + if (arc.flag(BIT_STOP_NODE)) { + if (arc.flag(BIT_FINAL_ARC)) { + arc.target = FINAL_END_NODE; + } else { + arc.target = NON_FINAL_END_NODE; + } + arc.nextArc = in.getPosition(); // Only useful for list. + } else if (arc.flag(BIT_TARGET_NEXT)) { + arc.nextArc = in.getPosition(); // Only useful for list. + // TODO: would be nice to make this lazy -- maybe + // caller doesn't need the target and is scanning arcs... + if (!arc.flag(BIT_LAST_ARC)) { + if (arc.bytesPerArc() == 0) { + // must scan + seekToNextNode(in); + } else { + int numArcs = + arc.nodeFlags == ARCS_FOR_DIRECT_ADDRESSING + ? BitTable.countBits(arc, in) + : arc.numArcs(); + in.setPosition(arc.posArcsStart() - arc.bytesPerArc() * (long) numArcs); + } + } + arc.target = in.getPosition(); + } else { + arc.target = readUnpackedNodeTarget(in); + arc.nextArc = in.getPosition(); // Only useful for list. + } + return arc; + } + + static PrimitiveLongArc readEndArc(PrimitiveLongArc follow, PrimitiveLongArc arc) { + if (follow.isFinal()) { + if (follow.target() <= 0) { + arc.flags = PrimitiveLongFST.BIT_LAST_ARC; + } else { + arc.flags = 0; + // NOTE: nextArc is a node (not an address!) 
in this case: + arc.nextArc = follow.target(); + } + arc.output = follow.nextFinalOutput(); + arc.label = PrimitiveLongFST.END_LABEL; + return arc; + } else { + return null; + } + } + + // TODO: could we somehow [partially] tableize arc lookups + // like automaton? + + /** + * Finds an arc leaving the incoming arc, replacing the arc in place. This returns null if the arc + * was not found, else the incoming arc. + */ + public PrimitiveLongArc findTargetArc( + int labelToMatch, PrimitiveLongArc follow, PrimitiveLongArc arc, BytesReader in) + throws IOException { + + if (labelToMatch == END_LABEL) { + if (follow.isFinal()) { + if (follow.target() <= 0) { + arc.flags = BIT_LAST_ARC; + } else { + arc.flags = 0; + // NOTE: nextArc is a node (not an address!) in this case: + arc.nextArc = follow.target(); + } + arc.output = follow.nextFinalOutput(); + arc.label = END_LABEL; + arc.nodeFlags = arc.flags; + return arc; + } else { + return null; + } + } + + if (!targetHasArcs(follow)) { + return null; + } + + in.setPosition(follow.target()); + + // System.out.println("fta label=" + (char) labelToMatch); + + byte flags = arc.nodeFlags = in.readByte(); + if (flags == ARCS_FOR_DIRECT_ADDRESSING) { + arc.numArcs = in.readVInt(); // This is in fact the label range. + arc.bytesPerArc = in.readVInt(); + readPresenceBytes(arc, in); + arc.firstLabel = readLabel(in); + arc.posArcsStart = in.getPosition(); + + int arcIndex = labelToMatch - arc.firstLabel(); + if (arcIndex < 0 || arcIndex >= arc.numArcs()) { + return null; // Before or after label range. + } else if (!BitTable.isBitSet(arcIndex, arc, in)) { + return null; // Arc missing in the range. 
+ } + return readArcByDirectAddressing(arc, in, arcIndex); + } else if (flags == ARCS_FOR_BINARY_SEARCH) { + arc.numArcs = in.readVInt(); + arc.bytesPerArc = in.readVInt(); + arc.posArcsStart = in.getPosition(); + + // Array is sparse; do binary search: + int low = 0; + int high = arc.numArcs() - 1; + while (low <= high) { + // System.out.println(" cycle"); + int mid = (low + high) >>> 1; + // +1 to skip over flags + in.setPosition(arc.posArcsStart() - (arc.bytesPerArc() * mid + 1)); + int midLabel = readLabel(in); + final int cmp = midLabel - labelToMatch; + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + arc.arcIdx = mid - 1; + // System.out.println(" found!"); + return readNextRealArc(arc, in); + } + } + return null; + } else if (flags == ARCS_FOR_CONTINUOUS) { + arc.numArcs = in.readVInt(); + arc.bytesPerArc = in.readVInt(); + arc.firstLabel = readLabel(in); + arc.posArcsStart = in.getPosition(); + int arcIndex = labelToMatch - arc.firstLabel(); + if (arcIndex < 0 || arcIndex >= arc.numArcs()) { + return null; // Before or after label range. 
+ } + arc.arcIdx = arcIndex - 1; + return readNextRealArc(arc, in); + } + + // Linear scan + readFirstArcInfo(follow.target(), arc, in); + in.setPosition(arc.nextArc()); + while (true) { + assert arc.bytesPerArc() == 0; + flags = arc.flags = in.readByte(); + long pos = in.getPosition(); + int label = readLabel(in); + if (label == labelToMatch) { + in.setPosition(pos); + return readArc(arc, in); + } else if (label > labelToMatch) { + return null; + } else if (arc.isLast()) { + return null; + } else { + if (flag(flags, BIT_ARC_HAS_OUTPUT)) { + outputs.skipOutput(in); + } + if (flag(flags, BIT_ARC_HAS_FINAL_OUTPUT)) { + outputs.skipFinalOutput(in); + } + if (flag(flags, BIT_STOP_NODE) == false && flag(flags, BIT_TARGET_NEXT) == false) { + readUnpackedNodeTarget(in); + } + } + } + } + + private void seekToNextNode(BytesReader in) throws IOException { + + while (true) { + + final int flags = in.readByte(); + readLabel(in); + + if (flag(flags, BIT_ARC_HAS_OUTPUT)) { + outputs.skipOutput(in); + } + + if (flag(flags, BIT_ARC_HAS_FINAL_OUTPUT)) { + outputs.skipFinalOutput(in); + } + + if (flag(flags, BIT_STOP_NODE) == false && flag(flags, BIT_TARGET_NEXT) == false) { + readUnpackedNodeTarget(in); + } + + if (flag(flags, BIT_LAST_ARC)) { + return; + } + } + } + + /** Returns a {@link BytesReader} for this FST, positioned at position 0. 
*/ + public BytesReader getBytesReader() { + return fstReader.getReverseBytesReader(); + } + + /** Represent the FST metadata */ + public static final class PrimitiveLongFSTMetadata { + final INPUT_TYPE inputType; + final PrimitiveLongFSTOutputs outputs; + final int version; + // if non-null, this FST accepts the empty string and + // produces this output + Long emptyOutput; + long startNode; + long numBytes; + + public PrimitiveLongFSTMetadata( + INPUT_TYPE inputType, + PrimitiveLongFSTOutputs outputs, + Long emptyOutput, + long startNode, + int version, + long numBytes) { + this.inputType = inputType; + this.outputs = outputs; + this.emptyOutput = emptyOutput; + this.startNode = startNode; + this.version = version; + this.numBytes = numBytes; + } + } + + public static class PrimitiveLongFSTOutputs { + + private static final long NO_OUTPUT = 0L; + + private static final PrimitiveLongFSTOutputs singleton = new PrimitiveLongFSTOutputs(); + + private PrimitiveLongFSTOutputs() {} + + public static PrimitiveLongFSTOutputs getSingleton() { + return singleton; + } + + public long common(long output1, long output2) { + assert valid(output1); + assert valid(output2); + if (output1 == NO_OUTPUT || output2 == NO_OUTPUT) { + return NO_OUTPUT; + } else { + assert output1 > 0; + assert output2 > 0; + return Math.min(output1, output2); + } + } + + public long subtract(long output, long inc) { + assert valid(output); + assert valid(inc); + assert output >= inc; + + if (inc == NO_OUTPUT) { + return output; + } else if (output == inc) { + return NO_OUTPUT; + } else { + return output - inc; + } + } + + public long add(long prefix, long output) { + assert valid(prefix); + assert valid(output); + if (prefix == NO_OUTPUT) { + return output; + } else if (output == NO_OUTPUT) { + return prefix; + } else { + return prefix + output; + } + } + + public void write(long output, DataOutput out) throws IOException { + assert valid(output); + out.writeVLong(output); + } + + public long 
read(DataInput in) throws IOException { + long v = in.readVLong(); + if (v == 0) { + return NO_OUTPUT; + } else { + return v; + } + } + + private boolean valid(long o) { + assert o == NO_OUTPUT || o > 0 : "o=" + o; + return true; + } + + public long getNoOutput() { + return NO_OUTPUT; + } + + public String outputToString(long output) { + return Long.toString(output); + } + + public String toString() { + return "PrimitiveLongFSTOutputs"; + } + + public long ramBytesUsed(Long output) { + return RamUsageEstimator.sizeOf(output); + } + + public void skipOutput(BytesReader in) throws IOException { + read(in); + } + + public void skipFinalOutput(BytesReader in) throws IOException { + read(in); + } + + public long readFinalOutput(BytesReader in) throws IOException { + return read(in); + } + + public void writeFinalOutput(long output, DataOutput out) throws IOException { + write(output, out); + } + } + + public static long get(PrimitiveLongFST primitiveLongFST, BytesRef input) throws IOException { + assert primitiveLongFST.metadata.inputType == PrimitiveLongFST.INPUT_TYPE.BYTE1; + + final BytesReader fstReader = primitiveLongFST.getBytesReader(); + + // TODO: would be nice not to alloc this on every lookup + final PrimitiveLongArc arc = primitiveLongFST.getFirstArc(new PrimitiveLongArc()); + + // Accumulate output as we go + long output = primitiveLongFST.outputs.getNoOutput(); + for (int i = 0; i < input.length; i++) { + if (primitiveLongFST.findTargetArc(input.bytes[i + input.offset] & 0xFF, arc, arc, fstReader) + == null) { + return -1; + } + output = primitiveLongFST.outputs.add(output, arc.output()); + } + + if (arc.isFinal()) { + return primitiveLongFST.outputs.add(output, arc.nextFinalOutput()); + } else { + return -1; + } + } +} From e06665ece54e28e199d0d39dba9b1638a52da30e Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Sun, 26 Nov 2023 10:42:04 -0800 Subject: [PATCH 45/57] Allocate only one set of buffers in TermDataReader instead of one set per type --- 
.../codecs/lucene99/randomaccess/TermData.java | 5 +++-- .../randomaccess/TermDataReaderProvider.java | 17 +++++++++-------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java index c72bef50451e..1b9a8c7406d8 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java @@ -53,8 +53,9 @@ IntBlockTermState getTermStateWithBuffer( long metadataStartPos = blockId * (codec.getMetadataBytesLength() + 8); long dataStartPos = metadata.getLong(metadataStartPos); - metadata.readBytesTo(metaDataBuffer, metadataStartPos + 8, codec.getMetadataBytesLength()); - BytesRef metadataBytesRef = new BytesRef(metaDataBuffer); + int metadataLength = codec.getMetadataBytesLength(); + metadata.readBytesTo(metaDataBuffer, metadataStartPos + 8, metadataLength); + BytesRef metadataBytesRef = new BytesRef(metaDataBuffer, 0, metadataLength); int numBitsPerRecord = codec.getNumBitsPerRecord(metadataBytesRef); int dataBitIndex = numBitsPerRecord * ((int) (ord % TermDataWriter.NUM_TERMS_PER_BLOCK)); diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java index a65e9b1304c5..45ba2b00b7c4 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java @@ -76,14 +76,14 @@ record TermDataProviderAndCodec(TermDataProvider termDataProvider, TermStateCode public class TermDataReader { private final TermData[] termDataPerType; - private 
final byte[][] metaDataBufferPerType; + private final byte[] metaDataBuffer; - private final byte[][] dataBufferPerType; + private final byte[] dataBuffer; TermDataReader() throws IOException { termDataPerType = new TermData[termDataProviderAndCodecs.length]; - metaDataBufferPerType = new byte[termDataProviderAndCodecs.length][]; - dataBufferPerType = new byte[termDataProviderAndCodecs.length][]; + int maxMetadataLengthSeen = 0; + int maxDataLengthSeen = 0; for (int i = 0; i < termDataProviderAndCodecs.length; i++) { if (termDataProviderAndCodecs[i] == null) { @@ -95,9 +95,11 @@ public class TermDataReader { new TermData( termDataProvider.metadataProvider().newByteSlice(), termDataProvider.dataProvider().newByteSlice()); - metaDataBufferPerType[i] = new byte[codec.getMetadataBytesLength()]; - dataBufferPerType[i] = new byte[codec.getMaximumRecordSizeInBytes()]; + maxMetadataLengthSeen = Math.max(maxDataLengthSeen, codec.getMetadataBytesLength()); + maxDataLengthSeen = Math.max(maxMetadataLengthSeen, codec.getMaximumRecordSizeInBytes()); } + metaDataBuffer = new byte[maxMetadataLengthSeen]; + dataBuffer = new byte[maxDataLengthSeen]; } IntBlockTermState getTermState(TermType termType, long ord, IndexOptions indexOptions) @@ -108,8 +110,7 @@ IntBlockTermState getTermState(TermType termType, long ord, IndexOptions indexOp int typeId = termType.getId(); var codec = termDataProviderAndCodecs[termType.getId()].codec; IntBlockTermState termState = - termDataPerType[typeId].getTermStateWithBuffer( - codec, ord, metaDataBufferPerType[typeId], dataBufferPerType[typeId]); + termDataPerType[typeId].getTermStateWithBuffer(codec, ord, metaDataBuffer, dataBuffer); // need to filling some default values for the term state // in order to meet the expectations of the postings reader From 35af1d2cc969f95879966a9c2c5d37a4d276cbfa Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Sun, 26 Nov 2023 11:00:32 -0800 Subject: [PATCH 46/57] Make TermDataReader lazily init its buffer and clone 
 IndexInput

---
 .../randomaccess/TermDataReaderProvider.java | 59 +++++++++++--------
 1 file changed, 35 insertions(+), 24 deletions(-)

diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java
index 45ba2b00b7c4..633c44cef09f 100644
--- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java
+++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java
@@ -74,43 +74,54 @@ TermDataReaderProvider build() {
   record TermDataProviderAndCodec(TermDataProvider termDataProvider, TermStateCodec codec) {}
 
   public class TermDataReader {
-    private final TermData[] termDataPerType;
-
-    private final byte[] metaDataBuffer;
-
-    private final byte[] dataBuffer;
-
-    TermDataReader() throws IOException {
-      termDataPerType = new TermData[termDataProviderAndCodecs.length];
-      int maxMetadataLengthSeen = 0;
-      int maxDataLengthSeen = 0;
-
-      for (int i = 0; i < termDataProviderAndCodecs.length; i++) {
-        if (termDataProviderAndCodecs[i] == null) {
-          continue;
+    private TermData[] termDataPerType;
+
+    private byte[] metaDataBuffer;
+
+    private byte[] dataBuffer;
+
+    void maybeInitBuffer() {
+      if (metaDataBuffer == null || dataBuffer == null) {
+        int maxMetadataLengthSeen = 0;
+        int maxDataLengthSeen = 0;
+        for (int i = 0; i < termDataProviderAndCodecs.length; i++) {
+          if (termDataProviderAndCodecs[i] == null) {
+            continue;
+          }
+          var codec = termDataProviderAndCodecs[i].codec;
+          maxMetadataLengthSeen = Math.max(maxMetadataLengthSeen, codec.getMetadataBytesLength());
+          maxDataLengthSeen = Math.max(maxDataLengthSeen, codec.getMaximumRecordSizeInBytes());
         }
-        var codec = termDataProviderAndCodecs[i].codec;
-        TermDataProvider termDataProvider = termDataProviderAndCodecs[i].termDataProvider;
-        termDataPerType[i] =
+        metaDataBuffer = 
new byte[maxMetadataLengthSeen]; + dataBuffer = new byte[maxDataLengthSeen]; + } + } + + TermData getTermData(int typeId) throws IOException { + if (termDataPerType == null) { + termDataPerType = new TermData[termDataProviderAndCodecs.length]; + } + if (termDataPerType[typeId] == null) { + TermDataProvider termDataProvider = termDataProviderAndCodecs[typeId].termDataProvider; + termDataPerType[typeId] = new TermData( termDataProvider.metadataProvider().newByteSlice(), termDataProvider.dataProvider().newByteSlice()); - maxMetadataLengthSeen = Math.max(maxDataLengthSeen, codec.getMetadataBytesLength()); - maxDataLengthSeen = Math.max(maxMetadataLengthSeen, codec.getMaximumRecordSizeInBytes()); } - metaDataBuffer = new byte[maxMetadataLengthSeen]; - dataBuffer = new byte[maxDataLengthSeen]; + return termDataPerType[typeId]; } IntBlockTermState getTermState(TermType termType, long ord, IndexOptions indexOptions) throws IOException { assert termDataProviderAndCodecs[termType.getId()] != null; - assert termDataPerType[termType.getId()] != null; + + maybeInitBuffer(); int typeId = termType.getId(); - var codec = termDataProviderAndCodecs[termType.getId()].codec; + var codec = termDataProviderAndCodecs[typeId].codec; + var termData = getTermData(typeId); IntBlockTermState termState = - termDataPerType[typeId].getTermStateWithBuffer(codec, ord, metaDataBuffer, dataBuffer); + termData.getTermStateWithBuffer(codec, ord, metaDataBuffer, dataBuffer); // need to filling some default values for the term state // in order to meet the expectations of the postings reader From 79c0fb32fede7f5932055c217f504bc950092f7e Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Sun, 26 Nov 2023 22:33:47 -0800 Subject: [PATCH 47/57] Implement BytesRefPrimitiveLongFSTEnum that works with a primitive long FST --- .../fst/BytesRefPrimitiveLongFSTEnum.java | 125 +++ .../lucene/util/fst/PrimitiveLongFSTEnum.java | 758 ++++++++++++++++++ .../java/org/apache/lucene/util/fst/Util.java | 29 + 3 files changed, 
912 insertions(+) create mode 100644 lucene/core/src/java/org/apache/lucene/util/fst/BytesRefPrimitiveLongFSTEnum.java create mode 100644 lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTEnum.java diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/BytesRefPrimitiveLongFSTEnum.java b/lucene/core/src/java/org/apache/lucene/util/fst/BytesRefPrimitiveLongFSTEnum.java new file mode 100644 index 000000000000..af34576b35b1 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/fst/BytesRefPrimitiveLongFSTEnum.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util.fst; + +import java.io.IOException; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; + +/** + * Enumerates all input (BytesRef) + output pairs in a {@link PrimitiveLongFST}. + * + * @lucene.experimental + */ +public final class BytesRefPrimitiveLongFSTEnum extends PrimitiveLongFSTEnum { + private final BytesRef current = new BytesRef(10); + private final InputOutput result = new InputOutput(); + private BytesRef target; + + /** Holds a single input (BytesRef) + output pair. 
*/ + public static class InputOutput { + public BytesRef input; + public long output; + } + + /** + * doFloor controls the behavior of advance: if it's true doFloor is true, advance positions to + * the biggest term before target. + */ + public BytesRefPrimitiveLongFSTEnum(PrimitiveLongFST fst) { + super(fst); + result.input = current; + current.offset = 1; + } + + public InputOutput current() { + return result; + } + + public InputOutput next() throws IOException { + // System.out.println(" enum.next"); + doNext(); + return setResult(); + } + + /** Seeks to smallest term that's >= target. */ + public InputOutput seekCeil(BytesRef target) throws IOException { + this.target = target; + targetLength = target.length; + super.doSeekCeil(); + return setResult(); + } + + /** Seeks to biggest term that's <= target. */ + public InputOutput seekFloor(BytesRef target) throws IOException { + this.target = target; + targetLength = target.length; + super.doSeekFloor(); + return setResult(); + } + + /** + * Seeks to exactly this term, returning null if the term doesn't exist. This is faster than using + * {@link #seekFloor} or {@link #seekCeil} because it short-circuits as soon the match is not + * found. 
+ */ + public InputOutput seekExact(BytesRef target) throws IOException { + this.target = target; + targetLength = target.length; + if (doSeekExact()) { + assert upto == 1 + target.length; + return setResult(); + } else { + return null; + } + } + + @Override + protected int getTargetLabel() { + if (upto - 1 == target.length) { + return FST.END_LABEL; + } else { + return target.bytes[target.offset + upto - 1] & 0xFF; + } + } + + @Override + protected int getCurrentLabel() { + // current.offset fixed at 1 + return current.bytes[upto] & 0xFF; + } + + @Override + protected void setCurrentLabel(int label) { + current.bytes[upto] = (byte) label; + } + + @Override + protected void grow() { + current.bytes = ArrayUtil.grow(current.bytes, upto + 1); + } + + private InputOutput setResult() { + if (upto == 0) { + result.output = -1; + } else { + current.length = upto - 1; + result.output = output[upto]; + } + return result; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTEnum.java b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTEnum.java new file mode 100644 index 000000000000..85c0815f964a --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTEnum.java @@ -0,0 +1,758 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util.fst; + +import static org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongArc.BitTable; + +import java.io.IOException; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongArc; + +/** + * Can next() and advance() through the terms in an {@link PrimitiveLongFST} + * + * @lucene.experimental + */ +abstract class PrimitiveLongFSTEnum { + protected final PrimitiveLongFST fst; + + protected PrimitiveLongArc[] arcs = new PrimitiveLongArc[10]; + + protected long[] output = new long[10]; + + protected final long NO_OUTPUT; + protected final FST.BytesReader fstReader; + + protected int upto; + int targetLength; + + /** + * doFloor controls the behavior of advance: if it's true doFloor is true, advance positions to + * the biggest term before target. 
+ */ + PrimitiveLongFSTEnum(PrimitiveLongFST fst) { + this.fst = fst; + fstReader = fst.getBytesReader(); + NO_OUTPUT = fst.outputs.getNoOutput(); + fst.getFirstArc(getArc(0)); + output[0] = NO_OUTPUT; + } + + protected abstract int getTargetLabel(); + + protected abstract int getCurrentLabel(); + + protected abstract void setCurrentLabel(int label); + + protected abstract void grow(); + + /** Rewinds enum state to match the shared prefix between current term and target term */ + private void rewindPrefix() throws IOException { + if (upto == 0) { + // System.out.println(" init"); + upto = 1; + fst.readFirstTargetArc(getArc(0), getArc(1), fstReader); + return; + } + // System.out.println(" rewind upto=" + upto + " vs targetLength=" + targetLength); + + final int currentLimit = upto; + upto = 1; + while (upto < currentLimit && upto <= targetLength + 1) { + final int cmp = getCurrentLabel() - getTargetLabel(); + if (cmp < 0) { + // seek forward + // System.out.println(" seek fwd"); + break; + } else if (cmp > 0) { + // seek backwards -- reset this arc to the first arc + final PrimitiveLongArc arc = getArc(upto); + fst.readFirstTargetArc(getArc(upto - 1), arc, fstReader); + // System.out.println(" seek first arc"); + break; + } + upto++; + } + // System.out.println(" fall through upto=" + upto); + } + + protected void doNext() throws IOException { + // System.out.println("FE: next upto=" + upto); + if (upto == 0) { + // System.out.println(" init"); + upto = 1; + fst.readFirstTargetArc(getArc(0), getArc(1), fstReader); + } else { + // pop + // System.out.println(" check pop curArc target=" + arcs[upto].target + " label=" + + // arcs[upto].label + " isLast?=" + arcs[upto].isLast()); + while (arcs[upto].isLast()) { + upto--; + if (upto == 0) { + // System.out.println(" eof"); + return; + } + } + fst.readNextArc(arcs[upto], fstReader); + } + + pushFirst(); + } + + // TODO: should we return a status here (SEEK_FOUND / SEEK_NOT_FOUND / + // SEEK_END)? 
saves the eq check above? + + /** Seeks to smallest term that's >= target. */ + protected void doSeekCeil() throws IOException { + + // System.out.println(" advance len=" + target.length + " curlen=" + current.length); + + // TODO: possibly caller could/should provide common + // prefix length? ie this work may be redundant if + // caller is in fact intersecting against its own + // automaton + + // System.out.println("FE.seekCeil upto=" + upto); + + // Save time by starting at the end of the shared prefix + // b/w our current term & the target: + rewindPrefix(); + // System.out.println(" after rewind upto=" + upto); + + PrimitiveLongArc arc = getArc(upto); + // System.out.println(" init targetLabel=" + targetLabel); + + // Now scan forward, matching the new suffix of the target + while (arc != null) { + int targetLabel = getTargetLabel(); + // System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) + // arc.label + ") vs targetLabel=" + targetLabel); + if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) { + // Arcs are in an array + final FST.BytesReader in = fst.getBytesReader(); + if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) { + arc = doSeekCeilArrayDirectAddressing(arc, targetLabel, in); + } else if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) { + arc = doSeekCeilArrayPacked(arc, targetLabel, in); + } else { + assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS; + arc = doSeekCeilArrayContinuous(arc, targetLabel, in); + } + } else { + arc = doSeekCeilList(arc, targetLabel); + } + } + } + + private PrimitiveLongArc doSeekCeilArrayContinuous( + final PrimitiveLongArc arc, final int targetLabel, final FST.BytesReader in) + throws IOException { + int targetIndex = targetLabel - arc.firstLabel(); + if (targetIndex >= arc.numArcs()) { + rollbackToLastForkThenPush(); + return null; + } else { + if (targetIndex < 0) { + fst.readArcByContinuous(arc, in, 0); + assert arc.label() > targetLabel; + pushFirst(); + return null; 
+ } else { + fst.readArcByContinuous(arc, in, targetIndex); + assert arc.label() == targetLabel; + // found -- copy pasta from below + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } + } + } + + private PrimitiveLongArc doSeekCeilArrayDirectAddressing( + final PrimitiveLongArc arc, final int targetLabel, final FST.BytesReader in) + throws IOException { + // The array is addressed directly by label, with presence bits to compute the actual arc + // offset. + + int targetIndex = targetLabel - arc.firstLabel(); + if (targetIndex >= arc.numArcs()) { + rollbackToLastForkThenPush(); + return null; + } else { + if (targetIndex < 0) { + targetIndex = -1; + } else if (BitTable.isBitSet(targetIndex, arc, in)) { + fst.readArcByDirectAddressing(arc, in, targetIndex); + assert arc.label() == targetLabel; + // found -- copy pasta from below + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } + // Not found, return the next arc (ceil). + int ceilIndex = BitTable.nextBitSet(targetIndex, arc, in); + assert ceilIndex != -1; + fst.readArcByDirectAddressing(arc, in, ceilIndex); + assert arc.label() > targetLabel; + pushFirst(); + return null; + } + } + + private PrimitiveLongArc doSeekCeilArrayPacked( + final PrimitiveLongArc arc, final int targetLabel, final FST.BytesReader in) + throws IOException { + // The array is packed -- use binary search to find the target. 
+ int idx = Util.binarySearch(fst, arc, targetLabel); + if (idx >= 0) { + // Match + fst.readArcByIndex(arc, in, idx); + assert arc.arcIdx() == idx; + assert arc.label() == targetLabel + : "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel + " mid=" + idx; + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } + idx = -1 - idx; + if (idx == arc.numArcs()) { + // Dead end + fst.readArcByIndex(arc, in, idx - 1); + assert arc.isLast(); + // Dead end (target is after the last arc); + // rollback to last fork then push + upto--; + while (true) { + if (upto == 0) { + return null; + } + final PrimitiveLongArc prevArc = getArc(upto); + // System.out.println(" rollback upto=" + upto + " arc.label=" + prevArc.label + " + // isLast?=" + prevArc.isLast()); + if (!prevArc.isLast()) { + fst.readNextArc(prevArc, fstReader); + pushFirst(); + return null; + } + upto--; + } + } else { + // Ceiling - arc with least higher label + fst.readArcByIndex(arc, in, idx); + assert arc.label() > targetLabel; + pushFirst(); + return null; + } + } + + private PrimitiveLongArc doSeekCeilList(final PrimitiveLongArc arc, final int targetLabel) + throws IOException { + // Arcs are not array'd -- must do linear scan: + if (arc.label() == targetLabel) { + // recurse + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } else if (arc.label() > targetLabel) { + pushFirst(); + return null; + } else if (arc.isLast()) { + // Dead end (target is after the last arc); + // rollback to last fork then push + upto--; + while (true) { + if (upto == 0) { + return null; + } + final PrimitiveLongArc prevArc = getArc(upto); + // System.out.println(" rollback 
upto=" + upto + " arc.label=" + prevArc.label + " + // isLast?=" + prevArc.isLast()); + if (!prevArc.isLast()) { + fst.readNextArc(prevArc, fstReader); + pushFirst(); + return null; + } + upto--; + } + } else { + // keep scanning + // System.out.println(" next scan"); + fst.readNextArc(arc, fstReader); + } + return arc; + } + + // Todo: should we return a status here (SEEK_FOUND / SEEK_NOT_FOUND / + // SEEK_END)? saves the eq check above? + /** Seeks to largest term that's <= target. */ + void doSeekFloor() throws IOException { + + // TODO: possibly caller could/should provide common + // prefix length? ie this work may be redundant if + // caller is in fact intersecting against its own + // automaton + // System.out.println("FE: seek floor upto=" + upto); + + // Save CPU by starting at the end of the shared prefix + // b/w our current term & the target: + rewindPrefix(); + + // System.out.println("FE: after rewind upto=" + upto); + + PrimitiveLongArc arc = getArc(upto); + + // System.out.println("FE: init targetLabel=" + targetLabel); + + // Now scan forward, matching the new suffix of the target + while (arc != null) { + // System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) + // arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast() + " bba=" + + // arc.bytesPerArc); + int targetLabel = getTargetLabel(); + + if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) { + // Arcs are in an array + final FST.BytesReader in = fst.getBytesReader(); + if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) { + arc = doSeekFloorArrayDirectAddressing(arc, targetLabel, in); + } else if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) { + arc = doSeekFloorArrayPacked(arc, targetLabel, in); + } else { + assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS; + arc = doSeekFloorContinuous(arc, targetLabel, in); + } + } else { + arc = doSeekFloorList(arc, targetLabel); + } + } + } + + private PrimitiveLongArc doSeekFloorContinuous( 
+ PrimitiveLongArc arc, int targetLabel, FST.BytesReader in) throws IOException { + int targetIndex = targetLabel - arc.firstLabel(); + if (targetIndex < 0) { + // Before first arc. + return backtrackToFloorArc(arc, targetLabel, in); + } else if (targetIndex >= arc.numArcs()) { + // After last arc. + fst.readLastArcByContinuous(arc, in); + assert arc.label() < targetLabel; + assert arc.isLast(); + pushLast(); + return null; + } else { + // Within label range. + fst.readArcByContinuous(arc, in, targetIndex); + assert arc.label() == targetLabel; + // found -- copy pasta from below + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } + } + + private PrimitiveLongArc doSeekFloorArrayDirectAddressing( + PrimitiveLongArc arc, int targetLabel, FST.BytesReader in) throws IOException { + // The array is addressed directly by label, with presence bits to compute the actual arc + // offset. + + int targetIndex = targetLabel - arc.firstLabel(); + if (targetIndex < 0) { + // Before first arc. + return backtrackToFloorArc(arc, targetLabel, in); + } else if (targetIndex >= arc.numArcs()) { + // After last arc. + fst.readLastArcByDirectAddressing(arc, in); + assert arc.label() < targetLabel; + assert arc.isLast(); + pushLast(); + return null; + } else { + // Within label range. + if (BitTable.isBitSet(targetIndex, arc, in)) { + fst.readArcByDirectAddressing(arc, in, targetIndex); + assert arc.label() == targetLabel; + // found -- copy pasta from below + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } + // Scan backwards to find a floor arc. 
+ int floorIndex = BitTable.previousBitSet(targetIndex, arc, in); + assert floorIndex != -1; + fst.readArcByDirectAddressing(arc, in, floorIndex); + assert arc.label() < targetLabel; + assert arc.isLast() || fst.readNextArcLabel(arc, in) > targetLabel; + pushLast(); + return null; + } + } + + /** + * Target is beyond the last arc, out of label range. Dead end (target is after the last arc); + * rollback to last fork then push + */ + private void rollbackToLastForkThenPush() throws IOException { + upto--; + while (true) { + if (upto == 0) { + return; + } + final PrimitiveLongArc prevArc = getArc(upto); + // System.out.println(" rollback upto=" + upto + " arc.label=" + prevArc.label + " + // isLast?=" + prevArc.isLast()); + if (!prevArc.isLast()) { + fst.readNextArc(prevArc, fstReader); + pushFirst(); + return; + } + upto--; + } + } + + /** + * Backtracks until it finds a node which first arc is before our target label.` Then on the node, + * finds the arc just before the targetLabel. + * + * @return null to continue the seek floor recursion loop. + */ + private PrimitiveLongArc backtrackToFloorArc( + PrimitiveLongArc arc, int targetLabel, final FST.BytesReader in) throws IOException { + while (true) { + // First, walk backwards until we find a node which first arc is before our target label. + fst.readFirstTargetArc(getArc(upto - 1), arc, fstReader); + if (arc.label() < targetLabel) { + // Then on this node, find the arc just before the targetLabel. 
+ if (!arc.isLast()) { + if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) { + if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) { + findNextFloorArcBinarySearch(arc, targetLabel, in); + } else if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) { + findNextFloorArcDirectAddressing(arc, targetLabel, in); + } else { + assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS; + findNextFloorArcContinuous(arc, targetLabel, in); + } + } else { + while (!arc.isLast() && fst.readNextArcLabel(arc, in) < targetLabel) { + fst.readNextArc(arc, fstReader); + } + } + } + assert arc.label() < targetLabel; + assert arc.isLast() || fst.readNextArcLabel(arc, in) >= targetLabel; + pushLast(); + return null; + } + upto--; + if (upto == 0) { + return null; + } + targetLabel = getTargetLabel(); + arc = getArc(upto); + } + } + + /** + * Finds and reads an arc on the current node which label is strictly less than the given label. + * Skips the first arc, finds next floor arc; or none if the floor arc is the first arc itself (in + * this case it has already been read). + * + *
<p>
Precondition: the given arc is the first arc of the node. + */ + private void findNextFloorArcDirectAddressing( + PrimitiveLongArc arc, int targetLabel, final FST.BytesReader in) throws IOException { + assert arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING; + assert arc.label() != FST.END_LABEL; + assert arc.label() == arc.firstLabel(); + if (arc.numArcs() > 1) { + int targetIndex = targetLabel - arc.firstLabel(); + assert targetIndex >= 0; + if (targetIndex >= arc.numArcs()) { + // Beyond last arc. Take last arc. + fst.readLastArcByDirectAddressing(arc, in); + } else { + // Take the preceding arc, even if the target is present. + int floorIndex = BitTable.previousBitSet(targetIndex, arc, in); + if (floorIndex > 0) { + fst.readArcByDirectAddressing(arc, in, floorIndex); + } + } + } + } + + /** Same as {@link #findNextFloorArcDirectAddressing} for continuous node. */ + private void findNextFloorArcContinuous( + PrimitiveLongArc arc, int targetLabel, final FST.BytesReader in) throws IOException { + assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS; + assert arc.label() != FST.END_LABEL; + assert arc.label() == arc.firstLabel(); + if (arc.numArcs() > 1) { + int targetIndex = targetLabel - arc.firstLabel(); + assert targetIndex >= 0; + if (targetIndex >= arc.numArcs()) { + // Beyond last arc. Take last arc. + fst.readLastArcByContinuous(arc, in); + } else { + fst.readArcByContinuous(arc, in, targetIndex - 1); + } + } + } + + /** Same as {@link #findNextFloorArcDirectAddressing} for binary search node. 
*/ + private void findNextFloorArcBinarySearch( + PrimitiveLongArc arc, int targetLabel, FST.BytesReader in) throws IOException { + assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH; + assert arc.label() != FST.END_LABEL; + assert arc.arcIdx() == 0; + if (arc.numArcs() > 1) { + int idx = Util.binarySearch(fst, arc, targetLabel); + assert idx != -1; + if (idx > 1) { + fst.readArcByIndex(arc, in, idx - 1); + } else if (idx < -2) { + fst.readArcByIndex(arc, in, -2 - idx); + } + } + } + + private PrimitiveLongArc doSeekFloorArrayPacked( + PrimitiveLongArc arc, int targetLabel, final FST.BytesReader in) throws IOException { + // Arcs are fixed array -- use binary search to find the target. + int idx = Util.binarySearch(fst, arc, targetLabel); + + if (idx >= 0) { + // Match -- recurse + // System.out.println(" match! arcIdx=" + idx); + fst.readArcByIndex(arc, in, idx); + assert arc.arcIdx() == idx; + assert arc.label() == targetLabel + : "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel + " mid=" + idx; + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } else if (idx == -1) { + // Before first arc. + return backtrackToFloorArc(arc, targetLabel, in); + } else { + // There is a floor arc; idx will be (-1 - (floor + 1)). 
+ fst.readArcByIndex(arc, in, -2 - idx); + assert arc.isLast() || fst.readNextArcLabel(arc, in) > targetLabel; + assert arc.label() < targetLabel + : "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel; + pushLast(); + return null; + } + } + + private PrimitiveLongArc doSeekFloorList(PrimitiveLongArc arc, int targetLabel) + throws IOException { + if (arc.label() == targetLabel) { + // Match -- recurse + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } else if (arc.label() > targetLabel) { + // TODO: if each arc could somehow read the arc just + // before, we can save this re-scan. The ceil case + // doesn't need this because it reads the next arc + // instead: + while (true) { + // First, walk backwards until we find a first arc + // that's before our target label: + fst.readFirstTargetArc(getArc(upto - 1), arc, fstReader); + if (arc.label() < targetLabel) { + // Then, scan forwards to the arc just before + // the targetLabel: + while (!arc.isLast() && fst.readNextArcLabel(arc, fstReader) < targetLabel) { + fst.readNextArc(arc, fstReader); + } + pushLast(); + return null; + } + upto--; + if (upto == 0) { + return null; + } + targetLabel = getTargetLabel(); + arc = getArc(upto); + } + } else if (!arc.isLast()) { + // System.out.println(" check next label=" + fst.readNextArcLabel(arc) + " (" + (char) + // fst.readNextArcLabel(arc) + ")"); + if (fst.readNextArcLabel(arc, fstReader) > targetLabel) { + pushLast(); + return null; + } else { + // keep scanning + return fst.readNextArc(arc, fstReader); + } + } else { + pushLast(); + return null; + } + } + + /** Seeks to exactly target term. */ + boolean doSeekExact() throws IOException { + + // TODO: possibly caller could/should provide common + // prefix length? 
ie this work may be redundant if + // caller is in fact intersecting against its own + // automaton + + // System.out.println("FE: seek exact upto=" + upto); + + // Save time by starting at the end of the shared prefix + // b/w our current term & the target: + rewindPrefix(); + + // System.out.println("FE: after rewind upto=" + upto); + PrimitiveLongArc arc = getArc(upto - 1); + int targetLabel = getTargetLabel(); + + final FST.BytesReader fstReader = fst.getBytesReader(); + + while (true) { + // System.out.println(" cycle target=" + (targetLabel == -1 ? "-1" : (char) targetLabel)); + final PrimitiveLongArc nextArc = fst.findTargetArc(targetLabel, arc, getArc(upto), fstReader); + if (nextArc == null) { + // short circuit + // upto--; + // upto = 0; + fst.readFirstTargetArc(arc, getArc(upto), fstReader); + // System.out.println(" no match upto=" + upto); + return false; + } + // Match -- recurse: + output[upto] = fst.outputs.add(output[upto - 1], nextArc.output()); + if (targetLabel == FST.END_LABEL) { + // System.out.println(" return found; upto=" + upto + " output=" + output[upto] + " + // nextArc=" + nextArc.isLast()); + return true; + } + setCurrentLabel(targetLabel); + incr(); + targetLabel = getTargetLabel(); + arc = nextArc; + } + } + + private void incr() { + upto++; + grow(); + if (arcs.length <= upto) { + @SuppressWarnings({"rawtypes", "unchecked"}) + final PrimitiveLongArc[] newArcs = + new PrimitiveLongArc + [ArrayUtil.oversize(1 + upto, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(arcs, 0, newArcs, 0, arcs.length); + arcs = newArcs; + } + if (output.length <= upto) { + @SuppressWarnings({"rawtypes", "unchecked"}) + final long[] newOutput = + new long[ArrayUtil.oversize(1 + upto, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(output, 0, newOutput, 0, output.length); + output = newOutput; + } + } + + // Appends current arc, and then recurses from its target, + // appending first arc all the way to the final node + private 
void pushFirst() throws IOException { + + PrimitiveLongArc arc = arcs[upto]; + assert arc != null; + + while (true) { + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (arc.label() == FST.END_LABEL) { + // Final node + break; + } + // System.out.println(" pushFirst label=" + (char) arc.label + " upto=" + upto + " output=" + + // fst.outputs.outputToString(output[upto])); + setCurrentLabel(arc.label()); + incr(); + + final PrimitiveLongArc nextArc = getArc(upto); + fst.readFirstTargetArc(arc, nextArc, fstReader); + arc = nextArc; + } + } + + // Recurses from current arc, appending last arc all the + // way to the first final node + private void pushLast() throws IOException { + + PrimitiveLongArc arc = arcs[upto]; + assert arc != null; + + while (true) { + setCurrentLabel(arc.label()); + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (arc.label() == FST.END_LABEL) { + // Final node + break; + } + incr(); + + arc = fst.readLastTargetArc(arc, getArc(upto), fstReader); + } + } + + private PrimitiveLongArc getArc(int idx) { + if (arcs[idx] == null) { + arcs[idx] = new PrimitiveLongArc(); + } + return arcs[idx]; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java index 9fdc460d0583..740460679668 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java @@ -934,4 +934,33 @@ static int binarySearch(FST fst, FST.Arc arc, int targetLabel) throws } return -1 - low; } + + /** Same as {@link Util#binarySearch(FST, Arc, int)} but for {@link PrimitiveLongFST} */ + static int binarySearch( + PrimitiveLongFST fst, PrimitiveLongFST.PrimitiveLongArc arc, int targetLabel) + throws IOException { + assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH + : "Arc is not encoded as packed array for binary search (nodeFlags=" + + arc.nodeFlags() + + ")"; + BytesReader in = 
fst.getBytesReader(); + int low = arc.arcIdx(); + int mid; + int high = arc.numArcs() - 1; + while (low <= high) { + mid = (low + high) >>> 1; + in.setPosition(arc.posArcsStart()); + in.skipBytes((long) arc.bytesPerArc() * mid + 1); + final int midLabel = fst.readLabel(in); + final int cmp = midLabel - targetLabel; + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + return mid; + } + } + return -1 - low; + } } From b74a05dedef9c13b576b68621d69e83848b5f644 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Sun, 26 Nov 2023 23:35:16 -0800 Subject: [PATCH 48/57] Fix getFirstArc() bug in PrimitiveLongFST. --- .../lucene/util/fst/PrimitiveLongFST.java | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFST.java b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFST.java index c4a188fc58e6..900675090f97 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFST.java @@ -503,11 +503,10 @@ public void save(DataOutput metaOut, DataOutput out) throws IOException { */ public void saveMetadata(DataOutput metaOut) throws IOException { CodecUtil.writeHeader(metaOut, FILE_FORMAT_NAME, VERSION_CURRENT); - - // Accepts empty string - metaOut.writeByte((byte) 1); - if (metadata.emptyOutput != null) { + // Accepts empty string + metaOut.writeByte((byte) 1); + // Serialize empty-string output: ByteBuffersDataOutput ros = new ByteBuffersDataOutput(); outputs.writeFinalOutput(metadata.emptyOutput.longValue(), ros); @@ -607,14 +606,16 @@ private void readPresenceBytes(PrimitiveLongArc arc, BytesReader in) throws IOEx public PrimitiveLongArc getFirstArc(PrimitiveLongArc arc) { long NO_OUTPUT = outputs.getNoOutput(); - arc.flags = BIT_FINAL_ARC | BIT_LAST_ARC; if (metadata.emptyOutput != null) { + arc.flags = BIT_FINAL_ARC | BIT_LAST_ARC; arc.nextFinalOutput = 
metadata.emptyOutput.longValue(); + if (metadata.emptyOutput.longValue() != NO_OUTPUT) { + arc.flags = (byte) (arc.flags() | BIT_ARC_HAS_FINAL_OUTPUT); + } + } else { + arc.flags = BIT_LAST_ARC; + arc.nextFinalOutput = NO_OUTPUT; } - if (metadata.emptyOutput != null && metadata.emptyOutput.longValue() != NO_OUTPUT) { - arc.flags = (byte) (arc.flags() | BIT_ARC_HAS_FINAL_OUTPUT); - } - arc.output = NO_OUTPUT; // If there are no nodes, ie, the FST only accepts the From f328e9f11e1eabee2db52068d45bc05cb6249209 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 27 Nov 2023 14:33:37 -0800 Subject: [PATCH 49/57] Reuse single IntBlockTermState in TermDataReader --- .../lucene99/randomaccess/TermData.java | 13 +++++++--- .../randomaccess/TermDataReaderProvider.java | 4 +++- .../lucene99/randomaccess/TermStateCodec.java | 11 +++++++++ .../randomaccess/TermStateCodecImpl.java | 24 +++++++++++++++---- 4 files changed, 43 insertions(+), 9 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java index 1b9a8c7406d8..06cf69da9aa1 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermData.java @@ -47,8 +47,13 @@ IntBlockTermState getTermState(TermStateCodec codec, long ord) throws IOExceptio return codec.decodeAt(metadataBytesRef, dataBytesRef, BitUnpackerImpl.INSTANCE, startBitIndex); } - IntBlockTermState getTermStateWithBuffer( - TermStateCodec codec, long ord, byte[] metaDataBuffer, byte[] dataBuffer) throws IOException { + IntBlockTermState getTermStateWithBufferAndReuse( + TermStateCodec codec, + long ord, + byte[] metaDataBuffer, + byte[] dataBuffer, + IntBlockTermState reuse) + throws IOException { long blockId = ord / TermDataWriter.NUM_TERMS_PER_BLOCK; long metadataStartPos = blockId 
* (codec.getMetadataBytesLength() + 8); long dataStartPos = metadata.getLong(metadataStartPos); @@ -67,6 +72,8 @@ IntBlockTermState getTermStateWithBuffer( data.readBytesTo(dataBuffer, dataStartPos + dataBitIndex / 8, numBytesToRead); BytesRef dataBytesRef = new BytesRef(dataBuffer, 0, numBytesToRead); - return codec.decodeAt(metadataBytesRef, dataBytesRef, BitUnpackerImpl.INSTANCE, startBitIndex); + codec.decodeAtWithReuse( + metadataBytesRef, dataBytesRef, BitUnpackerImpl.INSTANCE, startBitIndex, reuse); + return reuse; } } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java index 633c44cef09f..7d66fcd6abc6 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermDataReaderProvider.java @@ -80,6 +80,8 @@ public class TermDataReader { private byte[] dataBuffer; + private IntBlockTermState reuse = new IntBlockTermState(); + void maybeInitBuffer() { if (metaDataBuffer == null || dataBuffer == null) { int maxMetadataLengthSeen = 0; @@ -121,7 +123,7 @@ IntBlockTermState getTermState(TermType termType, long ord, IndexOptions indexOp var codec = termDataProviderAndCodecs[typeId].codec; var termData = getTermData(typeId); IntBlockTermState termState = - termData.getTermStateWithBuffer(codec, ord, metaDataBuffer, dataBuffer); + termData.getTermStateWithBufferAndReuse(codec, ord, metaDataBuffer, dataBuffer, reuse); // need to filling some default values for the term state // in order to meet the expectations of the postings reader diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java index 
1ef79ab7f158..081b5917b3c4 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodec.java @@ -85,4 +85,15 @@ IntBlockTermState decodeWithinBlock( */ IntBlockTermState decodeAt( BytesRef metadataBytes, BytesRef dataBytes, BitUnpacker bitUnpacker, int startBitIndex); + + /** + * Like {@link TermStateCodec#decodeAt(BytesRef, BytesRef, BitUnpacker, int)} but with a caller + * provided `IntBlockTermState` instead of returning a allocated one. + */ + void decodeAtWithReuse( + BytesRef metadataBytes, + BytesRef dataBytes, + BitUnpacker bitUnpacker, + int startBitIndex, + IntBlockTermState reuse); } diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java index adef80cba696..15fa3cbd9dde 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermStateCodecImpl.java @@ -212,11 +212,27 @@ public IntBlockTermState decodeWithinBlock( @Override public IntBlockTermState decodeAt( BytesRef metadataBytes, BytesRef dataBytes, BitUnpacker bitUnpacker, int startBitIndex) { - assert metadataBytes.length == this.metadataBytesLength; - int upto = metadataBytes.offset; IntBlockTermState decoded = new IntBlockTermState(); + decodeAtWithReuse(metadataBytes, dataBytes, bitUnpacker, startBitIndex, decoded); + + return decoded; + } + + @Override + public void decodeAtWithReuse( + BytesRef metadataBytes, + BytesRef dataBytes, + BitUnpacker bitUnpacker, + int startBitIndex, + IntBlockTermState reuse) { + assert metadataBytes.length == this.metadataBytesLength; + + reuse.lastPosBlockOffset = -1; + reuse.skipOffset = -1; + reuse.singletonDocID = 
-1; + int upto = metadataBytes.offset; for (int i = 0; i < components.length; i++) { var component = components[i]; int bitWidth = metadataBytes.bytes[upto++]; @@ -225,11 +241,9 @@ public IntBlockTermState decodeAt( val += (long) BitUtil.VH_LE_LONG.get(metadataBytes.bytes, upto); upto += 8; } - component.setTargetValue(decoded, val); + component.setTargetValue(reuse, val); startBitIndex += bitWidth; } - - return decoded; } private record Metadata(byte bitWidth, long referenceValue) {} From 0aadef5d7aa0ad462df38c75655c9f725f3873cc Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Sun, 26 Nov 2023 20:00:58 -0800 Subject: [PATCH 50/57] Don't create slice description when requesting random-access input slice Profiling show lots of allocation to build a name for such slice --- lucene/core/src/java/org/apache/lucene/store/IndexInput.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/store/IndexInput.java b/lucene/core/src/java/org/apache/lucene/store/IndexInput.java index 3f703bc54b26..4307376cffbf 100644 --- a/lucene/core/src/java/org/apache/lucene/store/IndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/IndexInput.java @@ -141,7 +141,7 @@ protected String getFullSliceDescription(String sliceDescription) { * implements absolute reads as seek+read. 
*/ public RandomAccessInput randomAccessSlice(long offset, long length) throws IOException { - final IndexInput slice = slice("randomaccess", offset, length); + final IndexInput slice = slice(null, offset, length); if (slice instanceof RandomAccessInput) { // slice() already supports random access return (RandomAccessInput) slice; From e70e712707ac342b788dc2a86b61c2e092c7320a Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Sun, 26 Nov 2023 22:40:18 -0800 Subject: [PATCH 51/57] Use primitive long FST for term lookup to avoid allocation from boxing-unboxing --- .../lucene99/randomaccess/RandomAccessTermsDict.java | 7 ++++--- .../sandbox/codecs/lucene99/randomaccess/TermsImpl.java | 8 ++++---- .../lucene/util/fst/BytesRefPrimitiveLongFSTEnum.java | 4 ++-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java index f767c2d4ed99..da48eb1f57e1 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/RandomAccessTermsDict.java @@ -27,7 +27,7 @@ /** A term dictionary that offer random-access to read a specific term */ record RandomAccessTermsDict( TermsStats termsStats, - TermsIndex termsIndex, + TermsIndexPrimitive termsIndex, TermDataReaderProvider termDataReaderProvider, IndexOptions indexOptions) { @@ -52,9 +52,10 @@ static RandomAccessTermsDict deserialize( boolean hasPayloads = indexOptionsProvider.hasPayloads(termsStats.fieldNumber()); // (2) deserialize terms index - TermsIndex termsIndex = null; + TermsIndexPrimitive termsIndex = null; if (termsStats.size() > 0) { - termsIndex = TermsIndex.deserialize(metaInput, termIndexInput, /* load off heap */ true); + termsIndex = + 
TermsIndexPrimitive.deserialize(metaInput, termIndexInput, /* load off heap */ true); } // (3) deserialize all the term data by each TermType diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java index d3977e4d5252..36387d47d32c 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -29,7 +29,7 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.fst.BytesRefFSTEnum; +import org.apache.lucene.util.fst.BytesRefPrimitiveLongFSTEnum; final class TermsImpl extends Terms { private final FieldInfo fieldInfo; @@ -120,9 +120,9 @@ final class RandomAccessTermsEnum extends TermsEnum { private IntBlockTermState termState; - private final BytesRefFSTEnum fstEnum; + private final BytesRefPrimitiveLongFSTEnum fstEnum; - private BytesRefFSTEnum.InputOutput fstSeekState; + private BytesRefPrimitiveLongFSTEnum.InputOutput fstSeekState; // Only set when seekExact(term, state) is called, because that will update // the termState but leave the fstSeekState out of sync. 
@@ -133,7 +133,7 @@ final class RandomAccessTermsEnum extends TermsEnum { RandomAccessTermsEnum() throws IOException { termState = (IntBlockTermState) lucene99PostingsReader.newTermState(); - fstEnum = new BytesRefFSTEnum<>(termsDict.termsIndex().fst()); + fstEnum = new BytesRefPrimitiveLongFSTEnum(termsDict.termsIndex().primitiveLongFST()); termDataReader = termsDict.termDataReaderProvider().newReader(); } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/BytesRefPrimitiveLongFSTEnum.java b/lucene/core/src/java/org/apache/lucene/util/fst/BytesRefPrimitiveLongFSTEnum.java index af34576b35b1..1aa5b03e7bb5 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/BytesRefPrimitiveLongFSTEnum.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/BytesRefPrimitiveLongFSTEnum.java @@ -115,11 +115,11 @@ protected void grow() { private InputOutput setResult() { if (upto == 0) { - result.output = -1; + return null; } else { current.length = upto - 1; result.output = output[upto]; + return result; } - return result; } } From 3d21b1a01c79fb8b17861403f2633bca96cf1047 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 27 Nov 2023 13:58:11 -0800 Subject: [PATCH 52/57] Make RAFDirectory resilient to `null` description when slicing --- .../src/java/org/apache/lucene/misc/store/RAFDirectory.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lucene/misc/src/java/org/apache/lucene/misc/store/RAFDirectory.java b/lucene/misc/src/java/org/apache/lucene/misc/store/RAFDirectory.java index 420d6d40d6de..21ba55fd08ab 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/store/RAFDirectory.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/store/RAFDirectory.java @@ -140,7 +140,8 @@ public IndexInput slice(String sliceDescription, long offset, long length) throw throw new IllegalArgumentException( "slice() " + sliceDescription + " out of bounds: " + this); } - return new RAFIndexInput(sliceDescription, file, off + offset, length, 
getBufferSize()); + String description = sliceDescription == null ? toString() : sliceDescription; + return new RAFIndexInput(description, file, off + offset, length, getBufferSize()); } @Override From cd60a4f4338a314f86a327d243f4b634a0c65d67 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Tue, 28 Nov 2023 00:09:49 -0800 Subject: [PATCH 53/57] Implement interesect --- .../lucene99/randomaccess/TermsImpl.java | 388 +++++++++++++++++- .../java/org/apache/lucene/util/fst/Util.java | 86 +++- 2 files changed, 466 insertions(+), 8 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java index 36387d47d32c..9ddb4a9c8d77 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -27,9 +27,19 @@ import org.apache.lucene.index.TermState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.automaton.ByteRunnable; +import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.fst.BytesRefPrimitiveLongFSTEnum; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.PrimitiveLongFST; +import org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongArc; +import org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongFSTOutputs; +import org.apache.lucene.util.fst.Util; final class TermsImpl extends Terms { private final FieldInfo fieldInfo; @@ -104,12 +114,13 @@ public TermsEnum iterator() throws IOException { return new RandomAccessTermsEnum(); } - // 
TODO: implement a more efficient version via FST - // @Override - // public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException - // { - // return null; - // } + @Override + public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { + throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); + } + return new RandomAccessIntersectTermsEnum(compiled, startTerm); + } final class RandomAccessTermsEnum extends TermsEnum { private AttributeSource attrs; @@ -245,4 +256,369 @@ public void seekExact(long ord) throws IOException { throw new UnsupportedOperationException("By ord lookup not supported."); } } + + final class RandomAccessIntersectTermsEnum extends TermsEnum { + private AttributeSource attrs; + + private BytesRefBuilder term; + + private boolean isTermStateCurrent; + + private IntBlockTermState termState; + + private final PrimitiveLongFST fst; + + private final FST.BytesReader fstReader; + + private final ByteRunnable fsa; + + private PrimitiveLongFSTOutputs fstOutputs = PrimitiveLongFSTOutputs.getSingleton(); + + private final TermDataReaderProvider.TermDataReader termDataReader; + + private Frame[] stack; + + private int level; + + private boolean pending; + + private final class Frame { + /* fst stats */ + PrimitiveLongArc fstArc; + long output; + /* automaton stats */ + int fsaState; + + Frame() { + this.fstArc = new PrimitiveLongArc(); + this.fsaState = -1; + } + + @Override + public String toString() { + return "arc=" + fstArc + " state=" + fsaState; + } + } + + /** + * Inspired by {@link org.apache.lucene.codecs.memory.FSTTermsReader}'s IntersectTermsEnum + */ + RandomAccessIntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) + throws IOException { + termState = (IntBlockTermState) lucene99PostingsReader.newTermState(); + fst = termsDict.termsIndex().primitiveLongFST(); + 
fstReader = fst.getBytesReader(); + fsa = compiled.getByteRunnable(); + termDataReader = termsDict.termDataReaderProvider().newReader(); + + stack = new Frame[16]; + for (int i = 0; i < stack.length; i++) { + this.stack[i] = new Frame(); + } + loadVirtualFrame(newFrame()); + level = 0; + + pushFrame(loadFirstFrame(newFrame())); + if (startTerm == null) { + pending = isAccept(topFrame()); + } else { + doSeekCeil(startTerm); + pending = + (term == null || !startTerm.equals(term.get())) + && isValid(topFrame()) + && isAccept(topFrame()); + } + } + + @Override + public BytesRef next() throws IOException { + if (pending) { + pending = false; + updateTermStateIfNeeded(); + return term(); + } + isTermStateCurrent = false; + DFS: + while (level > 0) { + Frame frame = newFrame(); + if (loadExpandFrame(topFrame(), frame) != null) { // has valid target + pushFrame(frame); + if (isAccept(frame)) { // gotcha + break; + } + continue; // check next target + } + frame = popFrame(); + while (level > 0) { + if (loadNextFrame(topFrame(), frame) != null) { // has valid sibling + pushFrame(frame); + if (isAccept(frame)) { // gotcha + break DFS; + } + continue DFS; // check next target + } + frame = popFrame(); + } + return null; + } + if (term != null) { + updateTermStateIfNeeded(); + } + return term(); + } + + private long accumulateOutput() { + long output = 0; + int upto = 0; + Frame last, next; + last = stack[1]; + while (upto != level) { + upto++; + next = stack[upto]; + output = fstOutputs.add(next.output, output); + last = next; + } + if (last.fstArc.isFinal()) { + output = fstOutputs.add(output, last.fstArc.nextFinalOutput()); + } + return output; + } + + private BytesRef doSeekCeil(BytesRef target) throws IOException { + Frame frame = null; + int label, upto = 0, limit = target.length; + while (upto < limit) { // to target prefix, or ceil label (rewind prefix) + frame = newFrame(); + label = target.bytes[target.offset + upto] & 0xff; + frame = loadCeilFrame(label, topFrame(), 
frame); + if (frame == null || frame.fstArc.label() != label) { + break; + } + assert isValid(frame); // target must be fetched from automaton + pushFrame(frame); + upto++; + } + if (upto == limit) { // got target + return term(); + } + if (frame != null) { // got larger term('s prefix) + pushFrame(frame); + return isAccept(frame) ? term() : next(); + } + while (level > 0) { // got target's prefix, advance to larger term + frame = popFrame(); + while (level > 0 && !canRewind(frame)) { + frame = popFrame(); + } + if (loadNextFrame(topFrame(), frame) != null) { + pushFrame(frame); + return isAccept(frame) ? term() : next(); + } + } + return null; + } + + /** Load frame for target arc(node) on fst */ + Frame loadExpandFrame(Frame top, Frame frame) throws IOException { + if (!canGrow(top)) { + return null; + } + frame.fstArc = fst.readFirstRealTargetArc(top.fstArc.target(), frame.fstArc, fstReader); + frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label()); + // if (TEST) System.out.println(" loadExpand frame="+frame); + if (frame.fsaState == -1) { + return loadNextFrame(top, frame); + } + frame.output = frame.fstArc.output(); + return frame; + } + + Frame loadCeilFrame(int label, Frame top, Frame frame) throws IOException { + PrimitiveLongArc arc = frame.fstArc; + arc = Util.readCeilArc(label, fst, top.fstArc, arc, fstReader); + if (arc == null) { + return null; + } + frame.fsaState = fsa.step(top.fsaState, arc.label()); + if (frame.fsaState == -1) { + return loadNextFrame(top, frame); + } + frame.output = frame.fstArc.output(); + return frame; + } + + /** Load frame for sibling arc(node) on fst */ + Frame loadNextFrame(Frame top, Frame frame) throws IOException { + if (!canRewind(frame)) { + return null; + } + while (!frame.fstArc.isLast()) { + frame.fstArc = fst.readNextRealArc(frame.fstArc, fstReader); + frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label()); + if (frame.fsaState != -1) { + break; + } + } + if (frame.fsaState == -1) { + return null; + 
} + frame.output = frame.fstArc.output(); + return frame; + } + + void updateTermStateIfNeeded() throws IOException { + if (!isTermStateCurrent) { + long fstOutput = accumulateOutput(); + TermsIndex.TypeAndOrd typeAndOrd = TermsIndex.decodeLong(fstOutput); + termState = + termDataReader.getTermState( + typeAndOrd.termType(), typeAndOrd.ord(), fieldInfo.getIndexOptions()); + isTermStateCurrent = true; + } + } + + @Override + public AttributeSource attributes() { + if (attrs == null) { + attrs = new AttributeSource(); + } + return attrs; + } + + @Override + public boolean seekExact(BytesRef text) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public BytesRef term() throws IOException { + return term == null ? null : term.get(); + } + + @Override + public int docFreq() throws IOException { + updateTermStateIfNeeded(); + return termState.docFreq; + } + + @Override + public long totalTermFreq() throws IOException { + updateTermStateIfNeeded(); + return termState.totalTermFreq; + } + + @Override + public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { + updateTermStateIfNeeded(); + return lucene99PostingsReader.postings(fieldInfo, termState, reuse, flags); + } + + @Override + public ImpactsEnum impacts(int flags) throws IOException { + updateTermStateIfNeeded(); + return lucene99PostingsReader.impacts(fieldInfo, termState, flags); + } + + @Override + public TermState termState() throws IOException { + updateTermStateIfNeeded(); + return termState.clone(); + } + + /** Virtual frame, never pop */ + Frame loadVirtualFrame(Frame frame) { + frame.output = fstOutputs.getNoOutput(); + frame.fsaState = -1; + return frame; + } + + Frame newFrame() { + if (level + 1 == stack.length) { + final Frame[] temp = + new Frame[ArrayUtil.oversize(level + 2, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(stack, 0, temp, 0, stack.length); + for (int i = stack.length; i < temp.length; i++) { + temp[i] = new 
Frame(); + } + stack = temp; + } + return stack[level + 1]; + } + + Frame topFrame() { + return stack[level]; + } + + boolean isAccept(Frame frame) { // reach a term both fst&fsa accepts + return fsa.isAccept(frame.fsaState) && frame.fstArc.isFinal(); + } + + boolean isValid(Frame frame) { // reach a prefix both fst&fsa won't reject + return /*frame != null &&*/ frame.fsaState != -1; + } + + boolean canGrow(Frame frame) { // can walk forward on both fst&fsa + return frame.fsaState != -1 && PrimitiveLongFST.targetHasArcs(frame.fstArc); + } + + boolean canRewind(Frame frame) { // can jump to sibling + return !frame.fstArc.isLast(); + } + + void pushFrame(Frame frame) { + term = grow(frame.fstArc.label()); + level++; + } + + Frame popFrame() { + term = shrink(); + level--; + return stack[level + 1]; + } + + Frame loadFirstFrame(Frame frame) { + frame.fstArc = fst.getFirstArc(frame.fstArc); + frame.output = frame.fstArc.output(); + frame.fsaState = 0; + return frame; + } + + BytesRefBuilder grow(int label) { + if (term == null) { + term = new BytesRefBuilder(); + } else { + term.append((byte) label); + } + return term; + } + + BytesRefBuilder shrink() { + if (term.length() == 0) { + term = null; + } else { + term.setLength(term.length() - 1); + } + return term; + } + + @Override + public long ord() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void seekExact(long ord) throws IOException { + throw new UnsupportedOperationException("By ord lookup not supported."); + } + + @Override + public SeekStatus seekCeil(BytesRef text) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void seekExact(BytesRef target, TermState state) throws IOException { + throw new UnsupportedOperationException(); + } + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java index 740460679668..31c267234e69 100644 --- 
a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java @@ -32,6 +32,7 @@ import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.FST.Arc; import org.apache.lucene.util.fst.FST.BytesReader; +import org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongArc; /** * Static helper methods. @@ -896,6 +897,88 @@ public static Arc readCeilArc( } } + /** + * TODO: can we work around this??? + * + *

Same as {@link Util#readCeilArc(int, FST, Arc, Arc, BytesReader)} but for {@link + * PrimitiveLongFST} + */ + public static PrimitiveLongArc readCeilArc( + int label, + PrimitiveLongFST fst, + PrimitiveLongArc follow, + PrimitiveLongArc arc, + BytesReader in) + throws IOException { + if (label == PrimitiveLongFST.END_LABEL) { + return PrimitiveLongFST.readEndArc(follow, arc); + } + if (!PrimitiveLongFST.targetHasArcs(follow)) { + return null; + } + fst.readFirstTargetArc(follow, arc, in); + if (arc.bytesPerArc() != 0 && arc.label() != PrimitiveLongFST.END_LABEL) { + if (arc.nodeFlags() == PrimitiveLongFST.ARCS_FOR_DIRECT_ADDRESSING) { + // Fixed length arcs in a direct addressing node. + int targetIndex = label - arc.label(); + if (targetIndex >= arc.numArcs()) { + return null; + } else if (targetIndex < 0) { + return arc; + } else { + if (PrimitiveLongArc.BitTable.isBitSet(targetIndex, arc, in)) { + fst.readArcByDirectAddressing(arc, in, targetIndex); + assert arc.label() == label; + } else { + int ceilIndex = PrimitiveLongArc.BitTable.nextBitSet(targetIndex, arc, in); + assert ceilIndex != -1; + fst.readArcByDirectAddressing(arc, in, ceilIndex); + assert arc.label() > label; + } + return arc; + } + } else if (arc.nodeFlags() == PrimitiveLongFST.ARCS_FOR_CONTINUOUS) { + int targetIndex = label - arc.label(); + if (targetIndex >= arc.numArcs()) { + return null; + } else if (targetIndex < 0) { + return arc; + } else { + fst.readArcByContinuous(arc, in, targetIndex); + assert arc.label() == label; + return arc; + } + } + // Fixed length arcs in a binary search node. + int idx = binarySearch(fst, arc, label); + if (idx >= 0) { + return fst.readArcByIndex(arc, in, idx); + } + idx = -1 - idx; + if (idx == arc.numArcs()) { + // DEAD END! + return null; + } + return fst.readArcByIndex(arc, in, idx); + } + + // Variable length arcs in a linear scan list, + // or special arc with label == FST.END_LABEL. 
+ fst.readFirstRealTargetArc(follow.target(), arc, in); + + while (true) { + // System.out.println(" non-bs cycle"); + if (arc.label() >= label) { + // System.out.println(" found!"); + return arc; + } else if (arc.isLast()) { + return null; + } else { + fst.readNextRealArc(arc, in); + } + } + } + /** * Perform a binary search of Arcs encoded as a packed array * @@ -936,8 +1019,7 @@ static int binarySearch(FST fst, FST.Arc arc, int targetLabel) throws } /** Same as {@link Util#binarySearch(FST, Arc, int)} but for {@link PrimitiveLongFST} */ - static int binarySearch( - PrimitiveLongFST fst, PrimitiveLongFST.PrimitiveLongArc arc, int targetLabel) + static int binarySearch(PrimitiveLongFST fst, PrimitiveLongArc arc, int targetLabel) throws IOException { assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH : "Arc is not encoded as packed array for binary search (nodeFlags=" From 92392471aebd0ea14318f5ce23cde996da47e533 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Tue, 28 Nov 2023 22:15:00 -0800 Subject: [PATCH 54/57] Lazy decode termstate in IntersectEnum --- .../sandbox/codecs/lucene99/randomaccess/TermsImpl.java | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java index 9ddb4a9c8d77..29567d83c8af 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -334,7 +334,6 @@ && isValid(topFrame()) public BytesRef next() throws IOException { if (pending) { pending = false; - updateTermStateIfNeeded(); return term(); } isTermStateCurrent = false; @@ -361,9 +360,6 @@ public BytesRef next() throws IOException { } return null; } - if (term != null) { - updateTermStateIfNeeded(); - } return term(); } From 05743d910ef7437b06de5bf03b85f9b8f2859c5b 
Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Fri, 8 Dec 2023 23:58:17 -0800 Subject: [PATCH 55/57] Minor non-functionarly change for TermsIndexBuilder --- .../codecs/lucene99/randomaccess/TermsIndexBuilder.java | 2 +- .../lucene99/randomaccess/TestTermsIndexBuilder.java | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java index 35dd42e81cd5..68bf66a3cbec 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsIndexBuilder.java @@ -52,7 +52,7 @@ public TermsIndex build() throws IOException { return new TermsIndex(fstCompiler.compile()); } - private long encode(long ord, TermType termType) { + static long encode(long ord, TermType termType) { // use a single long to encode `ord` and `termType` // also consider the special value of `PositiveIntOutputs.NO_OUTPUT == 0` // so it looks like this |... ord ...| termType| ... 
hasOutput ...| diff --git a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java index 9528dcd69b0d..1dad8688fc41 100644 --- a/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java +++ b/lucene/codecs/src/test/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TestTermsIndexBuilder.java @@ -29,7 +29,7 @@ public class TestTermsIndexBuilder extends LuceneTestCase { public void testBasics() throws IOException { - String[] test_terms = { + String[] termTerms = { "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", }; @@ -37,7 +37,7 @@ public void testBasics() throws IOException { Map termsToOrd = new HashMap<>(); Map typeCounters = new HashMap<>(); - for (String term : test_terms) { + for (String term : termTerms) { int termType = random().nextInt(TermType.NUM_TOTAL_TYPES); termsToType.put(term, termType); int ord = typeCounters.getOrDefault(termType, -1) + 1; @@ -46,7 +46,7 @@ public void testBasics() throws IOException { } TermsIndexBuilder builder = new TermsIndexBuilder(); - for (String term : test_terms) { + for (String term : termTerms) { BytesRef termBytes = new BytesRef(term); builder.addTerm(termBytes, TermType.fromId(termsToType.get(term))); } @@ -63,7 +63,7 @@ public void testBasics() throws IOException { TermsIndexPrimitive.deserialize( new ByteArrayDataInput(metaBytes), new ByteArrayDataInput(dataBytes), false); - for (String term : test_terms) { + for (String term : termTerms) { BytesRef termBytes = new BytesRef(term); TermsIndex.TypeAndOrd typeAndOrd = termsIndexPrimitive.getTerm(termBytes); From 93ed998638f2097076a36ee8ef58f08aca5c15b9 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Sat, 9 Dec 2023 00:00:37 -0800 Subject: [PATCH 56/57] implement FST + FSA intersection that leverages fast addressing of arc/transitions FST nodes have differetn 
variant. For non-variable length encoded node we can more efficiently lookup for a target label. Similarly, for FSAs the TransitionAccessor allows access to a list of [min, max] ranges in order, on which we can perform binary-search to advance to applicable transitions for a given target --- .../lucene99/randomaccess/TermsImpl.java | 283 +------------ .../util/automaton/NFARunAutomaton.java | 1 + .../lucene/util/fst/PrimitiveLongFSTEnum.java | 2 - .../fst/PrimitiveLongFSTIntersectEnum.java | 374 ++++++++++++++++++ .../TestPrimitiveLongFSTIntersectEnum.java | 309 +++++++++++++++ 5 files changed, 700 insertions(+), 269 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTIntersectEnum.java create mode 100644 lucene/core/src/test/org/apache/lucene/util/fst/TestPrimitiveLongFSTIntersectEnum.java diff --git a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java index 29567d83c8af..aebcea20856c 100644 --- a/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java +++ b/lucene/codecs/src/java/org/apache/lucene/sandbox/codecs/lucene99/randomaccess/TermsImpl.java @@ -27,19 +27,13 @@ import org.apache.lucene.index.TermState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefBuilder; -import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.automaton.ByteRunnable; import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.TransitionAccessor; import org.apache.lucene.util.fst.BytesRefPrimitiveLongFSTEnum; -import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PrimitiveLongFST; -import 
org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongArc; -import org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongFSTOutputs; -import org.apache.lucene.util.fst.Util; +import org.apache.lucene.util.fst.PrimitiveLongFSTIntersectEnum; final class TermsImpl extends Terms { private final FieldInfo fieldInfo; @@ -260,211 +254,42 @@ public void seekExact(long ord) throws IOException { final class RandomAccessIntersectTermsEnum extends TermsEnum { private AttributeSource attrs; - private BytesRefBuilder term; - private boolean isTermStateCurrent; private IntBlockTermState termState; - private final PrimitiveLongFST fst; - - private final FST.BytesReader fstReader; - - private final ByteRunnable fsa; + private BytesRef term; - private PrimitiveLongFSTOutputs fstOutputs = PrimitiveLongFSTOutputs.getSingleton(); + private final PrimitiveLongFST fst; private final TermDataReaderProvider.TermDataReader termDataReader; - private Frame[] stack; - - private int level; - - private boolean pending; - - private final class Frame { - /* fst stats */ - PrimitiveLongArc fstArc; - long output; - /* automaton stats */ - int fsaState; - - Frame() { - this.fstArc = new PrimitiveLongArc(); - this.fsaState = -1; - } + private final PrimitiveLongFSTIntersectEnum fstFsaIntersectEnum; - @Override - public String toString() { - return "arc=" + fstArc + " state=" + fsaState; - } - } - - /** - * Inspired by {@link org.apache.lucene.codecs.memory.FSTTermsReader}'s IntersectTermsEnum - */ RandomAccessIntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + TransitionAccessor transitionAccessor = compiled.getTransitionAccessor(); + // assert transitionAccessor.getNumTransitions(0) == 1; termState = (IntBlockTermState) lucene99PostingsReader.newTermState(); fst = termsDict.termsIndex().primitiveLongFST(); - fstReader = fst.getBytesReader(); - fsa = compiled.getByteRunnable(); termDataReader = termsDict.termDataReaderProvider().newReader(); - - stack = new 
Frame[16]; - for (int i = 0; i < stack.length; i++) { - this.stack[i] = new Frame(); - } - loadVirtualFrame(newFrame()); - level = 0; - - pushFrame(loadFirstFrame(newFrame())); - if (startTerm == null) { - pending = isAccept(topFrame()); - } else { - doSeekCeil(startTerm); - pending = - (term == null || !startTerm.equals(term.get())) - && isValid(topFrame()) - && isAccept(topFrame()); - } + fstFsaIntersectEnum = new PrimitiveLongFSTIntersectEnum(fst, compiled, startTerm); } @Override public BytesRef next() throws IOException { - if (pending) { - pending = false; - return term(); - } - isTermStateCurrent = false; - DFS: - while (level > 0) { - Frame frame = newFrame(); - if (loadExpandFrame(topFrame(), frame) != null) { // has valid target - pushFrame(frame); - if (isAccept(frame)) { // gotcha - break; - } - continue; // check next target - } - frame = popFrame(); - while (level > 0) { - if (loadNextFrame(topFrame(), frame) != null) { // has valid sibling - pushFrame(frame); - if (isAccept(frame)) { // gotcha - break DFS; - } - continue DFS; // check next target - } - frame = popFrame(); - } - return null; - } - return term(); - } - - private long accumulateOutput() { - long output = 0; - int upto = 0; - Frame last, next; - last = stack[1]; - while (upto != level) { - upto++; - next = stack[upto]; - output = fstOutputs.add(next.output, output); - last = next; - } - if (last.fstArc.isFinal()) { - output = fstOutputs.add(output, last.fstArc.nextFinalOutput()); - } - return output; - } - - private BytesRef doSeekCeil(BytesRef target) throws IOException { - Frame frame = null; - int label, upto = 0, limit = target.length; - while (upto < limit) { // to target prefix, or ceil label (rewind prefix) - frame = newFrame(); - label = target.bytes[target.offset + upto] & 0xff; - frame = loadCeilFrame(label, topFrame(), frame); - if (frame == null || frame.fstArc.label() != label) { - break; - } - assert isValid(frame); // target must be fetched from automaton - 
pushFrame(frame); - upto++; - } - if (upto == limit) { // got target - return term(); - } - if (frame != null) { // got larger term('s prefix) - pushFrame(frame); - return isAccept(frame) ? term() : next(); - } - while (level > 0) { // got target's prefix, advance to larger term - frame = popFrame(); - while (level > 0 && !canRewind(frame)) { - frame = popFrame(); - } - if (loadNextFrame(topFrame(), frame) != null) { - pushFrame(frame); - return isAccept(frame) ? term() : next(); - } - } - return null; - } - - /** Load frame for target arc(node) on fst */ - Frame loadExpandFrame(Frame top, Frame frame) throws IOException { - if (!canGrow(top)) { - return null; - } - frame.fstArc = fst.readFirstRealTargetArc(top.fstArc.target(), frame.fstArc, fstReader); - frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label()); - // if (TEST) System.out.println(" loadExpand frame="+frame); - if (frame.fsaState == -1) { - return loadNextFrame(top, frame); - } - frame.output = frame.fstArc.output(); - return frame; - } - - Frame loadCeilFrame(int label, Frame top, Frame frame) throws IOException { - PrimitiveLongArc arc = frame.fstArc; - arc = Util.readCeilArc(label, fst, top.fstArc, arc, fstReader); - if (arc == null) { - return null; - } - frame.fsaState = fsa.step(top.fsaState, arc.label()); - if (frame.fsaState == -1) { - return loadNextFrame(top, frame); - } - frame.output = frame.fstArc.output(); - return frame; - } - - /** Load frame for sibling arc(node) on fst */ - Frame loadNextFrame(Frame top, Frame frame) throws IOException { - if (!canRewind(frame)) { - return null; - } - while (!frame.fstArc.isLast()) { - frame.fstArc = fst.readNextRealArc(frame.fstArc, fstReader); - frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label()); - if (frame.fsaState != -1) { - break; - } - } - if (frame.fsaState == -1) { - return null; + if (fstFsaIntersectEnum.next()) { + term = fstFsaIntersectEnum.getTerm(); + isTermStateCurrent = false; + } else { + term = null; } - 
frame.output = frame.fstArc.output(); - return frame; + return term; } void updateTermStateIfNeeded() throws IOException { if (!isTermStateCurrent) { - long fstOutput = accumulateOutput(); + long fstOutput = fstFsaIntersectEnum.getFSTOutput(); TermsIndex.TypeAndOrd typeAndOrd = TermsIndex.decodeLong(fstOutput); termState = termDataReader.getTermState( @@ -488,7 +313,7 @@ public boolean seekExact(BytesRef text) throws IOException { @Override public BytesRef term() throws IOException { - return term == null ? null : term.get(); + return term; } @Override @@ -521,82 +346,6 @@ public TermState termState() throws IOException { return termState.clone(); } - /** Virtual frame, never pop */ - Frame loadVirtualFrame(Frame frame) { - frame.output = fstOutputs.getNoOutput(); - frame.fsaState = -1; - return frame; - } - - Frame newFrame() { - if (level + 1 == stack.length) { - final Frame[] temp = - new Frame[ArrayUtil.oversize(level + 2, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; - System.arraycopy(stack, 0, temp, 0, stack.length); - for (int i = stack.length; i < temp.length; i++) { - temp[i] = new Frame(); - } - stack = temp; - } - return stack[level + 1]; - } - - Frame topFrame() { - return stack[level]; - } - - boolean isAccept(Frame frame) { // reach a term both fst&fsa accepts - return fsa.isAccept(frame.fsaState) && frame.fstArc.isFinal(); - } - - boolean isValid(Frame frame) { // reach a prefix both fst&fsa won't reject - return /*frame != null &&*/ frame.fsaState != -1; - } - - boolean canGrow(Frame frame) { // can walk forward on both fst&fsa - return frame.fsaState != -1 && PrimitiveLongFST.targetHasArcs(frame.fstArc); - } - - boolean canRewind(Frame frame) { // can jump to sibling - return !frame.fstArc.isLast(); - } - - void pushFrame(Frame frame) { - term = grow(frame.fstArc.label()); - level++; - } - - Frame popFrame() { - term = shrink(); - level--; - return stack[level + 1]; - } - - Frame loadFirstFrame(Frame frame) { - frame.fstArc = 
fst.getFirstArc(frame.fstArc); - frame.output = frame.fstArc.output(); - frame.fsaState = 0; - return frame; - } - - BytesRefBuilder grow(int label) { - if (term == null) { - term = new BytesRefBuilder(); - } else { - term.append((byte) label); - } - return term; - } - - BytesRefBuilder shrink() { - if (term.length() == 0) { - term = null; - } else { - term.setLength(term.length() - 1); - } - return term; - } - @Override public long ord() throws IOException { throw new UnsupportedOperationException(); diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java index 6ff52baebbc5..761cf9b77035 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java @@ -228,6 +228,7 @@ public void getTransition(int state, int index, Transition t) { } else { t.max = points[t.transitionUpto + 1] - 1; } + t.dest = dStates[t.source].transitions[t.transitionUpto]; } private class DState { diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTEnum.java b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTEnum.java index 85c0815f964a..b2fa07b23617 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTEnum.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTEnum.java @@ -689,7 +689,6 @@ private void incr() { upto++; grow(); if (arcs.length <= upto) { - @SuppressWarnings({"rawtypes", "unchecked"}) final PrimitiveLongArc[] newArcs = new PrimitiveLongArc [ArrayUtil.oversize(1 + upto, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; @@ -697,7 +696,6 @@ private void incr() { arcs = newArcs; } if (output.length <= upto) { - @SuppressWarnings({"rawtypes", "unchecked"}) final long[] newOutput = new long[ArrayUtil.oversize(1 + upto, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; System.arraycopy(output, 0, newOutput, 
0, output.length); diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTIntersectEnum.java b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTIntersectEnum.java new file mode 100644 index 000000000000..fb4bf16775fd --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTIntersectEnum.java @@ -0,0 +1,374 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util.fst; + +import java.io.IOException; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.automaton.ByteRunnable; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.Transition; +import org.apache.lucene.util.automaton.TransitionAccessor; +import org.apache.lucene.util.fst.PrimitiveLongFST.PrimitiveLongArc; + +/** + * Can next() through the terms defined by the intersection of a {@link PrimitiveLongFST} + * + *

and {@link org.apache.lucene.util.automaton.CompiledAutomaton}. + * + *

Note: this can only seek forward. + * + * @lucene.experimental + */ +public final class PrimitiveLongFSTIntersectEnum { + + private final PrimitiveLongFST fst; + + private final FST.BytesReader fstBytesReader; + + private final ByteRunnable byteRunnable; + + private final TransitionAccessor transitionAccessor; + + /** DFS traversal states */ + private int currentLevel; + + private Frame[] stack; + + private BytesRefBuilder term = new BytesRefBuilder(); + + private long fstOutput; + + boolean pending; + + boolean isEmptyValidOutput; + + public PrimitiveLongFSTIntersectEnum( + PrimitiveLongFST fst, CompiledAutomaton automaton, BytesRef startTerm) throws IOException { + this.fst = fst; + this.fstBytesReader = fst.getBytesReader(); + this.byteRunnable = automaton.getByteRunnable(); + this.transitionAccessor = automaton.getTransitionAccessor(); + this.stack = new Frame[16]; + + var firstFrame = new Frame(); + firstFrame.fstNode = new PrimitiveLongArc(); + fst.getFirstArc(firstFrame.fstNode); + firstFrame.fsaState = 0; + stack[0] = firstFrame; + + if (startTerm != null) { + seekToStartTerm(startTerm); + } else { + isEmptyValidOutput = isAccept(firstFrame.fstNode, firstFrame.fsaState); + } + } + + public boolean next() throws IOException { + if (isEmptyValidOutput) { + fstOutput = fst.getEmptyOutput(); + isEmptyValidOutput = false; + return true; + } + while (currentLevel >= 0) { + Frame currentFrame = stack[currentLevel]; + + if (hasDescendants(currentFrame.fstNode, currentFrame.fsaState) + || currentFrame.fstCandidateNode != null) { + // current frame has candidates + if (findNextIntersection(currentFrame)) { + term.grow(currentLevel + 1); + term.setByteAt(currentLevel, (byte) currentFrame.fstCandidateNode.label()); + term.setLength(currentLevel + 1); + // early prune - only push a new frame when the candidate has descendants + if (hasDescendants(currentFrame.fstCandidateNode, currentFrame.fsaTransition.dest)) { + Frame nextFrame = new Frame(); + nextFrame.fstNode = 
currentFrame.fstCandidateNode; + nextFrame.fsaState = currentFrame.fsaTransition.dest; + nextFrame.output = currentFrame.output + currentFrame.fstNode.output(); + ensureStackCapacity(); + stack[++currentLevel] = nextFrame; + } + // setup output + if (isAccept(currentFrame.fstCandidateNode, currentFrame.fsaTransition.dest)) { + fstOutput = + currentFrame.output // output before this node + + currentFrame.fstNode.output() // output of this node + // then output of the candidate + + currentFrame.fstCandidateNode.output() + + currentFrame.fstCandidateNode.nextFinalOutput(); + return true; + } + } else { + // no more intersection at this frame, pop frame + popFrame(); + } + } else { + // pop frame as the frame has no candidates + popFrame(); + } + } + return false; + } + + private void ensureStackCapacity() { + stack = ArrayUtil.grow(stack, currentLevel + 2); + } + + private void seekToStartTerm(BytesRef startTerm) throws IOException { + int length = startTerm.length; + + while (currentLevel < length) { + Frame currentFrame = stack[currentLevel]; + int target = startTerm.bytes[startTerm.offset + currentLevel] & 0xff; + + if (hasDescendants(currentFrame.fstNode, currentFrame.fsaState)) { + initArcAndTransition(currentFrame, false); + fstAdvanceCeil(target, currentFrame.fstCandidateNode); + fsaAdvanceCeil(currentFrame, target); + + if (currentFrame.fstCandidateNode.label() == target + && (currentFrame.fsaTransition.min <= target + && target <= currentFrame.fsaTransition.max)) { + term.append((byte) target); + Frame nextFrame = new Frame(); + nextFrame.fstNode = currentFrame.fstCandidateNode; + nextFrame.fsaState = currentFrame.fsaTransition.dest; + nextFrame.output = currentFrame.output + currentFrame.fstNode.output(); + ensureStackCapacity(); + stack[++currentLevel] = nextFrame; + continue; + } + + if (currentFrame.fstCandidateNode.label() > target + || currentFrame.fsaTransition.min > target) { + pending = true; + } + break; + } else { + // all prefix upto this level is 
match, but the term to seek is longer + break; + } + } + } + + private void popFrame() { + currentLevel--; + term.setLength(currentLevel); + } + + private boolean isAccept(PrimitiveLongArc fstNode, int fsaState) { + return byteRunnable.isAccept(fsaState) && fstNode.isFinal(); + } + + private boolean hasDescendants(PrimitiveLongArc fstNode, int fsaState) { + return transitionAccessor.getNumTransitions(fsaState) > 0 + && PrimitiveLongFST.targetHasArcs(fstNode); + } + + private void initArcAndTransition(Frame frame, boolean advanceToFirstTransition) + throws IOException { + frame.fstCandidateNode = new PrimitiveLongArc(); + fst.readFirstRealTargetArc(frame.fstNode.target(), frame.fstCandidateNode, fstBytesReader); + + frame.fsaTransition = new Transition(); + frame.numTransitions = transitionAccessor.initTransition(frame.fsaState, frame.fsaTransition); + if (advanceToFirstTransition) { + transitionAccessor.getNextTransition(frame.fsaTransition); + frame.transitionUpto++; + } + } + + private boolean findNextIntersection(Frame frame) throws IOException { + if (frame.fstCandidateNode == null) { + // when called first time, init first FST arc and the FSA transition + initArcAndTransition(frame, true); + } else if (pending) { + pending = false; + } else { + // subsequent call, which implies we previously found an intersection. + // we need to advance the FST to avoid returning the same state. 
+ // Advance FST not the FSA because FST arc has a single label, + // where FSA transition may accept a range of lables + if (frame.fstCandidateNode.isLast()) { + return false; + } + frame.fstCandidateNode = fst.readNextRealArc(frame.fstCandidateNode, fstBytesReader); + } + + while (true) { + if (frame.fstCandidateNode.label() < frame.fsaTransition.min) { + // advance FST + if (frame.fstCandidateNode.isLast()) { + // no more eligible FST arc at this level + return false; + } + // TODO: advance to first arc that has label >= fsaTransition.min + // frame.fstCandidateNode = + // fst.readNextRealArc(frame.fstCandidateNode, fstBytesReader); + if (fstAdvanceCeil(frame.fsaTransition.min, frame.fstCandidateNode) == false) { + return false; + } + } else if (frame.fstCandidateNode.label() > frame.fsaTransition.max) { + // advance FSA + if (frame.transitionUpto == frame.numTransitions) { + // no more eligible FSA transitions at this level + return false; + } + // TODO: advance FSA with binary search to fstNode.label() + // transitionAccessor.getNextTransition(frame.fsaTransition); + // frame.transitionUpto++; + fsaAdvanceCeil(frame, frame.fstCandidateNode.label()); + } else { + // can go deeper + return true; + } + } + } + + public BytesRef getTerm() { + return term.get(); + } + + public long getFSTOutput() { + return fstOutput; + } + + /** + * Advance to the arc whose label is greater or equal to the provided target. + * + * @return true, if found. 
+ */ + private boolean fstAdvanceCeil(int target, PrimitiveLongArc /* mutates */ arc) + throws IOException { + if (arc.bytesPerArc() != 0 && arc.label() != PrimitiveLongFST.END_LABEL) { + if (arc.nodeFlags() == PrimitiveLongFST.ARCS_FOR_CONTINUOUS) { + int targetIndex = target - arc.label() + arc.arcIdx(); + if (targetIndex < 0) { + return false; + } else if (targetIndex >= arc.numArcs()) { + fst.readArcByContinuous(arc, fstBytesReader, arc.numArcs() - 1); + return false; + } else { + fst.readArcByContinuous(arc, fstBytesReader, targetIndex); + return true; + } + } else if (arc.nodeFlags() == PrimitiveLongFST.ARCS_FOR_DIRECT_ADDRESSING) { + // Fixed length arcs in a direct addressing node. + int targetIndex = target - arc.label() + arc.arcIdx(); + if (targetIndex >= arc.numArcs() || targetIndex < 0) { + return false; + } else if (targetIndex >= arc.numArcs()) { + fst.readArcByDirectAddressing(arc, fstBytesReader, arc.numArcs() - 1); + return false; + } else { + if (PrimitiveLongArc.BitTable.isBitSet(targetIndex, arc, fstBytesReader)) { + fst.readArcByDirectAddressing(arc, fstBytesReader, targetIndex); + } else { + int ceilIndex = PrimitiveLongArc.BitTable.nextBitSet(targetIndex, arc, fstBytesReader); + if (ceilIndex == -1) { + return false; + } + fst.readArcByDirectAddressing(arc, fstBytesReader, ceilIndex); + } + return true; + } + } + // Fixed length arcs in a binary search node. + int idx = Util.binarySearch(fst, arc, target); + if (idx >= 0) { + fst.readArcByIndex(arc, fstBytesReader, idx); + return true; + } + idx = -1 - idx; + if (idx == arc.numArcs()) { + fst.readArcByIndex(arc, fstBytesReader, arc.numArcs() - 1); + // DEAD END! + return false; + } + fst.readArcByIndex(arc, fstBytesReader, idx); + return true; + } + + // Variable length arcs in a linear scan list, + // or special arc with label == FST.END_LABEL. 
+ while (true) { + if (arc.label() >= target) { + return true; + } else if (arc.isLast()) { + return false; + } else { + fst.readNextRealArc(arc, fstBytesReader); + } + } + } + + private void fsaAdvanceCeil(Frame frame, int target) { + int low = frame.transitionUpto; + int high = frame.numTransitions; + Transition t = frame.fsaTransition; + + // invariant: target is between the min of [low, high) + int mid = 0; + while (high - low > 1) { + mid = (high + low) >>> 1; + transitionAccessor.getTransition(frame.fsaState, mid, t); + if (t.min > target) { + high = mid; + } else if (t.min < target) { + low = mid; + } else { + frame.transitionUpto = mid + 1; + return; + } + } + transitionAccessor.getTransition(frame.fsaState, low, t); + frame.transitionUpto = low + 1; + } + + private boolean fsaAdvanceCeilSlow(Frame frame, int target) { + while (frame.transitionUpto < frame.numTransitions) { + transitionAccessor.getNextTransition(frame.fsaTransition); + frame.transitionUpto++; + if (target <= frame.fsaTransition.max) { + return frame.fsaTransition.min <= target; + } + } + return false; + } + + /** + * We will maintain the state of conventional recursive DFS traversal algorithm, which is stack of + * frames. This class capture the state at each level. + */ + static final class Frame { + PrimitiveLongArc fstNode; + + PrimitiveLongArc fstCandidateNode; + + int fsaState; + + long output; + + Transition fsaTransition; + + int transitionUpto; + + int numTransitions; + } +} diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestPrimitiveLongFSTIntersectEnum.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestPrimitiveLongFSTIntersectEnum.java new file mode 100644 index 000000000000..a07e7bfae5e6 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestPrimitiveLongFSTIntersectEnum.java @@ -0,0 +1,309 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.util.fst; + +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRefBuilder; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.ByteRunnable; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.RegExp; +import org.apache.lucene.util.automaton.Transition; +import org.apache.lucene.util.automaton.TransitionAccessor; + +public class TestPrimitiveLongFSTIntersectEnum extends LuceneTestCase { + + public void testBasics() throws IOException { + String[] testTerms = { + "!", "*", "+", "++", "+++b", "++c", "a", "b", "bb", "dd", + }; + + HashMap termOutputs = new HashMap<>(); + + IntsRefBuilder scratchInts = new IntsRefBuilder(); + FSTCompiler fstCompiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton()).build(); + + for (var term : testTerms) { + long output = random().nextLong(1, 1024); + 
termOutputs.put(term, output); + fstCompiler.add(Util.toIntsRef(new BytesRef(term), scratchInts), output); + // System.out.println(term + ": " + output); + } + + var boxedFst = fstCompiler.compile(); + + byte[] metaBytes = new byte[4096]; + byte[] dataBytes = new byte[4096]; + DataOutput metaOut = new ByteArrayDataOutput(metaBytes); + DataOutput dataOutput = new ByteArrayDataOutput(dataBytes); + + boxedFst.save(metaOut, dataOutput); + + PrimitiveLongFST primitiveLongFst = + new PrimitiveLongFST( + PrimitiveLongFST.readMetadata( + new ByteArrayDataInput(metaBytes), + PrimitiveLongFST.PrimitiveLongFSTOutputs.getSingleton()), + new ByteArrayDataInput(dataBytes)); + + // RegExp regExp = new RegExp("a([a-f]|[j-z])c", RegExp.NONE); + RegExp regExp = new RegExp("+*.", RegExp.NONE); + Automaton a = regExp.toAutomaton(); + CompiledAutomaton compiledAutomaton = new CompiledAutomaton(a); + + var byteRunnable = compiledAutomaton.getByteRunnable(); + var transitionAccessor = compiledAutomaton.getTransitionAccessor(); + // dfsAutomaton(byteRunnable, transitionAccessor, 0, ""); + + PrimitiveLongFST.PrimitiveLongArc firstArc = new PrimitiveLongFST.PrimitiveLongArc(); + System.out.println("---- recursive algo ----"); + dfsIntersectFsaFst( + primitiveLongFst, + primitiveLongFst.getBytesReader(), + primitiveLongFst.getFirstArc(firstArc), + "", + 0, + byteRunnable, + transitionAccessor, + 0); + + System.out.println("---- non-recursive algo ----"); + var intersectEnum = + new PrimitiveLongFSTIntersectEnum(primitiveLongFst, compiledAutomaton, null); + while (intersectEnum.next()) { + String term = intersectEnum.getTerm().utf8ToString(); + long actualOutput = intersectEnum.getFSTOutput(); + System.out.println( + term + " expected output:" + termOutputs.get(term) + " actual: " + actualOutput); + } + } + + void dfs( + PrimitiveLongFST fst, + FST.BytesReader in, + PrimitiveLongFST.PrimitiveLongArc currentLevelNode, + String path, + long acc) + throws IOException { + if 
(currentLevelNode.isFinal()) { + long output = acc + currentLevelNode.output() + currentLevelNode.nextFinalOutput(); + System.out.println(path + (char) currentLevelNode.label() + "raw output: " + output); + } + + if (PrimitiveLongFST.targetHasArcs(currentLevelNode)) { + String pathNext = + currentLevelNode.label() > 0 ? path + (char) currentLevelNode.label() : path; + long accNext = currentLevelNode.label() > 0 ? acc + currentLevelNode.output() : acc; + var nextLevelNode = new PrimitiveLongFST.PrimitiveLongArc(); + fst.readFirstRealTargetArc(currentLevelNode.target(), nextLevelNode, in); + dfs(fst, in, nextLevelNode, pathNext, accNext); + } + + if (currentLevelNode.isLast() == false) { + fst.readNextRealArc(currentLevelNode, in); + dfs(fst, in, currentLevelNode, path, acc); + } + } + + public void testAutomaton() { + RegExp regExp = new RegExp("+*.", RegExp.NONE); + Automaton a = regExp.toAutomaton(); + CompiledAutomaton compiledAutomaton = new CompiledAutomaton(a); + System.out.println("isFinite: " + compiledAutomaton.finite); + + var byteRunnable = compiledAutomaton.getByteRunnable(); + var transitionAccessor = compiledAutomaton.getTransitionAccessor(); + // dfsAutomaton(byteRunnable, transitionAccessor, 0, ""); + // dumpTransitionsViaNext(byteRunnable, transitionAccessor, 0, new HashSet<>()); + dumpTransitionsViaRA(byteRunnable, transitionAccessor, 0, new HashSet<>()); + } + + void dfsAutomaton( + ByteRunnable a, TransitionAccessor transitionAccessor, int currentLevelState, String path) { + if (a.isAccept(currentLevelState)) { + if (path.length() > 50) { + throw new RuntimeException(); + } + System.out.println("found: " + path); + } + + int currentLevelSize = transitionAccessor.getNumTransitions(currentLevelState); + for (int i = 0; i < currentLevelSize; i++) { + Transition t = new Transition(); + transitionAccessor.getNextTransition(t); + System.out.println( + "At: src: " + + t.source + + " [" + + t.min + + ", " + + t.max + + "] " + + "dest: " + + t.dest + + " 
is dest accept: " + + (a.isAccept(t.dest) ? "yes" : "no")); + for (int label = t.min; label <= t.max; label++) { + dfsAutomaton(a, transitionAccessor, t.dest, path + " " + label); + } + } + } + + void dumpTransitionsViaNext( + ByteRunnable a, + TransitionAccessor transitionAccessor, + int currentState, + Set seenStates) { + if (seenStates.contains(currentState)) { + return; + } + + seenStates.add(currentState); + + var t = new Transition(); + var numStates = transitionAccessor.initTransition(currentState, t); + + for (int i = 0; i < numStates; i++) { + transitionAccessor.getNextTransition(t); + System.out.println( + "At: src: " + + t.source + + " arcIdx: " + + i + + " [" + + t.min + + ", " + + t.max + + "] " + + "dest: " + + t.dest + + " is dest accept: " + + (a.isAccept(t.dest) ? "yes" : "no")); + dumpTransitionsViaNext(a, transitionAccessor, t.dest, seenStates); + } + } + + void dumpTransitionsViaRA( + ByteRunnable a, + TransitionAccessor transitionAccessor, + int currentState, + Set seenStates) { + if (seenStates.contains(currentState)) { + return; + } + + seenStates.add(currentState); + + var t = new Transition(); + var numStates = transitionAccessor.initTransition(currentState, t); + + // transitionAccessor.getTransition(currentState, numStates - 1, t); + for (int i = 0; i < numStates; i++) { + transitionAccessor.getTransition(currentState, i, t); + System.out.println( + "At: src: " + + t.source + + " arcIdx: " + + i + + " [" + + t.min + + ", " + + t.max + + "] " + + "dest: " + + t.dest + + " is dest accept: " + + (a.isAccept(t.dest) ? 
"yes" : "no")); + dumpTransitionsViaRA(a, transitionAccessor, t.dest, seenStates); + } + } + + void dfsIntersectFsaFst( + PrimitiveLongFST fst, + FST.BytesReader in, + PrimitiveLongFST.PrimitiveLongArc fstNode, + String path, + long acc, + ByteRunnable a, + TransitionAccessor transitionAccessor, + int fsaState) + throws IOException { + + if (a.isAccept(fsaState) && fstNode.isFinal()) { + // found + System.out.println(path + ": " + (acc + fstNode.output() + fstNode.nextFinalOutput())); + } + + Transition fsaTransition = new Transition(); + int numTransitions = transitionAccessor.initTransition(fsaState, fsaTransition); + + if (numTransitions <= 0 || !PrimitiveLongFST.targetHasArcs(fstNode)) { + return; + } + + int transitionUpto = 0; + var nextLevelFstNode = new PrimitiveLongFST.PrimitiveLongArc(); + fst.readFirstRealTargetArc(fstNode.target(), nextLevelFstNode, in); + transitionAccessor.getNextTransition(fsaTransition); + transitionUpto++; + + while (true) { + if (nextLevelFstNode.label() < fsaTransition.min) { + // advance FST + if (nextLevelFstNode.isLast()) { + // no more eligible FST arc at this level + break; + } + // TODO: advance to first arc that has label >= fsaTransition.min + nextLevelFstNode = fst.readNextRealArc(nextLevelFstNode, in); + } else if (nextLevelFstNode.label() > fsaTransition.max) { + // advance FSA + if (transitionUpto == numTransitions) { + // no more eligible FSA transitions at this level + return; + } + // TODO: advance FSA with binary search to fstNode.label() + transitionAccessor.getNextTransition(fsaTransition); + transitionUpto++; + } else { + // can go deeper + String pathNext = path + (char) nextLevelFstNode.label(); + long accNext = acc + fstNode.output(); + int nextFsaState = fsaTransition.dest; + dfsIntersectFsaFst( + fst, in, nextLevelFstNode, pathNext, accNext, a, transitionAccessor, nextFsaState); + if (nextLevelFstNode.isLast()) { + // no more candidate at this prefix + return; + } else { + // TODO: advance to first arc 
that has label >= fsaTransition.min + nextLevelFstNode = fst.readNextRealArc(nextLevelFstNode, in); + } + } + } + } +} From c7e1568f791a375ffa1dfbccedddf16e53313185 Mon Sep 17 00:00:00 2001 From: Tony Xu Date: Mon, 11 Dec 2023 17:04:30 -0800 Subject: [PATCH 57/57] Reuse stack frames to avoid allocating too many Arc and Transitions --- .../fst/PrimitiveLongFSTIntersectEnum.java | 51 +++++++++++-------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTIntersectEnum.java b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTIntersectEnum.java index fb4bf16775fd..d7ca07581446 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTIntersectEnum.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/PrimitiveLongFSTIntersectEnum.java @@ -88,8 +88,7 @@ public boolean next() throws IOException { while (currentLevel >= 0) { Frame currentFrame = stack[currentLevel]; - if (hasDescendants(currentFrame.fstNode, currentFrame.fsaState) - || currentFrame.fstCandidateNode != null) { + if (!currentFrame.isFresh || hasDescendants(currentFrame.fstNode, currentFrame.fsaState)) { // current frame has candidates if (findNextIntersection(currentFrame)) { term.grow(currentLevel + 1); @@ -97,12 +96,7 @@ public boolean next() throws IOException { term.setLength(currentLevel + 1); // early prune - only push a new frame when the candidate has descendants if (hasDescendants(currentFrame.fstCandidateNode, currentFrame.fsaTransition.dest)) { - Frame nextFrame = new Frame(); - nextFrame.fstNode = currentFrame.fstCandidateNode; - nextFrame.fsaState = currentFrame.fsaTransition.dest; - nextFrame.output = currentFrame.output + currentFrame.fstNode.output(); - ensureStackCapacity(); - stack[++currentLevel] = nextFrame; + fillNextFrame(currentFrame); } // setup output if (isAccept(currentFrame.fstCandidateNode, currentFrame.fsaTransition.dest)) { @@ -137,8 +131,10 @@ private void 
seekToStartTerm(BytesRef startTerm) throws IOException { Frame currentFrame = stack[currentLevel]; int target = startTerm.bytes[startTerm.offset + currentLevel] & 0xff; - if (hasDescendants(currentFrame.fstNode, currentFrame.fsaState)) { + if (currentFrame.numTransitions > 0 + || hasDescendants(currentFrame.fstNode, currentFrame.fsaState)) { initArcAndTransition(currentFrame, false); + currentFrame.isFresh = false; fstAdvanceCeil(target, currentFrame.fstCandidateNode); fsaAdvanceCeil(currentFrame, target); @@ -146,12 +142,7 @@ private void seekToStartTerm(BytesRef startTerm) throws IOException { && (currentFrame.fsaTransition.min <= target && target <= currentFrame.fsaTransition.max)) { term.append((byte) target); - Frame nextFrame = new Frame(); - nextFrame.fstNode = currentFrame.fstCandidateNode; - nextFrame.fsaState = currentFrame.fsaTransition.dest; - nextFrame.output = currentFrame.output + currentFrame.fstNode.output(); - ensureStackCapacity(); - stack[++currentLevel] = nextFrame; + fillNextFrame(currentFrame); continue; } @@ -167,6 +158,23 @@ private void seekToStartTerm(BytesRef startTerm) throws IOException { } } + private void fillNextFrame(Frame currentFrame) { + ensureStackCapacity(); + Frame nextFrame; + // reuse previous allocations + if (stack[currentLevel + 1] == null) { + nextFrame = new Frame(); + } else { + nextFrame = stack[currentLevel + 1]; + nextFrame.numTransitions = 0; + nextFrame.isFresh = true; + } + nextFrame.fstNode = currentFrame.fstCandidateNode; + nextFrame.fsaState = currentFrame.fsaTransition.dest; + nextFrame.output = currentFrame.output + currentFrame.fstNode.output(); + stack[++currentLevel] = nextFrame; + } + private void popFrame() { currentLevel--; term.setLength(currentLevel); @@ -183,11 +191,9 @@ private boolean hasDescendants(PrimitiveLongArc fstNode, int fsaState) { private void initArcAndTransition(Frame frame, boolean advanceToFirstTransition) throws IOException { - frame.fstCandidateNode = new PrimitiveLongArc(); 
fst.readFirstRealTargetArc(frame.fstNode.target(), frame.fstCandidateNode, fstBytesReader); - - frame.fsaTransition = new Transition(); frame.numTransitions = transitionAccessor.initTransition(frame.fsaState, frame.fsaTransition); + frame.transitionUpto = 0; if (advanceToFirstTransition) { transitionAccessor.getNextTransition(frame.fsaTransition); frame.transitionUpto++; @@ -195,9 +201,10 @@ private void initArcAndTransition(Frame frame, boolean advanceToFirstTransition) } private boolean findNextIntersection(Frame frame) throws IOException { - if (frame.fstCandidateNode == null) { + if (frame.isFresh) { // when called first time, init first FST arc and the FSA transition initArcAndTransition(frame, true); + frame.isFresh = false; } else if (pending) { pending = false; } else { @@ -359,16 +366,18 @@ private boolean fsaAdvanceCeilSlow(Frame frame, int target) { static final class Frame { PrimitiveLongArc fstNode; - PrimitiveLongArc fstCandidateNode; + PrimitiveLongArc fstCandidateNode = new PrimitiveLongArc(); int fsaState; long output; - Transition fsaTransition; + Transition fsaTransition = new Transition(); int transitionUpto; int numTransitions; + + boolean isFresh = true; } }