apache · msfroh · Jan 29, 2024 · Jan 30, 2024 · Feb 12, 2024 · Feb 26, 2024
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java
@@ -448,7 +448,8 @@ private void parse() throws IOException {
   }
 
   // Interleaves all output tokens onto the futureOutputs:
-  private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset) {
+  private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset)
+      throws IOException {
     bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
 
     final int code = bytesReader.readVInt();

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymGraphFilter.java
@@ -439,7 +439,7 @@ private boolean parse() throws IOException {
    * Expands the output graph into the necessary tokens, adding synonyms as side paths parallel to
    * the input tokens, and buffers them in the output token buffer.
    */
-  private void bufferOutputTokens(BytesRef bytes, int matchInputLength) {
+  private void bufferOutputTokens(BytesRef bytes, int matchInputLength) throws IOException {
     bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
 
     final int code = bytesReader.readVInt();

diff --git a/...nalysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymGraphFilterFactory.java b/...nalysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymGraphFilterFactory.java
@@ -22,6 +22,7 @@
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
 import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
 import java.text.ParseException;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -85,6 +86,7 @@ public class SynonymGraphFilterFactory extends TokenFilterFactory implements Res
   private final boolean expand;
   private final String analyzerName;
   private final Map<String, String> tokArgs = new HashMap<>();
+  private final Path compiledSynonymsPath;
 
   private SynonymMap map;
 
@@ -113,6 +115,9 @@ public SynonymGraphFilterFactory(Map<String, String> args) {
         itr.remove();
       }
     }
+    String compiledSynonymsPathArg = get(args, "compiledSynonymsPath");
+    compiledSynonymsPath =
+        compiledSynonymsPathArg == null ? null : Path.of(compiledSynonymsPathArg);
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -168,29 +173,36 @@ protected TokenStreamComponents createComponents(String fieldName) {
   protected SynonymMap loadSynonyms(
       ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer)
       throws IOException, ParseException {
-    CharsetDecoder decoder =
-        StandardCharsets.UTF_8
-            .newDecoder()
-            .onMalformedInput(CodingErrorAction.REPORT)
-            .onUnmappableCharacter(CodingErrorAction.REPORT);
-
-    SynonymMap.Parser parser;
-    Class<? extends SynonymMap.Parser> clazz = loader.findClass(cname, SynonymMap.Parser.class);
-    try {
-      parser =
-          clazz
-              .getConstructor(boolean.class, boolean.class, Analyzer.class)
-              .newInstance(dedup, expand, analyzer);
-    } catch (Exception e) {
-      throw new RuntimeException(e);
+    SynonymMapDirectory compiledSynonymsDirectory = null;
+    if (compiledSynonymsPath != null) {
+      compiledSynonymsDirectory = new SynonymMapDirectory(compiledSynonymsPath);
     }
+    if (compiledSynonymsDirectory == null || compiledSynonymsDirectory.hasSynonyms() == false) {
+      CharsetDecoder decoder =
+          StandardCharsets.UTF_8
+              .newDecoder()
+              .onMalformedInput(CodingErrorAction.REPORT)
+              .onUnmappableCharacter(CodingErrorAction.REPORT);
+
+      SynonymMap.Parser parser;
+      Class<? extends SynonymMap.Parser> clazz = loader.findClass(cname, SynonymMap.Parser.class);
+      try {
+        parser =
+            clazz
+                .getConstructor(boolean.class, boolean.class, Analyzer.class)
+                .newInstance(dedup, expand, analyzer);
+      } catch (Exception e) {
+        throw new RuntimeException(e);
+      }
 
-    List<String> files = splitFileNames(synonyms);
-    for (String file : files) {
-      decoder.reset();
-      parser.parse(new InputStreamReader(loader.openResource(file), decoder));
+      List<String> files = splitFileNames(synonyms);
+      for (String file : files) {
+        decoder.reset();
+        parser.parse(new InputStreamReader(loader.openResource(file), decoder));
+      }
+      return parser.build(compiledSynonymsDirectory);
     }
-    return parser.build();
+    return compiledSynonymsDirectory.readMap();
   }
 
   // (there are no tests for this functionality)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java
@@ -29,6 +29,7 @@
 import org.apache.lucene.internal.hppc.IntArrayList;
 import org.apache.lucene.internal.hppc.IntHashSet;
 import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.BytesRefHash;
@@ -53,12 +54,24 @@ public class SynonymMap {
   public final FST<BytesRef> fst;
 
   /** map&lt;ord, outputword&gt; */
-  public final BytesRefHash words;
+  public final SynonymDictionary words;
 
   /** maxHorizontalContext: maximum context we need on the tokenstream */
   public final int maxHorizontalContext;
 
   public SynonymMap(FST<BytesRef> fst, BytesRefHash words, int maxHorizontalContext) {
+    this(
+        fst,
+        new SynonymDictionary() {
+          @Override
+          public void get(int id, BytesRef scratch) {
+            words.get(id, scratch);
+          }
+        },
+        maxHorizontalContext);
+  }
+
+  SynonymMap(FST<BytesRef> fst, SynonymDictionary words, int maxHorizontalContext) {
     this.fst = fst;
     this.words = words;
     this.maxHorizontalContext = maxHorizontalContext;
@@ -218,12 +231,26 @@ public void add(CharsRef input, CharsRef output, boolean includeOrig) {
       add(input, countWords(input), output, countWords(output), includeOrig);
     }
 
-    /** Builds an {@link SynonymMap} and returns it. */
+    /** Buils a {@link SynonymMap} and returns it. */
     public SynonymMap build() throws IOException {
+      return build(null);
+    }
+
+    /**
+     * Builds a {@link SynonymMap} and returns it. If directory is non-null, it will write the
+     * compiled SynonymMap to disk and return an off-heap version.
+     */
+    public SynonymMap build(SynonymMapDirectory directory) throws IOException {
       ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
       // TODO: are we using the best sharing options?
-      FSTCompiler<BytesRef> fstCompiler =
-          new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).build();
+      FSTCompiler.Builder<BytesRef> fstCompilerBuilder =
+          new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
+      IndexOutput fstOutput = null;
+      if (directory != null) {
+        fstOutput = directory.fstOutput();
+        fstCompilerBuilder.dataOutput(fstOutput);
+      }
+      FSTCompiler<BytesRef> fstCompiler = fstCompilerBuilder.build();
 
       BytesRefBuilder scratch = new BytesRefBuilder();
       ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
@@ -290,11 +317,28 @@ public SynonymMap build() throws IOException {
         fstCompiler.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef());
       }
 
-      FST<BytesRef> fst = FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader());
+      FST.FSTMetadata<BytesRef> fstMetaData = fstCompiler.compile();
+      if (directory != null) {
+        fstOutput.close(); // TODO -- Should fstCompiler.compile take care of this?
+        try (SynonymMapDirectory.WordsOutput wordsOutput = directory.wordsOutput()) {
+          BytesRef scratchRef = new BytesRef();
+          for (int i = 0; i < words.size(); i++) {
+            words.get(i, scratchRef);
+            wordsOutput.addWord(scratchRef);
+          }
+        }
+        directory.writeMetadata(words.size(), maxHorizontalContext, fstMetaData);
+        return directory.readMap();
+      }
+      FST<BytesRef> fst = FST.fromFSTReader(fstMetaData, fstCompiler.getFSTReader());
       return new SynonymMap(fst, words, maxHorizontalContext);
     }
   }
 
+  abstract static class SynonymDictionary {
+    public abstract void get(int id, BytesRef scratch) throws IOException;
+  }
+
   /**
    * Abstraction for parsing synonym files.
    *

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMapDirectory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMapDirectory.java
@@ -0,0 +1,193 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.synonym;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.fst.ByteSequenceOutputs;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.OffHeapFSTStore;
+
+/**
+ * Wraps an {@link FSDirectory} to read and write a compiled {@link SynonymMap}. When reading, the
+ * FST and output words are kept off-heap.
+ *
+ * @lucene.experimental
+ */
+public class SynonymMapDirectory implements Closeable {
+  private final SynonymMapFormat synonymMapFormat =
+      new SynonymMapFormat(); // TODO -- Should this be more flexible/codec-like? Less?
+  private final Directory directory;
+  private final List<Closeable> resources = new ArrayList<>();
+
+  public SynonymMapDirectory(Path path) throws IOException {
+    directory = FSDirectory.open(path);
+  }
+
+  IndexOutput fstOutput() throws IOException {
+    return synonymMapFormat.getFSTOutput(directory);
+  }
+
+  WordsOutput wordsOutput() throws IOException {
+    return synonymMapFormat.getWordsOutput(directory);
+  }
+
+  void writeMetadata(int wordCount, int maxHorizontalContext, FST.FSTMetadata<BytesRef> fstMetadata)
+      throws IOException {
+    synonymMapFormat.writeMetadata(
+        directory, new SynonymMetadata(wordCount, maxHorizontalContext, fstMetadata));
+  }
+
+  SynonymMap readMap() throws IOException {
+    CloseableSynonymMap closeableSynonymMap = synonymMapFormat.readSynonymMap(directory);
+    resources.add(closeableSynonymMap);
+    return closeableSynonymMap.map;
+  }
+
+  boolean hasSynonyms() throws IOException {
+    // TODO should take the path to the synonyms file to compare file hash against file used to
+    // build the directory
+    return directory.listAll().length > 0;
+  }
+
+  @Override
+  public void close() throws IOException {
+    for (Closeable c : resources) {
+      c.close();
+    }
+    directory.close();
+  }
+
+  /**
+   * Abstraction to support writing individual output words to the directory. Should be closed after
+   * the last word is written.
+   */
+  abstract static class WordsOutput implements Closeable {
+    public abstract void addWord(BytesRef word) throws IOException;
+  }
+
+  private record CloseableSynonymMap(SynonymMap map, IndexInput indexInput) implements Closeable {
+    @Override
+    public void close() throws IOException {
+      indexInput.close();
+    }
+  }
+
+  private record SynonymMetadata(
+      int wordCount, int maxHorizontalContext, FST.FSTMetadata<BytesRef> fstMetadata) {}
+
+  private static class SynonymMapFormat {
+    private static final String FST_FILE = "synonyms.fst";
+    private static final String WORDS_FILE = "synonyms.wrd";
+    private static final String METADATA_FILE = "synonyms.mdt";
+
+    private IndexOutput getFSTOutput(Directory directory) throws IOException {
+      return directory.createOutput(FST_FILE, IOContext.DEFAULT);
+    }
+
+    private WordsOutput getWordsOutput(Directory directory) throws IOException {
+      IndexOutput wordsOutput = directory.createOutput(WORDS_FILE, IOContext.DEFAULT);
+      return new WordsOutput() {
+        @Override
+        public void close() throws IOException {
+          wordsOutput.close();
+        }
+
+        @Override
+        public void addWord(BytesRef word) throws IOException {
+          wordsOutput.writeVInt(word.length);
+          wordsOutput.writeBytes(word.bytes, word.offset, word.length);
+        }
+      };
+    }
+    ;
+
+    private void writeMetadata(Directory directory, SynonymMetadata synonymMetadata)
+        throws IOException {
+      try (IndexOutput metadataOutput = directory.createOutput(METADATA_FILE, IOContext.DEFAULT)) {
+        metadataOutput.writeVInt(synonymMetadata.wordCount);
+        metadataOutput.writeVInt(synonymMetadata.maxHorizontalContext);
+        synonymMetadata.fstMetadata.save(metadataOutput);
+      }
+      directory.sync(List.of(FST_FILE, WORDS_FILE, METADATA_FILE));
+    }
+
+    private SynonymMetadata readMetadata(Directory directory) throws IOException {
+      try (IndexInput metadataInput = directory.openInput(METADATA_FILE, IOContext.READONCE)) {
+        int wordCount = metadataInput.readVInt();
+        int maxHorizontalContext = metadataInput.readVInt();
+        FST.FSTMetadata<BytesRef> fstMetadata =
+            FST.readMetadata(metadataInput, ByteSequenceOutputs.getSingleton());
+        return new SynonymMetadata(wordCount, maxHorizontalContext, fstMetadata);
+      }
+    }
+
+    private CloseableSynonymMap readSynonymMap(Directory directory) throws IOException {
+      SynonymMetadata synonymMetadata = readMetadata(directory);
+      IndexInput in = directory.openInput(FST_FILE, IOContext.DEFAULT);
+      FST<BytesRef> fst =
+          FST.fromFSTReader(
+              synonymMetadata.fstMetadata, new OffHeapFSTStore(in, 0, synonymMetadata.fstMetadata));
+      OnHeapSynonymDictionary words;
+      try (IndexInput wordsInput = directory.openInput(WORDS_FILE, IOContext.DEFAULT)) {
+        words = new OnHeapSynonymDictionary(synonymMetadata.wordCount, wordsInput);
+      }
+      SynonymMap map = new SynonymMap(fst, words, synonymMetadata.maxHorizontalContext);
+      return new CloseableSynonymMap(map, in);
+    }
+
+    private static class OnHeapSynonymDictionary extends SynonymMap.SynonymDictionary {
+      private final int[] bytesStartArray;
+      private final byte[] wordBytes;
+
+      private OnHeapSynonymDictionary(int wordCount, IndexInput wordsFile) throws IOException {
+        bytesStartArray = new int[wordCount + 1];
+        int pos = 0;
+        for (int i = 0; i < wordCount; i++) {
+          bytesStartArray[i] = pos;
+          int size = wordsFile.readVInt();
+          pos += size;
+          wordsFile.seek(wordsFile.getFilePointer() + size);
+        }
+        bytesStartArray[wordCount] = pos;
+        wordsFile.seek(0);
+        wordBytes = new byte[pos];
+        for (int i = 0; i < wordCount; i++) {
+          int size = wordsFile.readVInt();
+          wordsFile.readBytes(wordBytes, bytesStartArray[i], size);
+        }
+      }
+
+      @Override
+      public void get(int id, BytesRef scratch) {
+        scratch.bytes = wordBytes;
+        scratch.offset = bytesStartArray[id];
+        scratch.length = bytesStartArray[id + 1] - bytesStartArray[id];
+      }
+    }
+  }
+}