Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,8 @@ private void parse() throws IOException {
}

// Interleaves all output tokens onto the futureOutputs:
private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset) {
private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset)
throws IOException {
bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);

final int code = bytesReader.readVInt();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,7 @@ private boolean parse() throws IOException {
* Expands the output graph into the necessary tokens, adding synonyms as side paths parallel to
* the input tokens, and buffers them in the output token buffer.
*/
private void bufferOutputTokens(BytesRef bytes, int matchInputLength) {
private void bufferOutputTokens(BytesRef bytes, int matchInputLength) throws IOException {
bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);

final int code = bytesReader.readVInt();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.text.ParseException;
import java.util.HashMap;
import java.util.Iterator;
Expand Down Expand Up @@ -85,6 +86,7 @@ public class SynonymGraphFilterFactory extends TokenFilterFactory implements Res
private final boolean expand;
private final String analyzerName;
private final Map<String, String> tokArgs = new HashMap<>();
private final Path compiledSynonymsPath;

private SynonymMap map;

Expand Down Expand Up @@ -113,6 +115,9 @@ public SynonymGraphFilterFactory(Map<String, String> args) {
itr.remove();
}
}
String compiledSynonymsPathArg = get(args, "compiledSynonymsPath");
compiledSynonymsPath =
compiledSynonymsPathArg == null ? null : Path.of(compiledSynonymsPathArg);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
Expand Down Expand Up @@ -168,29 +173,36 @@ protected TokenStreamComponents createComponents(String fieldName) {
protected SynonymMap loadSynonyms(
ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer)
throws IOException, ParseException {
CharsetDecoder decoder =
StandardCharsets.UTF_8
.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);

SynonymMap.Parser parser;
Class<? extends SynonymMap.Parser> clazz = loader.findClass(cname, SynonymMap.Parser.class);
try {
parser =
clazz
.getConstructor(boolean.class, boolean.class, Analyzer.class)
.newInstance(dedup, expand, analyzer);
} catch (Exception e) {
throw new RuntimeException(e);
SynonymMapDirectory compiledSynonymsDirectory = null;
if (compiledSynonymsPath != null) {
compiledSynonymsDirectory = new SynonymMapDirectory(compiledSynonymsPath);
}
if (compiledSynonymsDirectory == null || compiledSynonymsDirectory.hasSynonyms() == false) {
CharsetDecoder decoder =
StandardCharsets.UTF_8
.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);

SynonymMap.Parser parser;
Class<? extends SynonymMap.Parser> clazz = loader.findClass(cname, SynonymMap.Parser.class);
try {
parser =
clazz
.getConstructor(boolean.class, boolean.class, Analyzer.class)
.newInstance(dedup, expand, analyzer);
} catch (Exception e) {
throw new RuntimeException(e);
}

List<String> files = splitFileNames(synonyms);
for (String file : files) {
decoder.reset();
parser.parse(new InputStreamReader(loader.openResource(file), decoder));
List<String> files = splitFileNames(synonyms);
for (String file : files) {
decoder.reset();
parser.parse(new InputStreamReader(loader.openResource(file), decoder));
}
return parser.build(compiledSynonymsDirectory);
}
return parser.build();
return compiledSynonymsDirectory.readMap();
}

// (there are no tests for this functionality)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.apache.lucene.internal.hppc.IntArrayList;
import org.apache.lucene.internal.hppc.IntHashSet;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefHash;
Expand All @@ -53,12 +54,24 @@ public class SynonymMap {
public final FST<BytesRef> fst;

/** map&lt;ord, outputword&gt; */
public final BytesRefHash words;
public final SynonymDictionary words;

/** maxHorizontalContext: maximum context we need on the tokenstream */
public final int maxHorizontalContext;

public SynonymMap(FST<BytesRef> fst, BytesRefHash words, int maxHorizontalContext) {
this(
fst,
new SynonymDictionary() {
@Override
public void get(int id, BytesRef scratch) {
words.get(id, scratch);
}
},
maxHorizontalContext);
}

SynonymMap(FST<BytesRef> fst, SynonymDictionary words, int maxHorizontalContext) {
this.fst = fst;
this.words = words;
this.maxHorizontalContext = maxHorizontalContext;
Expand Down Expand Up @@ -218,12 +231,26 @@ public void add(CharsRef input, CharsRef output, boolean includeOrig) {
add(input, countWords(input), output, countWords(output), includeOrig);
}

/** Builds an {@link SynonymMap} and returns it. */
/** Buils a {@link SynonymMap} and returns it. */
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, waaaaay up above, the javadoc for Builder, it mentions FSTSynonymMap twice -- can you fix those to SynonymMap instead? That must be holdover from ancient naming...

Also, it's a bit annoying that GH does not allow me to put comments on parts of the code you did not change :) I guess this is GH's appempt to keep me in "eyes on the prize" mode ... so I only comment on stuff changed in the PR.

public SynonymMap build() throws IOException {
return build(null);
}

/**
* Builds a {@link SynonymMap} and returns it. If directory is non-null, it will write the
* compiled SynonymMap to disk and return an off-heap version.
*/
public SynonymMap build(SynonymMapDirectory directory) throws IOException {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This ability to save a SynonymMap is new to your PR, right? We cannot save/load them today? So this is a nice new additional feature (in addition to off-heap option, and a nice side effect of it) in your PR?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When implementing the new build() method which accepts a directory path parameter for off-heap SynonymMap storage, I encountered FileAlreadyExistsException despite implementing a unique directory creation mechanism using System.currentTimeMillis().

Current implementation looks like something like this:

String synonymPath = ".../lucene/src/build/temp-FST"; 
Path dirPath = Path.of(synonymPath);
SynonymMap synonymMap;
        try {
            // Create directory if it doesn't exist
            if (!Files.exists(dirPath)) {
                Files.createDirectories(dirPath);
            }
            // Create a unique directory for this run
            Path uniqueDirPath = dirPath.resolve("synonyms_" + System.currentTimeMillis());
            Files.createDirectory(uniqueDirPath);
            synonymMap = builder.build(new SynonymMapDirectory(uniqueDirPath)); 
            ...

Error observed:

Caused by: java.nio.file.FileAlreadyExistsException: /temp-FST/synonyms_1745894622943
    at org.apache.lucene.analysis.FilteringTokenFilter.incrementToken\\n
    at org.apache.lucene.index.IndexingChain$PerField.invertTokenStream(IndexingChain.java:1205)\\n	 
    at org.apache.lucene.index.IndexingChain$PerField.invert(IndexingChain.java:1183)\\n	
    at org.apache.lucene.index.IndexingChain.processField(IndexingChain.java:735)\\n	
    at org.apache.lucene.index.IndexingChain.processDocument(IndexingChain.java:609)\\n	
    at org.apache.lucene.index.DocumentsWriterPerThread.updateDocuments(DocumentsWriterPerThread.java:263)\\n	
    at org.apache.lucene.index.DocumentsWriter.updateDocuments(DocumentsWriter.java:425)\\n	 
    at org.apache.lucene.index.IndexWriter.updateDocuments(IndexWriter.java:1562)\\n	
    at org.apache.lucene.index.IndexWriter.addDocuments(IndexWriter.java:1520)\
    at org.apache.lucene.analysis.Analyzer$TokenStreamComponents.setReader(Analyzer.java:391)\\n	 
    at org.apache.lucene.analysis.Analyzer.tokenStream(Analyzer.java:160)\\n	
    at org.apache.lucene.analysis.FilteringTokenFilter.incrementToken(FilteringTokenFilter.java:51)\\n	 
    at org.apache.lucene.index.IndexingChain$PerField.invertTokenStream(IndexingChain.java:1205)\\n	 
    at org.apache.lucene.index.IndexingChain$PerField.invert(IndexingChain.java:1183)\\n	
    at org.apache.lucene.index.IndexingChain.processField(IndexingChain.java:735)\\n	
    at org.apache.lucene.index.IndexingChain.processDocument(IndexingChain.java:609)\\n	
    at org.apache.lucene.index.DocumentsWriterPerThread.updateDocuments(DocumentsWriterPerThread.java:263)\\n	
    at org.apache.lucene.index.DocumentsWriter.updateDocuments(DocumentsWriter.java:425)\\n	 
    at org.apache.lucene.index.IndexWriter.updateDocuments(IndexWriter.java:1562)\\n	
    at org.apache.lucene.index.IndexWriter.addDocuments(IndexWriter.java:1520)\\n	
    ... [truncated]```

The exception occurs during concurrent synonym map creation,  appearing multiple times in the logs. Would appreciate guidance on proper handling of concurrent directory creation for off-heap storage.

ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
// TODO: are we using the best sharing options?
FSTCompiler<BytesRef> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).build();
FSTCompiler.Builder<BytesRef> fstCompilerBuilder =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
IndexOutput fstOutput = null;
if (directory != null) {
fstOutput = directory.fstOutput();
fstCompilerBuilder.dataOutput(fstOutput);
}
FSTCompiler<BytesRef> fstCompiler = fstCompilerBuilder.build();

BytesRefBuilder scratch = new BytesRefBuilder();
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
Expand Down Expand Up @@ -290,11 +317,28 @@ public SynonymMap build() throws IOException {
fstCompiler.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef());
}

FST<BytesRef> fst = FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader());
FST.FSTMetadata<BytesRef> fstMetaData = fstCompiler.compile();
if (directory != null) {
fstOutput.close(); // TODO -- Should fstCompiler.compile take care of this?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the idea is a caller could in theory write multiple FSTs into a single IndexOutput (remove the TODO?)?

try (SynonymMapDirectory.WordsOutput wordsOutput = directory.wordsOutput()) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A better on-disk layout might be to write a single big byte[] blob for all words, and then something like the cool "linear fit" encoding that MonotonicLongValues uses on-disk. This would be more compact and faster to load and maybe more options of what is on/off heap, etc.

But save all that for later! vInt prefix length encoding is fine for starters! Progress not perfection!

BytesRef scratchRef = new BytesRef();
for (int i = 0; i < words.size(); i++) {
words.get(i, scratchRef);
wordsOutput.addWord(scratchRef);
}
}
directory.writeMetadata(words.size(), maxHorizontalContext, fstMetaData);
return directory.readMap();
}
FST<BytesRef> fst = FST.fromFSTReader(fstMetaData, fstCompiler.getFSTReader());
return new SynonymMap(fst, words, maxHorizontalContext);
}
}

abstract static class SynonymDictionary {
public abstract void get(int id, BytesRef scratch) throws IOException;
}

/**
* Abstraction for parsing synonym files.
*
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.analysis.synonym;

import java.io.Closeable;
import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.OffHeapFSTStore;

/**
* Wraps an {@link FSDirectory} to read and write a compiled {@link SynonymMap}. When reading, the
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm any reason why it must be an FSDirectory? Can it just be any Directory? Do we really rely on filesystem backing somehow? It looks like we are just using Lucene's standard IndexInput/Output...

* FST and output words are kept off-heap.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does the user control separately whether FST and output words are off heap or not?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At this point, if you're using SynonymMapDirectory, you get off-heap FST and on-heap words.

If you don't use SynonymMapDirectory (i.e. you're using the existing constructor or the arg-less version of SynonymMap.Builder.build()), then everything is on-heap like before.

The numbers I posted in #13054 (comment) (which, granted, was just a single synthetic benchmark) seemed to suggest (to me, at least) that the "sweet spot" is off-heap FST with on-heap words. The performance hit from moving words off-heap (at least with my implementation) was pretty bad. Lots of seeking involved. Also, the vast majority of heap savings came from moving the FST.

I'm happy to bring back off-heap words as an option if we think someone would be willing to take that perf hit for slightly lower heap utilization.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm can you fix the javadoc above to explain that words are on-heap and FST is off-heap?

+1 to default to that sweet spot.

I wonder in practice what the "typical" size of FST vs words is? Like does the FST dominate the storage?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder in practice what the "typical" size of FST vs words is? Like does the FST dominate the storage?

Aha, you answered this in an earlier comment:

The on-heap FST seems to occupy about 36MB of heap. The off-heap FST with on-heap words occupies about 560kB. The off-heap FST with off-heap words occupies about 150kB.

It's wild that the FST is so much larger than the words... I'm not yet understanding why.

*
* @lucene.experimental
*/
public class SynonymMapDirectory implements Closeable {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe mark this with @lucene.experimental so we are free to change the API within non-major releases?

Or: could this be package private? Does the user need to create this wrapper themselves for some reason?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking that the user would create a SynonymMapDirectory and pass it to SynonymMap.Builder.build(SynonymMapDirectory) as the way of opting in to off-heap FSTs for their synonyms.

Given the issue you called out below regarding the need to close the IndexInput for the FST, I feel like the user needs to hold onto "something" (other than the SynonymMap) that gives them an obligation to close filesystem resources when they're done.

Alternatively, I'd be happy to make SynonymMap implement Closeable. Then I'd probably just ask the user to specify a Path instead. At that point, we could hide SynonymMapDirectory altogether.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we ever want to bring back the off-heap words, it seems like SynonymMapDirectory is the way to go, because we need to store two "things" in this directory? Or, were we stuffing both FST and words into a single file when you had words off-heap too?

private final SynonymMapFormat synonymMapFormat =
new SynonymMapFormat(); // TODO -- Should this be more flexible/codec-like? Less?
private final Directory directory;
private final List<Closeable> resources = new ArrayList<>();

public SynonymMapDirectory(Path path) throws IOException {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to store several SynonymMaps in one Directory? Or one must make a separate Directory for each? That's maybe fine ... e.g. one could make a FilterDirectory impl that can share a single underlying filesystem directory and distinguish the files by e.g. a unique filename prefix or so.

Copy link
Contributor Author

@msfroh msfroh Aug 1, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For now, since I split the synonyms across three files (synonyms.mdt, synonyms.wrd, and synonyms.fst), I assumed that there would be a single synonym map per directory.

That said, I suppose it wouldn't be hard to combine those three into a single file (with a .syn extension, say), where the SynonymMapDirectory could look at a prefix. Specifically, the current implementation reads the metadata and words once (keeping the words on-heap), then spends the rest of its time in the FST.

Then a single filesystem directory could have something like:

first_synonyms.syn
second_synonyms.syn
... etc. ...

What do you think? (I'm also happy to let each serialized SynonymMap live in its own directory.)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's leave it as is for now (three separate files)? But let's mark things @lucene.experimental to reserve the right to change APIs.

Hmm, also: will these synonym files be backwards compatible across releases? Across major releases? I would say we should not promise across major releases? Furthermore, we should enforce that not-promise, by writing the major release into the metadata somewhere and checking if that changed between writing and reading and throw a clear exception if so?

Within minor releases maybe we allow backcompat? If so, we need to add some testing to confirm syns written in 10.x are still readable/usable in 10.y?

directory = FSDirectory.open(path);
}

IndexOutput fstOutput() throws IOException {
return synonymMapFormat.getFSTOutput(directory);
}

WordsOutput wordsOutput() throws IOException {
return synonymMapFormat.getWordsOutput(directory);
}

void writeMetadata(int wordCount, int maxHorizontalContext, FST.FSTMetadata<BytesRef> fstMetadata)
throws IOException {
synonymMapFormat.writeMetadata(
directory, new SynonymMetadata(wordCount, maxHorizontalContext, fstMetadata));
}

SynonymMap readMap() throws IOException {
CloseableSynonymMap closeableSynonymMap = synonymMapFormat.readSynonymMap(directory);
resources.add(closeableSynonymMap);
return closeableSynonymMap.map;
}

boolean hasSynonyms() throws IOException {
// TODO should take the path to the synonyms file to compare file hash against file used to
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whoa, what would this TODO achieve? Is it somehow trying to check if the compiled synonyms have become stale relative to the original source synonyms (a "make" like capability)? We don't know down here whether the original source synonyms are backed by a file...

// build the directory
return directory.listAll().length > 0;
}

@Override
public void close() throws IOException {
for (Closeable c : resources) {
c.close();
}
directory.close();
}

/**
* Abstraction to support writing individual output words to the directory. Should be closed after
* the last word is written.
*/
abstract static class WordsOutput implements Closeable {
public abstract void addWord(BytesRef word) throws IOException;
}

private record CloseableSynonymMap(SynonymMap map, IndexInput indexInput) implements Closeable {
@Override
public void close() throws IOException {
indexInput.close();
}
}

private record SynonymMetadata(
int wordCount, int maxHorizontalContext, FST.FSTMetadata<BytesRef> fstMetadata) {}

private static class SynonymMapFormat {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 to keep this format private for now.

private static final String FST_FILE = "synonyms.fst";
private static final String WORDS_FILE = "synonyms.wrd";
private static final String METADATA_FILE = "synonyms.mdt";
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahh yes up to three files, so I think it makes sense for user to pass Directory (not Path).


private IndexOutput getFSTOutput(Directory directory) throws IOException {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rename to createFSTOutput? Make it clear we are always creating a new one, not getting an existing one?

return directory.createOutput(FST_FILE, IOContext.DEFAULT);
}

private WordsOutput getWordsOutput(Directory directory) throws IOException {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

createWordsOutput?

IndexOutput wordsOutput = directory.createOutput(WORDS_FILE, IOContext.DEFAULT);
return new WordsOutput() {
@Override
public void close() throws IOException {
wordsOutput.close();
}

@Override
public void addWord(BytesRef word) throws IOException {
wordsOutput.writeVInt(word.length);
wordsOutput.writeBytes(word.bytes, word.offset, word.length);
}
};
}
;

private void writeMetadata(Directory directory, SynonymMetadata synonymMetadata)
throws IOException {
try (IndexOutput metadataOutput = directory.createOutput(METADATA_FILE, IOContext.DEFAULT)) {
metadataOutput.writeVInt(synonymMetadata.wordCount);
metadataOutput.writeVInt(synonymMetadata.maxHorizontalContext);
synonymMetadata.fstMetadata.save(metadataOutput);
}
directory.sync(List.of(FST_FILE, WORDS_FILE, METADATA_FILE));
}

private SynonymMetadata readMetadata(Directory directory) throws IOException {
try (IndexInput metadataInput = directory.openInput(METADATA_FILE, IOContext.READONCE)) {
int wordCount = metadataInput.readVInt();
int maxHorizontalContext = metadataInput.readVInt();
FST.FSTMetadata<BytesRef> fstMetadata =
FST.readMetadata(metadataInput, ByteSequenceOutputs.getSingleton());
return new SynonymMetadata(wordCount, maxHorizontalContext, fstMetadata);
}
}

private CloseableSynonymMap readSynonymMap(Directory directory) throws IOException {
SynonymMetadata synonymMetadata = readMetadata(directory);
IndexInput in = directory.openInput(FST_FILE, IOContext.DEFAULT);
FST<BytesRef> fst =
FST.fromFSTReader(
synonymMetadata.fstMetadata, new OffHeapFSTStore(in, 0, synonymMetadata.fstMetadata));
OnHeapSynonymDictionary words;
try (IndexInput wordsInput = directory.openInput(WORDS_FILE, IOContext.DEFAULT)) {
words = new OnHeapSynonymDictionary(synonymMetadata.wordCount, wordsInput);
}
SynonymMap map = new SynonymMap(fst, words, synonymMetadata.maxHorizontalContext);
return new CloseableSynonymMap(map, in);
}

private static class OnHeapSynonymDictionary extends SynonymMap.SynonymDictionary {
private final int[] bytesStartArray;
private final byte[] wordBytes;

private OnHeapSynonymDictionary(int wordCount, IndexInput wordsFile) throws IOException {
bytesStartArray = new int[wordCount + 1];
int pos = 0;
for (int i = 0; i < wordCount; i++) {
bytesStartArray[i] = pos;
int size = wordsFile.readVInt();
pos += size;
wordsFile.seek(wordsFile.getFilePointer() + size);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you use .skipBytes here instead?

}
bytesStartArray[wordCount] = pos;
wordsFile.seek(0);
wordBytes = new byte[pos];
for (int i = 0; i < wordCount; i++) {
int size = wordsFile.readVInt();
wordsFile.readBytes(wordBytes, bytesStartArray[i], size);
}
}

@Override
public void get(int id, BytesRef scratch) {
scratch.bytes = wordBytes;
scratch.offset = bytesStartArray[id];
scratch.length = bytesStartArray[id + 1] - bytesStartArray[id];
}
}
}
}
Loading