|
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| 17 | + |
1 | 18 | package org.apache.lucene.analysis.synonym;
|
2 | 19 |
|
| 20 | +import java.io.Closeable; |
| 21 | +import java.io.IOException; |
| 22 | +import java.nio.file.Path; |
| 23 | +import java.util.List; |
3 | 24 | import org.apache.lucene.store.Directory;
|
4 | 25 | import org.apache.lucene.store.FSDirectory;
|
5 | 26 | import org.apache.lucene.store.IOContext;
|
6 | 27 | import org.apache.lucene.store.IndexInput;
|
7 | 28 | import org.apache.lucene.store.IndexOutput;
|
8 |
| -import org.apache.lucene.util.ArrayUtil; |
9 | 29 | import org.apache.lucene.util.BytesRef;
|
10 |
| -import org.apache.lucene.util.BytesRefBuilder; |
11 | 30 | import org.apache.lucene.util.fst.ByteSequenceOutputs;
|
12 | 31 | import org.apache.lucene.util.fst.FST;
|
13 | 32 | import org.apache.lucene.util.fst.OffHeapFSTStore;
|
14 | 33 |
|
15 |
| -import java.io.Closeable; |
16 |
| -import java.io.IOException; |
17 |
| -import java.nio.file.Path; |
18 |
| -import java.util.List; |
19 |
| - |
| 34 | +/** |
| 35 | + * Wraps an {@link FSDirectory} to read and write a compiled {@link SynonymMap}. When reading, the |
| 36 | + * FST and output words are kept off-heap. |
| 37 | + */ |
20 | 38 | public class SynonymMapDirectory implements Closeable {
|
21 |
| - private final SynonymMapFormat synonymMapFormat = new SynonymMapFormat(); // TODO -- Should this be more flexible/codec-like? Less? |
22 |
| - private final Directory directory; |
23 |
| - public SynonymMapDirectory(Path path) throws IOException { |
24 |
| - directory = FSDirectory.open(path); |
25 |
| - } |
26 |
| - |
27 |
| - public IndexOutput fstOutput() throws IOException { |
28 |
| - return synonymMapFormat.getFSTOutput(directory); |
| 39 | + private final SynonymMapFormat synonymMapFormat = |
| 40 | + new SynonymMapFormat(); // TODO -- Should this be more flexible/codec-like? Less? |
| 41 | + private final Directory directory; |
| 42 | + |
| 43 | + public SynonymMapDirectory(Path path) throws IOException { |
| 44 | + directory = FSDirectory.open(path); |
| 45 | + } |
| 46 | + |
| 47 | + public IndexOutput fstOutput() throws IOException { |
| 48 | + return synonymMapFormat.getFSTOutput(directory); |
| 49 | + } |
| 50 | + |
| 51 | + public WordsOutput wordsOutput() throws IOException { |
| 52 | + return synonymMapFormat.getWordsOutput(directory); |
| 53 | + } |
| 54 | + |
| 55 | + public void writeMetadata(int wordCount, int maxHorizontalContext, FST<BytesRef> fst) |
| 56 | + throws IOException { |
| 57 | + synonymMapFormat.writeMetadata(directory, wordCount, maxHorizontalContext, fst); |
| 58 | + } |
| 59 | + |
| 60 | + public SynonymMap readMap() throws IOException { |
| 61 | + return synonymMapFormat.readSynonymMap(directory); |
| 62 | + } |
| 63 | + |
| 64 | + public boolean hasSynonyms() throws IOException { |
| 65 | + // TODO should take the path to the synonyms file to compare file hash against file used to |
| 66 | + // build the directory |
| 67 | + return directory.listAll().length > 0; |
| 68 | + } |
| 69 | + |
| 70 | + @Override |
| 71 | + public void close() throws IOException { |
| 72 | + directory.close(); |
| 73 | + } |
| 74 | + |
| 75 | + /** |
| 76 | + * Abstraction to support writing individual output words to the directory. Should be closed after |
| 77 | + * the last word is written. |
| 78 | + */ |
| 79 | + public abstract static class WordsOutput implements Closeable { |
| 80 | + public abstract void addWord(BytesRef word) throws IOException; |
| 81 | + } |
| 82 | + |
| 83 | + private static class SynonymMapFormat { |
| 84 | + private static final String FST_FILE = "synonyms.fst"; |
| 85 | + private static final String WORDS_FILE = "synonyms.wrd"; |
| 86 | + private static final String METADATA_FILE = "synonyms.mdt"; |
| 87 | + |
| 88 | + public IndexOutput getFSTOutput(Directory directory) throws IOException { |
| 89 | + return directory.createOutput(FST_FILE, IOContext.DEFAULT); |
29 | 90 | }
|
30 | 91 |
|
31 |
| - public WordsOutput wordsOutput() throws IOException { |
32 |
| - return synonymMapFormat.getWordsOutput(directory); |
33 |
| - } |
34 |
| - |
35 |
| - public void writeMetadata(int wordCount, int maxHorizontalContext, FST<BytesRef> fst) throws IOException { |
36 |
| - synonymMapFormat.writeMetadata(directory, wordCount, maxHorizontalContext, fst); |
37 |
| - } |
| 92 | + public WordsOutput getWordsOutput(Directory directory) throws IOException { |
| 93 | + IndexOutput wordsOutput = directory.createOutput(WORDS_FILE, IOContext.DEFAULT); |
| 94 | + return new WordsOutput() { |
| 95 | + @Override |
| 96 | + public void close() throws IOException { |
| 97 | + wordsOutput.close(); |
| 98 | + } |
38 | 99 |
|
39 |
| - public SynonymMap readMap() throws IOException { |
40 |
| - return synonymMapFormat.readSynonymMap(directory); |
| 100 | + @Override |
| 101 | + public void addWord(BytesRef word) throws IOException { |
| 102 | + wordsOutput.writeVInt(word.length); |
| 103 | + wordsOutput.writeBytes(word.bytes, word.offset, word.length); |
| 104 | + } |
| 105 | + }; |
41 | 106 | }
|
42 |
| - |
43 |
| - public boolean hasSynonyms() throws IOException { |
44 |
| - // TODO should take the path to the synonyms file to compare file hash against file used to build the directory |
45 |
| - return directory.listAll().length > 0; |
| 107 | + ; |
| 108 | + |
| 109 | + public void writeMetadata( |
| 110 | + Directory directory, int wordCount, int maxHorizontalContext, FST<BytesRef> fst) |
| 111 | + throws IOException { |
| 112 | + try (IndexOutput metadataOutput = directory.createOutput(METADATA_FILE, IOContext.DEFAULT)) { |
| 113 | + metadataOutput.writeVInt(wordCount); |
| 114 | + metadataOutput.writeVInt(maxHorizontalContext); |
| 115 | + fst.saveMetadata(metadataOutput); |
| 116 | + } |
| 117 | + directory.sync(List.of(FST_FILE, WORDS_FILE, METADATA_FILE)); |
46 | 118 | }
|
47 | 119 |
|
48 |
| - @Override |
49 |
| - public void close() throws IOException { |
50 |
| - directory.close(); |
| 120 | + private SynonymMetadata readMetadata(Directory directory) throws IOException { |
| 121 | + try (IndexInput metadataInput = directory.openInput(METADATA_FILE, IOContext.READONCE)) { |
| 122 | + int wordCount = metadataInput.readVInt(); |
| 123 | + int maxHorizontalContext = metadataInput.readVInt(); |
| 124 | + FST.FSTMetadata<BytesRef> fstMetadata = |
| 125 | + FST.readMetadata(metadataInput, ByteSequenceOutputs.getSingleton()); |
| 126 | + return new SynonymMetadata(wordCount, maxHorizontalContext, fstMetadata); |
| 127 | + } |
51 | 128 | }
|
52 | 129 |
|
53 |
| - public static abstract class WordsOutput implements Closeable { |
54 |
| - public abstract void addWord(BytesRef word) throws IOException; |
| 130 | + public SynonymMap readSynonymMap(Directory directory) throws IOException { |
| 131 | + SynonymMetadata synonymMetadata = readMetadata(directory); |
| 132 | + FST<BytesRef> fst = |
| 133 | + new FST<>( |
| 134 | + synonymMetadata.fstMetadata, |
| 135 | + directory.openInput(FST_FILE, IOContext.DEFAULT), |
| 136 | + new OffHeapFSTStore()); |
| 137 | + IndexInput wordsInput = directory.openInput(WORDS_FILE, IOContext.READ); |
| 138 | + int[] bytesStartArray = new int[synonymMetadata.wordCount]; |
| 139 | + for (int i = 0; i < synonymMetadata.wordCount; i++) { |
| 140 | + bytesStartArray[i] = Math.toIntExact(wordsInput.getFilePointer()); |
| 141 | + int length = wordsInput.readVInt(); |
| 142 | + wordsInput.seek(wordsInput.getFilePointer() + length); |
| 143 | + } |
| 144 | + return new SynonymMap( |
| 145 | + fst, |
| 146 | + new OffHeapBytesRefHashLike(bytesStartArray, wordsInput), |
| 147 | + synonymMetadata.maxHorizontalContext); |
55 | 148 | }
|
56 | 149 |
|
57 |
| - private static class SynonymMapFormat { |
58 |
| - private static final String FST_FILE = "synonyms.fst"; |
59 |
| - private static final String WORDS_FILE = "synonyms.wrd"; |
60 |
| - private static final String METADATA_FILE = "synonyms.mdt"; |
61 |
| - |
62 |
| - public IndexOutput getFSTOutput(Directory directory) throws IOException { |
63 |
| - return directory.createOutput(FST_FILE, IOContext.DEFAULT); |
64 |
| - } |
65 |
| - |
66 |
| - public WordsOutput getWordsOutput(Directory directory) throws IOException { |
67 |
| - IndexOutput wordsOutput = directory.createOutput(WORDS_FILE, IOContext.DEFAULT); |
68 |
| - return new WordsOutput() { |
69 |
| - @Override |
70 |
| - public void close() throws IOException { |
71 |
| - wordsOutput.close(); |
72 |
| - } |
73 |
| - |
74 |
| - @Override |
75 |
| - public void addWord(BytesRef word) throws IOException { |
76 |
| - wordsOutput.writeVInt(word.length); |
77 |
| - wordsOutput.writeBytes(word.bytes, word.offset, word.length); |
78 |
| - } |
79 |
| - }; |
80 |
| - }; |
81 |
| - |
82 |
| - public void writeMetadata(Directory directory, int wordCount, int maxHorizontalContext, FST<BytesRef> fst) throws IOException { |
83 |
| - try (IndexOutput metadataOutput = directory.createOutput(METADATA_FILE, IOContext.DEFAULT)) { |
84 |
| - metadataOutput.writeVInt(wordCount); |
85 |
| - metadataOutput.writeVInt(maxHorizontalContext); |
86 |
| - fst.saveMetadata(metadataOutput); |
87 |
| - } |
88 |
| - directory.sync(List.of(FST_FILE, WORDS_FILE, METADATA_FILE)); |
89 |
| - } |
90 |
| - |
91 |
| - private SynonymMetadata readMetadata(Directory directory) throws IOException { |
92 |
| - try (IndexInput metadataInput = directory.openInput(METADATA_FILE, IOContext.READONCE)) { |
93 |
| - int wordCount = metadataInput.readVInt(); |
94 |
| - int maxHorizontalContext = metadataInput.readVInt(); |
95 |
| - FST.FSTMetadata<BytesRef> fstMetadata = FST.readMetadata(metadataInput, ByteSequenceOutputs.getSingleton()); |
96 |
| - return new SynonymMetadata(wordCount, maxHorizontalContext, fstMetadata); |
97 |
| - } |
| 150 | + private static class OffHeapBytesRefHashLike extends SynonymMap.BytesRefHashLike { |
| 151 | + private final int[] bytesStartArray; |
| 152 | + private final IndexInput wordsFile; |
| 153 | + |
| 154 | + public OffHeapBytesRefHashLike(int[] bytesStartArray, IndexInput wordsFile) { |
| 155 | + this.bytesStartArray = bytesStartArray; |
| 156 | + this.wordsFile = wordsFile; |
| 157 | + } |
| 158 | + |
| 159 | + @Override |
| 160 | + public void get(int id, BytesRef scratch) throws IOException { |
| 161 | + wordsFile.seek(bytesStartArray[id]); |
| 162 | + int length = wordsFile.readVInt(); |
| 163 | + if (scratch.bytes.length < length) { |
| 164 | + scratch.bytes = new byte[length]; |
98 | 165 | }
|
| 166 | + wordsFile.readBytes(scratch.bytes, 0, length); |
| 167 | + scratch.offset = 0; |
| 168 | + scratch.length = length; |
| 169 | + } |
| 170 | + } |
99 | 171 |
|
100 |
| - public SynonymMap readSynonymMap(Directory directory) throws IOException { |
101 |
| - SynonymMetadata synonymMetadata = readMetadata(directory); |
102 |
| - FST<BytesRef> fst = new FST<>(synonymMetadata.fstMetadata, directory.openInput(FST_FILE, IOContext.DEFAULT), new OffHeapFSTStore()); |
103 |
| - IndexInput wordsInput = directory.openInput(WORDS_FILE, IOContext.READ); |
104 |
| - int[] bytesStartArray = new int[synonymMetadata.wordCount]; |
105 |
| - for (int i = 0; i < synonymMetadata.wordCount; i++) { |
106 |
| - bytesStartArray[i] = Math.toIntExact(wordsInput.getFilePointer()); |
107 |
| - int length = wordsInput.readVInt(); |
108 |
| - wordsInput.seek(wordsInput.getFilePointer() + length); |
109 |
| - } |
110 |
| - return new SynonymMap(fst, new OffHeapBytesRefHashLike(bytesStartArray, wordsInput), synonymMetadata.maxHorizontalContext); |
111 |
| - } |
112 |
| - |
113 |
| - private static class OffHeapBytesRefHashLike extends SynonymMap.BytesRefHashLike { |
114 |
| - private final int[] bytesStartArray; |
115 |
| - private final IndexInput wordsFile; |
116 |
| - |
117 |
| - public OffHeapBytesRefHashLike(int[] bytesStartArray, IndexInput wordsFile) { |
118 |
| - this.bytesStartArray = bytesStartArray; |
119 |
| - this.wordsFile = wordsFile; |
120 |
| - } |
121 |
| - |
122 |
| - @Override |
123 |
| - public void get(int id, BytesRef scratch) throws IOException { |
124 |
| - wordsFile.seek(bytesStartArray[id]); |
125 |
| - int length = wordsFile.readVInt(); |
126 |
| - if (scratch.bytes.length < length) { |
127 |
| - scratch.bytes = new byte[length]; |
128 |
| - } |
129 |
| - wordsFile.readBytes(scratch.bytes, 0, length); |
130 |
| - scratch.offset = 0; |
131 |
| - scratch.length = length; |
132 |
| - } |
133 |
| - } |
134 |
| - |
135 |
| - private static class SynonymMetadata { |
136 |
| - final int wordCount; |
137 |
| - final int maxHorizontalContext; |
138 |
| - final FST.FSTMetadata<BytesRef> fstMetadata; |
139 |
| - |
140 |
| - SynonymMetadata(int wordCount, int maxHorizontalContext, FST.FSTMetadata<BytesRef> fstMetadata) { |
141 |
| - this.wordCount = wordCount; |
142 |
| - this.maxHorizontalContext = maxHorizontalContext; |
143 |
| - this.fstMetadata = fstMetadata; |
144 |
| - } |
145 |
| - } |
| 172 | + private static class SynonymMetadata { |
| 173 | + final int wordCount; |
| 174 | + final int maxHorizontalContext; |
| 175 | + final FST.FSTMetadata<BytesRef> fstMetadata; |
| 176 | + |
| 177 | + SynonymMetadata( |
| 178 | + int wordCount, int maxHorizontalContext, FST.FSTMetadata<BytesRef> fstMetadata) { |
| 179 | + this.wordCount = wordCount; |
| 180 | + this.maxHorizontalContext = maxHorizontalContext; |
| 181 | + this.fstMetadata = fstMetadata; |
| 182 | + } |
146 | 183 | }
|
| 184 | + } |
147 | 185 | }
|
0 commit comments