Skip to content

Commit 6ac4e0b

Browse files
authored
SimpleText codec to support writing byte vectors (#12111)
A recent test failure showed that byte vectors could not be written when the SimpleText codec was randomly selected. This commit addresses that by adding support for writing byte vectors to SimpleTextKnnVectorsWriter. Note that although support is added to the BufferingKnnVectorsWriter base class, the Lucene90, Lucene91 and Lucene92 writers do not need to support byte vectors and will throw an UnsupportedOperationException when asked to write them.
1 parent e1a7dfe commit 6ac4e0b

File tree

8 files changed

+270
-146
lines changed

8 files changed

+270
-146
lines changed

lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java

+9-5
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@
2525
import java.util.Arrays;
2626
import org.apache.lucene.codecs.BufferingKnnVectorsWriter;
2727
import org.apache.lucene.codecs.CodecUtil;
28-
import org.apache.lucene.codecs.KnnVectorsReader;
28+
import org.apache.lucene.index.ByteVectorValues;
2929
import org.apache.lucene.index.FieldInfo;
3030
import org.apache.lucene.index.FloatVectorValues;
3131
import org.apache.lucene.index.IndexFileNames;
@@ -107,10 +107,9 @@ public final class Lucene90HnswVectorsWriter extends BufferingKnnVectorsWriter {
107107
}
108108

109109
@Override
110-
public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader, int maxDoc)
110+
public void writeField(FieldInfo fieldInfo, FloatVectorValues floatVectorValues, int maxDoc)
111111
throws IOException {
112112
long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
113-
FloatVectorValues vectors = knnVectorsReader.getFloatVectorValues(fieldInfo.name);
114113

115114
IndexOutput tempVectorData =
116115
segmentWriteState.directory.createTempOutput(
@@ -120,7 +119,7 @@ public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader, i
120119
try {
121120
// write the vector data to a temporary file
122121
// TODO - use a better data structure; a bitset? DocsWithFieldSet is p.p. in o.a.l.index
123-
int[] docIds = writeVectorData(tempVectorData, vectors);
122+
int[] docIds = writeVectorData(tempVectorData, floatVectorValues);
124123
CodecUtil.writeFooter(tempVectorData);
125124
IOUtils.close(tempVectorData);
126125

@@ -134,7 +133,7 @@ public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader, i
134133
// build the graph using the temporary vector data
135134
Lucene90HnswVectorsReader.OffHeapFloatVectorValues offHeapVectors =
136135
new Lucene90HnswVectorsReader.OffHeapFloatVectorValues(
137-
vectors.dimension(), docIds, vectorDataInput);
136+
floatVectorValues.dimension(), docIds, vectorDataInput);
138137

139138
long[] offsets = new long[docIds.length];
140139
long vectorIndexOffset = vectorIndex.getFilePointer();
@@ -170,6 +169,11 @@ public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader, i
170169
}
171170
}
172171

172+
@Override
173+
protected void writeField(FieldInfo fieldInfo, ByteVectorValues byteVectorValues, int maxDoc) {
174+
throw new UnsupportedOperationException("byte vectors not supported in this version");
175+
}
176+
173177
/**
174178
* Writes the vector values to the output and returns a mapping from dense ordinals to document
175179
* IDs. The length of the returned array matches the total number of documents with a vector

lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsWriter.java

+9-5
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@
2525
import java.util.Arrays;
2626
import org.apache.lucene.codecs.BufferingKnnVectorsWriter;
2727
import org.apache.lucene.codecs.CodecUtil;
28-
import org.apache.lucene.codecs.KnnVectorsReader;
28+
import org.apache.lucene.index.ByteVectorValues;
2929
import org.apache.lucene.index.DocsWithFieldSet;
3030
import org.apache.lucene.index.FieldInfo;
3131
import org.apache.lucene.index.FloatVectorValues;
@@ -109,10 +109,9 @@ public final class Lucene91HnswVectorsWriter extends BufferingKnnVectorsWriter {
109109
}
110110

111111
@Override
112-
public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader, int maxDoc)
112+
public void writeField(FieldInfo fieldInfo, FloatVectorValues floatVectorValues, int maxDoc)
113113
throws IOException {
114114
long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
115-
FloatVectorValues vectors = knnVectorsReader.getFloatVectorValues(fieldInfo.name);
116115

117116
IndexOutput tempVectorData =
118117
segmentWriteState.directory.createTempOutput(
@@ -121,7 +120,7 @@ public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader, i
121120
boolean success = false;
122121
try {
123122
// write the vector data to a temporary file
124-
DocsWithFieldSet docsWithField = writeVectorData(tempVectorData, vectors);
123+
DocsWithFieldSet docsWithField = writeVectorData(tempVectorData, floatVectorValues);
125124
CodecUtil.writeFooter(tempVectorData);
126125
IOUtils.close(tempVectorData);
127126

@@ -139,7 +138,7 @@ public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader, i
139138
// TODO: separate random access vector values from DocIdSetIterator?
140139
Lucene91HnswVectorsReader.OffHeapFloatVectorValues offHeapVectors =
141140
new Lucene91HnswVectorsReader.OffHeapFloatVectorValues(
142-
vectors.dimension(), docsWithField.cardinality(), null, vectorDataInput);
141+
floatVectorValues.dimension(), docsWithField.cardinality(), null, vectorDataInput);
143142
Lucene91OnHeapHnswGraph graph =
144143
offHeapVectors.size() == 0
145144
? null
@@ -167,6 +166,11 @@ public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader, i
167166
}
168167
}
169168

169+
@Override
170+
protected void writeField(FieldInfo fieldInfo, ByteVectorValues byteVectorValues, int maxDoc) {
171+
throw new UnsupportedOperationException("byte vectors not supported in this version");
172+
}
173+
170174
/**
171175
* Writes the vector values to the output and returns a set of documents that contains vectors.
172176
*/

lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java

+12-11
Original file line number | Diff line number | Diff line change
@@ -26,8 +26,8 @@
2626
import java.util.Arrays;
2727
import org.apache.lucene.codecs.BufferingKnnVectorsWriter;
2828
import org.apache.lucene.codecs.CodecUtil;
29-
import org.apache.lucene.codecs.KnnVectorsReader;
3029
import org.apache.lucene.codecs.lucene90.IndexedDISI;
30+
import org.apache.lucene.index.ByteVectorValues;
3131
import org.apache.lucene.index.DocsWithFieldSet;
3232
import org.apache.lucene.index.FieldInfo;
3333
import org.apache.lucene.index.FloatVectorValues;
@@ -115,10 +115,9 @@ public final class Lucene92HnswVectorsWriter extends BufferingKnnVectorsWriter {
115115
}
116116

117117
@Override
118-
public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader, int maxDoc)
118+
public void writeField(FieldInfo fieldInfo, FloatVectorValues floatVectorValues, int maxDoc)
119119
throws IOException {
120120
long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
121-
FloatVectorValues vectors = knnVectorsReader.getFloatVectorValues(fieldInfo.name);
122121

123122
IndexOutput tempVectorData =
124123
segmentWriteState.directory.createTempOutput(
@@ -127,7 +126,7 @@ public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader, i
127126
boolean success = false;
128127
try {
129128
// write the vector data to a temporary file
130-
DocsWithFieldSet docsWithField = writeVectorData(tempVectorData, vectors);
129+
DocsWithFieldSet docsWithField = writeVectorData(tempVectorData, floatVectorValues);
131130
CodecUtil.writeFooter(tempVectorData);
132131
IOUtils.close(tempVectorData);
133132

@@ -146,12 +145,11 @@ public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader, i
146145
// TODO: separate random access vector values from DocIdSetIterator?
147146
OffHeapFloatVectorValues offHeapVectors =
148147
new OffHeapFloatVectorValues.DenseOffHeapVectorValues(
149-
vectors.dimension(), docsWithField.cardinality(), vectorDataInput);
148+
floatVectorValues.dimension(), docsWithField.cardinality(), vectorDataInput);
150149
OnHeapHnswGraph graph =
151150
offHeapVectors.size() == 0
152151
? null
153-
: writeGraph(
154-
offHeapVectors, VectorEncoding.FLOAT32, fieldInfo.getVectorSimilarityFunction());
152+
: writeGraph(offHeapVectors, fieldInfo.getVectorSimilarityFunction());
155153
long vectorIndexLength = vectorIndex.getFilePointer() - vectorIndexOffset;
156154
writeMeta(
157155
fieldInfo,
@@ -175,6 +173,11 @@ public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader, i
175173
}
176174
}
177175

176+
@Override
177+
protected void writeField(FieldInfo fieldInfo, ByteVectorValues byteVectorValues, int maxDoc) {
178+
throw new UnsupportedOperationException("byte vectors not supported in this version");
179+
}
180+
178181
/**
179182
* Writes the vector values to the output and returns a set of documents that contains vectors.
180183
*/
@@ -271,16 +274,14 @@ private void writeMeta(
271274
}
272275

273276
private OnHeapHnswGraph writeGraph(
274-
RandomAccessVectorValues<float[]> vectorValues,
275-
VectorEncoding vectorEncoding,
276-
VectorSimilarityFunction similarityFunction)
277+
RandomAccessVectorValues<float[]> vectorValues, VectorSimilarityFunction similarityFunction)
277278
throws IOException {
278279

279280
// build graph
280281
HnswGraphBuilder<float[]> hnswGraphBuilder =
281282
HnswGraphBuilder.create(
282283
vectorValues,
283-
vectorEncoding,
284+
VectorEncoding.FLOAT32,
284285
similarityFunction,
285286
M,
286287
beamWidth,

lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java

+1-2
Original file line number | Diff line number | Diff line change
@@ -405,7 +405,7 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE
405405
case FLOAT32:
406406
docsWithField =
407407
writeVectorData(
408-
tempVectorData, MergedVectorValues.mergeVectorValues(fieldInfo, mergeState));
408+
tempVectorData, MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
409409
break;
410410
default:
411411
throw new IllegalArgumentException(
@@ -680,7 +680,6 @@ public float[] copyValue(float[] value) {
680680
}
681681
}
682682

683-
@SuppressWarnings("unchecked")
684683
FieldWriter(FieldInfo fieldInfo, int M, int beamWidth, InfoStream infoStream)
685684
throws IOException {
686685
this.fieldInfo = fieldInfo;

lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsWriter.java

+30-7
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@
2424
import java.util.Arrays;
2525
import java.util.List;
2626
import org.apache.lucene.codecs.BufferingKnnVectorsWriter;
27-
import org.apache.lucene.codecs.KnnVectorsReader;
27+
import org.apache.lucene.index.ByteVectorValues;
2828
import org.apache.lucene.index.FieldInfo;
2929
import org.apache.lucene.index.FloatVectorValues;
3030
import org.apache.lucene.index.IndexFileNames;
@@ -73,28 +73,51 @@ public class SimpleTextKnnVectorsWriter extends BufferingKnnVectorsWriter {
7373
}
7474

7575
@Override
76-
public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader, int maxDoc)
76+
public void writeField(FieldInfo fieldInfo, FloatVectorValues floatVectorValues, int maxDoc)
7777
throws IOException {
78-
FloatVectorValues vectors = knnVectorsReader.getFloatVectorValues(fieldInfo.name);
7978
long vectorDataOffset = vectorData.getFilePointer();
8079
List<Integer> docIds = new ArrayList<>();
81-
int docV;
82-
for (docV = vectors.nextDoc(); docV != NO_MORE_DOCS; docV = vectors.nextDoc()) {
83-
writeVectorValue(vectors);
80+
for (int docV = floatVectorValues.nextDoc();
81+
docV != NO_MORE_DOCS;
82+
docV = floatVectorValues.nextDoc()) {
83+
writeFloatVectorValue(floatVectorValues);
8484
docIds.add(docV);
8585
}
8686
long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
8787
writeMeta(fieldInfo, vectorDataOffset, vectorDataLength, docIds);
8888
}
8989

90-
private void writeVectorValue(FloatVectorValues vectors) throws IOException {
90+
private void writeFloatVectorValue(FloatVectorValues vectors) throws IOException {
9191
// write vector value
9292
float[] value = vectors.vectorValue();
9393
assert value.length == vectors.dimension();
9494
write(vectorData, Arrays.toString(value));
9595
newline(vectorData);
9696
}
9797

98+
@Override
99+
public void writeField(FieldInfo fieldInfo, ByteVectorValues byteVectorValues, int maxDoc)
100+
throws IOException {
101+
long vectorDataOffset = vectorData.getFilePointer();
102+
List<Integer> docIds = new ArrayList<>();
103+
for (int docV = byteVectorValues.nextDoc();
104+
docV != NO_MORE_DOCS;
105+
docV = byteVectorValues.nextDoc()) {
106+
writeByteVectorValue(byteVectorValues);
107+
docIds.add(docV);
108+
}
109+
long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
110+
writeMeta(fieldInfo, vectorDataOffset, vectorDataLength, docIds);
111+
}
112+
113+
private void writeByteVectorValue(ByteVectorValues vectors) throws IOException {
114+
// write vector value
115+
byte[] value = vectors.vectorValue();
116+
assert value.length == vectors.dimension();
117+
write(vectorData, Arrays.toString(value));
118+
newline(vectorData);
119+
}
120+
98121
private void writeMeta(
99122
FieldInfo field, long vectorDataOffset, long vectorDataLength, List<Integer> docIds)
100123
throws IOException {

0 commit comments

Comments
 (0)