
Commit 246ac4b

cbuescher authored and romseygeek committed
Add ordering of files in compound files (#12241)
Today there is no specific ordering of the files written to a compound file. The current order is determined by iterating over the set of file names in SegmentInfo, which is undefined. This commit changes the order to ascending file size. Colocating data from smaller files (typically metadata files like the terms index, field infos, etc.) that are accessed often can help when parts of these files are held in the cache.
1 parent 615f456 commit 246ac4b
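
The idea, as a minimal standalone sketch (not the actual Lucene implementation, which is shown in the diff below): given each file's length, sort the file names ascending by size before writing them into the compound file. The class and method names here are made up for illustration; in the patch the lengths come from Directory.fileLength and the ordering is done with Lucene's own PriorityQueue.

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;

// Sketch only: order a segment's files by ascending length so that small,
// frequently accessed files (terms index, field infos, ...) end up adjacent
// in the compound file and are more likely to share cached pages.
class CompoundOrderSketch {
  static List<String> ascendingBySize(Map<String, Long> fileLengths) {
    List<String> names = new ArrayList<>(fileLengths.keySet());
    names.sort(Comparator.comparingLong(fileLengths::get)); // smallest first
    return names;
  }
}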

File tree

3 files changed: +101 -2 lines changed

lucene/CHANGES.txt (+2)

@@ -69,6 +69,8 @@ Optimizations
 
 * GITHUB#12198, GITHUB#12199: Reduced contention when indexing with many threads. (Adrien Grand)
 
+* GITHUB#12241: Add ordering of files in compound files. (Christoph Büscher)
+
 Bug Fixes
 ---------------------

lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java (+32 -2)

@@ -27,6 +27,7 @@
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.PriorityQueue;
 
 /**
  * Lucene 9.0 compound file format
@@ -102,11 +103,40 @@ public void write(Directory dir, SegmentInfo si, IOContext context) throws IOExc
     }
   }
 
+  private static class SizedFile {
+    private final String name;
+    private final long length;
+
+    private SizedFile(String name, long length) {
+      this.name = name;
+      this.length = length;
+    }
+  }
+
+  private static class SizedFileQueue extends PriorityQueue<SizedFile> {
+    SizedFileQueue(int maxSize) {
+      super(maxSize);
+    }
+
+    @Override
+    protected boolean lessThan(SizedFile sf1, SizedFile sf2) {
+      return sf1.length < sf2.length;
+    }
+  }
+
   private void writeCompoundFile(
       IndexOutput entries, IndexOutput data, Directory dir, SegmentInfo si) throws IOException {
     // write number of files
-    entries.writeVInt(si.files().size());
-    for (String file : si.files()) {
+    int numFiles = si.files().size();
+    entries.writeVInt(numFiles);
+    // first put files in ascending size order so small files fit more likely into one page
+    SizedFileQueue pq = new SizedFileQueue(numFiles);
+    for (String filename : si.files()) {
+      pq.add(new SizedFile(filename, dir.fileLength(filename)));
+    }
+    while (pq.size() > 0) {
+      SizedFile sizedFile = pq.pop();
+      String file = sizedFile.name;
       // align file start offset
       long startOffset = data.alignFilePointer(Long.BYTES);
       // write bytes for file
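
A rough usage sketch of the queue defined above: because lessThan compares lengths, Lucene's PriorityQueue acts as a min-heap, so pop() returns the smallest remaining file and the entries and data are written smallest file first. The classes are private to Lucene90CompoundFormat, so this snippet only illustrates their behavior; the file names and sizes are made up.

// Sketch only; sizes are hypothetical. pop() yields files in ascending size order.
SizedFileQueue pq = new SizedFileQueue(3);
pq.add(new SizedFile("_1.fnm", 120));     // small metadata file
pq.add(new SizedFile("_1.tip", 900));     // terms index
pq.add(new SizedFile("_1.doc", 50_000));  // postings, typically much larger
while (pq.size() > 0) {
  SizedFile next = pq.pop();
  // write order: _1.fnm (120), then _1.tip (900), then _1.doc (50000)
}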

lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90CompoundFormat.java (+67)

@@ -16,7 +16,17 @@
  */
 package org.apache.lucene.codecs.lucene90;
 
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
 import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
 import org.apache.lucene.tests.index.BaseCompoundFormatTestCase;
 import org.apache.lucene.tests.util.TestUtil;
 
@@ -27,4 +37,61 @@ public class TestLucene90CompoundFormat extends BaseCompoundFormatTestCase {
   protected Codec getCodec() {
     return codec;
   }
+
+  public void testFileLengthOrdering() throws IOException {
+    Directory dir = newDirectory();
+    // Setup the test segment
+    String segment = "_123";
+    int chunk = 1024; // internal buffer size used by the stream
+    SegmentInfo si = newSegmentInfo(dir, segment);
+    byte[] segId = si.getId();
+    List<String> orderedFiles = new ArrayList<>();
+    int randomFileSize = random().nextInt(chunk);
+    for (int i = 0; i < 10; i++) {
+      String filename = segment + "." + i;
+      createRandomFile(dir, filename, randomFileSize, segId);
+      // increase the next files size by a random amount
+      randomFileSize += random().nextInt(100) + 1;
+      orderedFiles.add(filename);
+    }
+    List<String> shuffledFiles = new ArrayList<>(orderedFiles);
+    Collections.shuffle(shuffledFiles, random());
+    si.setFiles(shuffledFiles);
+    si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT);
+
+    // entries file should contain files ordered by their size
+    String entriesFileName =
+        IndexFileNames.segmentFileName(si.name, "", Lucene90CompoundFormat.ENTRIES_EXTENSION);
+    try (ChecksumIndexInput entriesStream =
+        dir.openChecksumInput(entriesFileName, IOContext.READ)) {
+      Throwable priorE = null;
+      try {
+        CodecUtil.checkIndexHeader(
+            entriesStream,
+            Lucene90CompoundFormat.ENTRY_CODEC,
+            Lucene90CompoundFormat.VERSION_START,
+            Lucene90CompoundFormat.VERSION_CURRENT,
+            si.getId(),
+            "");
+        final int numEntries = entriesStream.readVInt();
+        long lastOffset = 0;
+        long lastLength = 0;
+        for (int i = 0; i < numEntries; i++) {
+          final String id = entriesStream.readString();
+          assertEquals(orderedFiles.get(i), segment + id);
+          long offset = entriesStream.readLong();
+          assertTrue(offset > lastOffset);
+          lastOffset = offset;
+          long length = entriesStream.readLong();
+          assertTrue(length >= lastLength);
+          lastLength = length;
+        }
+      } catch (Throwable exception) {
+        priorE = exception;
+      } finally {
+        CodecUtil.checkFooter(entriesStream, priorE);
+      }
+    }
+    dir.close();
+  }
 }
