@@ -27,6 +27,7 @@
 import org.apache.kafka.server.purgatory.DelayedOperation;
 import org.apache.kafka.server.share.SharePartitionKey;
 import org.apache.kafka.server.share.fetch.DelayedShareFetchGroupKey;
+import org.apache.kafka.server.share.fetch.PartitionMaxBytesStrategy;
 import org.apache.kafka.server.share.fetch.ShareFetch;
 import org.apache.kafka.server.storage.log.FetchIsolation;
 import org.apache.kafka.server.storage.log.FetchPartitionData;
@@ -60,24 +61,35 @@ public class DelayedShareFetch extends DelayedOperation {
     private final ShareFetch shareFetch;
     private final ReplicaManager replicaManager;
     private final BiConsumer<SharePartitionKey, Throwable> exceptionHandler;
+    private final PartitionMaxBytesStrategy partitionMaxBytesStrategy;
     // The topic partitions that need to be completed for the share fetch request are given by sharePartitions.
     // sharePartitions is a subset of shareFetchData. The order of insertion/deletion of entries in sharePartitions is important.
     private final LinkedHashMap<TopicIdPartition, SharePartition> sharePartitions;
-    private LinkedHashMap<TopicIdPartition, FetchRequest.PartitionData> partitionsAcquired;
+    private LinkedHashMap<TopicIdPartition, Long> partitionsAcquired;
     private LinkedHashMap<TopicIdPartition, LogReadResult> partitionsAlreadyFetched;

     DelayedShareFetch(
             ShareFetch shareFetch,
             ReplicaManager replicaManager,
             BiConsumer<SharePartitionKey, Throwable> exceptionHandler,
             LinkedHashMap<TopicIdPartition, SharePartition> sharePartitions) {
+        this(shareFetch, replicaManager, exceptionHandler, sharePartitions, PartitionMaxBytesStrategy.type(PartitionMaxBytesStrategy.StrategyType.UNIFORM));
+    }
+
+    DelayedShareFetch(
+            ShareFetch shareFetch,
+            ReplicaManager replicaManager,
+            BiConsumer<SharePartitionKey, Throwable> exceptionHandler,
+            LinkedHashMap<TopicIdPartition, SharePartition> sharePartitions,
+            PartitionMaxBytesStrategy partitionMaxBytesStrategy) {
         super(shareFetch.fetchParams().maxWaitMs, Optional.empty());
         this.shareFetch = shareFetch;
         this.replicaManager = replicaManager;
         this.partitionsAcquired = new LinkedHashMap<>();
         this.partitionsAlreadyFetched = new LinkedHashMap<>();
         this.exceptionHandler = exceptionHandler;
         this.sharePartitions = sharePartitions;
+        this.partitionMaxBytesStrategy = partitionMaxBytesStrategy;
     }

     @Override
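The four-argument constructor now delegates with `PartitionMaxBytesStrategy.StrategyType.UNIFORM` as the default. As a rough mental model (an illustrative sketch, not the actual `PartitionMaxBytesStrategy` implementation), a uniform strategy hands every partition an equal slice of the request-level budget:

```java
import java.util.LinkedHashMap;
import java.util.Set;

import org.apache.kafka.common.TopicIdPartition;

final class UniformMaxBytesSketch {
    // Mirrors the call shape used throughout this diff:
    // maxBytes(requestMaxBytes, partitions, acquiredPartitionsSize).
    static LinkedHashMap<TopicIdPartition, Integer> maxBytes(int requestMaxBytes,
                                                             Set<TopicIdPartition> partitions,
                                                             int acquiredPartitionsSize) {
        LinkedHashMap<TopicIdPartition, Integer> perPartitionBytes = new LinkedHashMap<>();
        // Divide by the acquired-partition count rather than partitions.size(), so a
        // call that reads only a subset still leaves budget for partitions read later.
        int slice = requestMaxBytes / acquiredPartitionsSize;
        partitions.forEach(partition -> perPartitionBytes.put(partition, slice));
        return perPartitionBytes;
    }
}
```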
@@ -99,7 +111,7 @@ public void onComplete() {
             partitionsAcquired.keySet());

         try {
-            LinkedHashMap<TopicIdPartition, FetchRequest.PartitionData> topicPartitionData;
+            LinkedHashMap<TopicIdPartition, Long> topicPartitionData;
             // tryComplete did not invoke forceComplete, so we need to check if we have any partitions to fetch.
             if (partitionsAcquired.isEmpty())
                 topicPartitionData = acquirablePartitions();
@@ -121,11 +133,13 @@ public void onComplete() {
         }
     }

-    private void completeShareFetchRequest(LinkedHashMap<TopicIdPartition, FetchRequest.PartitionData> topicPartitionData) {
+    private void completeShareFetchRequest(LinkedHashMap<TopicIdPartition, Long> topicPartitionData) {
         try {
             LinkedHashMap<TopicIdPartition, LogReadResult> responseData;
             if (partitionsAlreadyFetched.isEmpty())
-                responseData = readFromLog(topicPartitionData);
+                responseData = readFromLog(
+                    topicPartitionData,
+                    partitionMaxBytesStrategy.maxBytes(shareFetch.fetchParams().maxBytes, topicPartitionData.keySet(), topicPartitionData.size()));
             else
                 // There shouldn't be a case when we have a partitionsAlreadyFetched value here and this variable is getting
                 // updated in a different tryComplete thread.
@@ -158,7 +172,7 @@ private void completeShareFetchRequest(LinkedHashMap<TopicIdPartition, FetchRequ
      */
     @Override
     public boolean tryComplete() {
-        LinkedHashMap<TopicIdPartition, FetchRequest.PartitionData> topicPartitionData = acquirablePartitions();
+        LinkedHashMap<TopicIdPartition, Long> topicPartitionData = acquirablePartitions();

         try {
             if (!topicPartitionData.isEmpty()) {
@@ -167,7 +181,7 @@ public boolean tryComplete() {
                 // those topic partitions.
                 LinkedHashMap<TopicIdPartition, LogReadResult> replicaManagerReadResponse = maybeReadFromLog(topicPartitionData);
                 maybeUpdateFetchOffsetMetadata(topicPartitionData, replicaManagerReadResponse);
-                if (anyPartitionHasLogReadError(replicaManagerReadResponse) || isMinBytesSatisfied(topicPartitionData)) {
+                if (anyPartitionHasLogReadError(replicaManagerReadResponse) || isMinBytesSatisfied(topicPartitionData, partitionMaxBytesStrategy.maxBytes(shareFetch.fetchParams().maxBytes, topicPartitionData.keySet(), topicPartitionData.size()))) {
                     partitionsAcquired = topicPartitionData;
                     partitionsAlreadyFetched = replicaManagerReadResponse;
                     boolean completedByMe = forceComplete();
@@ -202,28 +216,18 @@ public boolean tryComplete() {
      * Prepare fetch request structure for partitions in the share fetch request for which we can acquire records.
      */
     // Visible for testing
-    LinkedHashMap<TopicIdPartition, FetchRequest.PartitionData> acquirablePartitions() {
+    LinkedHashMap<TopicIdPartition, Long> acquirablePartitions() {
         // Initialize the topic partitions for which the fetch should be attempted.
-        LinkedHashMap<TopicIdPartition, FetchRequest.PartitionData> topicPartitionData = new LinkedHashMap<>();
+        LinkedHashMap<TopicIdPartition, Long> topicPartitionData = new LinkedHashMap<>();

         sharePartitions.forEach((topicIdPartition, sharePartition) -> {
-            int partitionMaxBytes = shareFetch.partitionMaxBytes().getOrDefault(topicIdPartition, 0);
             // Add the share partition to the list of partitions to be fetched only if we can
             // acquire the fetch lock on it.
             if (sharePartition.maybeAcquireFetchLock()) {
                 try {
                     // If the share partition is already at capacity, we should not attempt to fetch.
                     if (sharePartition.canAcquireRecords()) {
-                        topicPartitionData.put(
-                            topicIdPartition,
-                            new FetchRequest.PartitionData(
-                                topicIdPartition.topicId(),
-                                sharePartition.nextFetchOffset(),
-                                0,
-                                partitionMaxBytes,
-                                Optional.empty()
-                            )
-                        );
+                        topicPartitionData.put(topicIdPartition, sharePartition.nextFetchOffset());
                     } else {
                         sharePartition.releaseFetchLock();
                         log.trace("Record lock partition limit exceeded for SharePartition {}, " +
@@ -239,24 +243,28 @@ LinkedHashMap<TopicIdPartition, FetchRequest.PartitionData> acquirablePartitions
         return topicPartitionData;
     }

-    private LinkedHashMap<TopicIdPartition, LogReadResult> maybeReadFromLog(LinkedHashMap<TopicIdPartition, FetchRequest.PartitionData> topicPartitionData) {
-        LinkedHashMap<TopicIdPartition, FetchRequest.PartitionData> partitionsNotMatchingFetchOffsetMetadata = new LinkedHashMap<>();
-        topicPartitionData.forEach((topicIdPartition, partitionData) -> {
+    private LinkedHashMap<TopicIdPartition, LogReadResult> maybeReadFromLog(LinkedHashMap<TopicIdPartition, Long> topicPartitionData) {
+        LinkedHashMap<TopicIdPartition, Long> partitionsNotMatchingFetchOffsetMetadata = new LinkedHashMap<>();
+        topicPartitionData.forEach((topicIdPartition, fetchOffset) -> {
             SharePartition sharePartition = sharePartitions.get(topicIdPartition);
-            if (sharePartition.fetchOffsetMetadata(partitionData.fetchOffset).isEmpty()) {
-                partitionsNotMatchingFetchOffsetMetadata.put(topicIdPartition, partitionData);
+            if (sharePartition.fetchOffsetMetadata(fetchOffset).isEmpty()) {
+                partitionsNotMatchingFetchOffsetMetadata.put(topicIdPartition, fetchOffset);
             }
         });
         if (partitionsNotMatchingFetchOffsetMetadata.isEmpty()) {
             return new LinkedHashMap<>();
         }
         // We fetch data from replica manager corresponding to the topic partitions that have missing fetch offset metadata.
-        return readFromLog(partitionsNotMatchingFetchOffsetMetadata);
+        // Although we only compute partition max bytes for partitionsNotMatchingFetchOffsetMetadata,
+        // we pass acquired partitions size = topicPartitionData.size() so that the leftover
+        // partitions, which will be fetched later, are not starved of their share of the budget.
+        return readFromLog(
+            partitionsNotMatchingFetchOffsetMetadata,
+            partitionMaxBytesStrategy.maxBytes(shareFetch.fetchParams().maxBytes, partitionsNotMatchingFetchOffsetMetadata.keySet(), topicPartitionData.size()));
     }

-    private void maybeUpdateFetchOffsetMetadata(
-        LinkedHashMap<TopicIdPartition, FetchRequest.PartitionData> topicPartitionData,
-        LinkedHashMap<TopicIdPartition, LogReadResult> replicaManagerReadResponseData) {
+    private void maybeUpdateFetchOffsetMetadata(LinkedHashMap<TopicIdPartition, Long> topicPartitionData,
+                                                LinkedHashMap<TopicIdPartition, LogReadResult> replicaManagerReadResponseData) {
         for (Map.Entry<TopicIdPartition, LogReadResult> entry : replicaManagerReadResponseData.entrySet()) {
             TopicIdPartition topicIdPartition = entry.getKey();
             SharePartition sharePartition = sharePartitions.get(topicIdPartition);
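The `topicPartitionData.size()` argument in the hunk above is the starvation guard the new comment describes. A sketch with invented numbers, assuming the UNIFORM strategy divides the request budget evenly across `acquiredPartitionsSize`:

```java
import java.util.LinkedHashMap;
import java.util.Set;

import org.apache.kafka.common.TopicIdPartition;
import org.apache.kafka.server.share.fetch.PartitionMaxBytesStrategy;

final class StarvationGuardExample {
    // Four partitions were acquired, but only two are missing fetch offset metadata
    // and are read now. Sizing against acquiredPartitionsSize = 4 gives each of the
    // two partitions roughly 1 MiB / 4 = 262144 bytes instead of 1 MiB / 2 = 524288,
    // so the two partitions read later still have budget left.
    static LinkedHashMap<TopicIdPartition, Integer> sizeSubset(Set<TopicIdPartition> partitionsMissingMetadata) {
        PartitionMaxBytesStrategy strategy =
            PartitionMaxBytesStrategy.type(PartitionMaxBytesStrategy.StrategyType.UNIFORM);
        return strategy.maxBytes(1024 * 1024, partitionsMissingMetadata, 4);
    }
}
```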
@@ -267,17 +275,18 @@ private void maybeUpdateFetchOffsetMetadata(
                 continue;
             }
             sharePartition.updateFetchOffsetMetadata(
-                topicPartitionData.get(topicIdPartition).fetchOffset,
+                topicPartitionData.get(topicIdPartition),
                 replicaManagerLogReadResult.info().fetchOffsetMetadata);
         }
     }

     // minBytes estimation currently assumes the common case where all fetched data is acquirable.
-    private boolean isMinBytesSatisfied(LinkedHashMap<TopicIdPartition, FetchRequest.PartitionData> topicPartitionData) {
+    private boolean isMinBytesSatisfied(LinkedHashMap<TopicIdPartition, Long> topicPartitionData,
+                                        LinkedHashMap<TopicIdPartition, Integer> partitionMaxBytes) {
         long accumulatedSize = 0;
-        for (Map.Entry<TopicIdPartition, FetchRequest.PartitionData> entry : topicPartitionData.entrySet()) {
+        for (Map.Entry<TopicIdPartition, Long> entry : topicPartitionData.entrySet()) {
             TopicIdPartition topicIdPartition = entry.getKey();
-            FetchRequest.PartitionData partitionData = entry.getValue();
+            long fetchOffset = entry.getValue();

             LogOffsetMetadata endOffsetMetadata;
             try {
@@ -294,7 +303,7 @@ private boolean isMinBytesSatisfied(LinkedHashMap<TopicIdPartition, FetchRequest

             SharePartition sharePartition = sharePartitions.get(topicIdPartition);

-            Optional<LogOffsetMetadata> optionalFetchOffsetMetadata = sharePartition.fetchOffsetMetadata(partitionData.fetchOffset);
+            Optional<LogOffsetMetadata> optionalFetchOffsetMetadata = sharePartition.fetchOffsetMetadata(fetchOffset);
             if (optionalFetchOffsetMetadata.isEmpty() || optionalFetchOffsetMetadata.get() == LogOffsetMetadata.UNKNOWN_OFFSET_METADATA)
                 continue;
             LogOffsetMetadata fetchOffsetMetadata = optionalFetchOffsetMetadata.get();
@@ -312,7 +321,7 @@ private boolean isMinBytesSatisfied(LinkedHashMap<TopicIdPartition, FetchRequest
                 return true;
             } else if (fetchOffsetMetadata.onSameSegment(endOffsetMetadata)) {
                 // we take the partition fetch size as upper bound when accumulating the bytes.
-                long bytesAvailable = Math.min(endOffsetMetadata.positionDiff(fetchOffsetMetadata), partitionData.maxBytes);
+                long bytesAvailable = Math.min(endOffsetMetadata.positionDiff(fetchOffsetMetadata), partitionMaxBytes.get(topicIdPartition));
                 accumulatedSize += bytesAvailable;
             }
         }
@@ -335,13 +344,25 @@ else if (isolationType == FetchIsolation.HIGH_WATERMARK)

     }

-    private LinkedHashMap<TopicIdPartition, LogReadResult> readFromLog(LinkedHashMap<TopicIdPartition, FetchRequest.PartitionData> topicPartitionData) {
+    private LinkedHashMap<TopicIdPartition, LogReadResult> readFromLog(LinkedHashMap<TopicIdPartition, Long> topicPartitionFetchOffsets,
+                                                                       LinkedHashMap<TopicIdPartition, Integer> partitionMaxBytes) {
         // Filter if there already exists any erroneous topic partition.
-        Set<TopicIdPartition> partitionsToFetch = shareFetch.filterErroneousTopicPartitions(topicPartitionData.keySet());
+        Set<TopicIdPartition> partitionsToFetch = shareFetch.filterErroneousTopicPartitions(topicPartitionFetchOffsets.keySet());
         if (partitionsToFetch.isEmpty()) {
             return new LinkedHashMap<>();
         }

+        LinkedHashMap<TopicIdPartition, FetchRequest.PartitionData> topicPartitionData = new LinkedHashMap<>();
+
+        topicPartitionFetchOffsets.forEach((topicIdPartition, fetchOffset) -> topicPartitionData.put(topicIdPartition,
+            new FetchRequest.PartitionData(
+                topicIdPartition.topicId(),
+                fetchOffset,
+                0,
+                partitionMaxBytes.get(topicIdPartition),
+                Optional.empty())
+        ));
+
         Seq<Tuple2<TopicIdPartition, LogReadResult>> responseLogResult = replicaManager.readFromLog(
             shareFetch.fetchParams(),
             CollectionConverters.asScala(
@@ -390,18 +411,21 @@ private void handleFetchException(
     }

     // Visible for testing.
-    LinkedHashMap<TopicIdPartition, LogReadResult> combineLogReadResponse(LinkedHashMap<TopicIdPartition, FetchRequest.PartitionData> topicPartitionData,
-                                                                          LinkedHashMap<TopicIdPartition, LogReadResult> existingFetchedData) {
-        LinkedHashMap<TopicIdPartition, FetchRequest.PartitionData> missingLogReadTopicPartitions = new LinkedHashMap<>();
-        topicPartitionData.forEach((topicIdPartition, partitionData) -> {
+    LinkedHashMap<TopicIdPartition, LogReadResult> combineLogReadResponse(LinkedHashMap<TopicIdPartition, Long> topicPartitionData,
+                                                                          LinkedHashMap<TopicIdPartition, LogReadResult> existingFetchedData) {
+        LinkedHashMap<TopicIdPartition, Long> missingLogReadTopicPartitions = new LinkedHashMap<>();
+        topicPartitionData.forEach((topicIdPartition, fetchOffset) -> {
             if (!existingFetchedData.containsKey(topicIdPartition)) {
-                missingLogReadTopicPartitions.put(topicIdPartition, partitionData);
+                missingLogReadTopicPartitions.put(topicIdPartition, fetchOffset);
             }
         });
         if (missingLogReadTopicPartitions.isEmpty()) {
             return existingFetchedData;
         }
-        LinkedHashMap<TopicIdPartition, LogReadResult> missingTopicPartitionsLogReadResponse = readFromLog(missingLogReadTopicPartitions);
+
+        LinkedHashMap<TopicIdPartition, LogReadResult> missingTopicPartitionsLogReadResponse = readFromLog(
+            missingLogReadTopicPartitions,
+            partitionMaxBytesStrategy.maxBytes(shareFetch.fetchParams().maxBytes, missingLogReadTopicPartitions.keySet(), topicPartitionData.size()));
         missingTopicPartitionsLogReadResponse.putAll(existingFetchedData);
         return missingTopicPartitionsLogReadResponse;
     }
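The strategy stays injectable through the new package-private overload. A usage sketch of how a test might pin it explicitly; `shareFetch`, `replicaManager`, `exceptionHandler`, and `sharePartitions` are assumed to be built elsewhere in the test, as at the existing four-argument call sites:

```java
// Hypothetical test setup: passing the strategy explicitly instead of relying on
// the four-argument constructor's UNIFORM default.
DelayedShareFetch delayedShareFetch = new DelayedShareFetch(
    shareFetch,
    replicaManager,
    exceptionHandler,
    sharePartitions,
    PartitionMaxBytesStrategy.type(PartitionMaxBytesStrategy.StrategyType.UNIFORM));
```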