
Commit 4cb2037

Authored Mar 27, 2024
MINOR: Add retry mechanism to EOS example (#15561)
In the initial EOS example, retry logic was implemented within the resetToLastCommittedPositions method. During refactoring, this logic was removed because a poison pill prevented the example from reaching the final phase of consuming from the output topic. In this change, I suggest adding it back, but with a retry limit defined as MAX_RETRIES. Once this limit is reached, the problematic batch is logged and skipped, allowing the processor to move on and process the remaining records. If some records are skipped, the example will still hit the hard timeout (2 minutes), but only after consuming all processed records.

Reviewers: Luke Chen <showuon@gmail.com>
1 parent: 9326476

3 files changed: +53 -6 lines

examples/src/main/java/kafka/examples/Consumer.java (+1 -1)

```diff
@@ -109,7 +109,7 @@ public void run() {
                 }
             }
         } catch (Throwable e) {
-            Utils.printOut("Unhandled exception");
+            Utils.printErr("Unhandled exception");
             e.printStackTrace();
         }
         Utils.printOut("Fetched %d records", numRecords - remainingRecords);
```

examples/src/main/java/kafka/examples/ExactlyOnceMessageProcessor.java (+51 -4)

```diff
@@ -36,6 +36,7 @@
 
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Optional;
@@ -49,6 +50,8 @@
  * This class implements a read-process-write application.
  */
 public class ExactlyOnceMessageProcessor extends Thread implements ConsumerRebalanceListener, AutoCloseable {
+    private static final int MAX_RETRIES = 5;
+
     private final String bootstrapServers;
     private final String inputTopic;
     private final String outputTopic;
@@ -103,19 +106,21 @@ public ExactlyOnceMessageProcessor(String threadName,
 
     @Override
     public void run() {
+        int retries = 0;
         int processedRecords = 0;
         long remainingRecords = Long.MAX_VALUE;
+
         // it is recommended to have a relatively short txn timeout in order to clear pending offsets faster
         int transactionTimeoutMs = 10_000;
         // consumer must be in read_committed mode, which means it won't be able to read uncommitted data
         boolean readCommitted = true;
+
         try (KafkaProducer<Integer, String> producer = new Producer("processor-producer", bootstrapServers, outputTopic,
                 true, transactionalId, true, -1, transactionTimeoutMs, null).createKafkaProducer();
              KafkaConsumer<Integer, String> consumer = new Consumer("processor-consumer", bootstrapServers, inputTopic,
                  "processor-group", Optional.of(groupInstanceId), readCommitted, -1, null).createKafkaConsumer()) {
             // called first and once to fence zombies and abort any pending transaction
             producer.initTransactions();
-
             consumer.subscribe(singleton(inputTopic), this);
 
             Utils.printOut("Processing new records");
@@ -140,6 +145,7 @@ public void run() {
                         // commit the transaction including offsets
                         producer.commitTransaction();
                         processedRecords += records.count();
+                        retries = 0;
                     }
                 } catch (AuthorizationException | UnsupportedVersionException | ProducerFencedException
                              | FencedInstanceIdException | OutOfOrderSequenceException | SerializationException e) {
@@ -151,18 +157,21 @@ public void run() {
                     Utils.printOut("Invalid or no offset found, using latest");
                     consumer.seekToEnd(emptyList());
                     consumer.commitSync();
+                    retries = 0;
                 } catch (KafkaException e) {
-                    // abort the transaction and try to continue
-                    Utils.printOut("Aborting transaction: %s", e);
+                    // abort the transaction
+                    Utils.printOut("Aborting transaction: %s", e.getMessage());
                     producer.abortTransaction();
+                    retries = maybeRetry(retries, consumer);
                 }
+
                 remainingRecords = getRemainingRecords(consumer);
                 if (remainingRecords != Long.MAX_VALUE) {
                     Utils.printOut("Remaining records: %d", remainingRecords);
                 }
             }
         } catch (Throwable e) {
-            Utils.printOut("Unhandled exception");
+            Utils.printErr("Unhandled exception");
             e.printStackTrace();
         }
         Utils.printOut("Processed %d records", processedRecords);
@@ -215,6 +224,44 @@ private long getRemainingRecords(KafkaConsumer<Integer, String> consumer) {
         }).sum();
     }
 
+    /**
+     * When we get a generic {@code KafkaException} while processing records, we retry up to {@code MAX_RETRIES} times.
+     * If we exceed this threshold, we log an error and move on to the next batch of records.
+     * In a real world application you may want to send these records to a dead letter topic (DLT) for further processing.
+     *
+     * @param retries Current number of retries
+     * @param consumer Consumer instance
+     * @return Updated number of retries
+     */
+    private int maybeRetry(int retries, KafkaConsumer<Integer, String> consumer) {
+        if (retries < 0) {
+            Utils.printErr("The number of retries must be greater than zero");
+            shutdown();
+        }
+
+        if (retries < MAX_RETRIES) {
+            // retry: reset fetch offset
+            // the consumer fetch position needs to be restored to the committed offset before the transaction started
+            Map<TopicPartition, OffsetAndMetadata> committed = consumer.committed(consumer.assignment());
+            consumer.assignment().forEach(tp -> {
+                OffsetAndMetadata offsetAndMetadata = committed.get(tp);
+                if (offsetAndMetadata != null) {
+                    consumer.seek(tp, offsetAndMetadata.offset());
+                } else {
+                    consumer.seekToBeginning(Collections.singleton(tp));
+                }
+            });
+            retries++;
+        } else {
+            // continue: skip records
+            // the consumer fetch position needs to be committed as if records were processed successfully
+            Utils.printErr("Skipping records after %d retries", MAX_RETRIES);
+            consumer.commitSync();
+            retries = 0;
+        }
+        return retries;
+    }
+
     @Override
     public void close() throws Exception {
         if (producer != null) {
```
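
The maybeRetry Javadoc above points to a dead letter topic (DLT) as what a real-world application might do instead of silently skipping a failed batch. A minimal sketch of that idea follows; the dltProducer, the "processor-dlt" topic, and the assumption that the failed batch is still at hand are all hypothetical and not part of this commit.

```java
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

// Hypothetical variant of the skip branch: before committing past the failed
// batch, forward the raw records to a DLT so they can be inspected or
// reprocessed later. dltProducer and "processor-dlt" are assumptions,
// not part of this commit.
private void sendToDeadLetterTopic(ConsumerRecords<Integer, String> failedBatch,
                                   KafkaProducer<Integer, String> dltProducer) {
    for (ConsumerRecord<Integer, String> record : failedBatch) {
        // a plain non-transactional send is enough here: a duplicate in the
        // DLT is acceptable, losing the record is not
        dltProducer.send(new ProducerRecord<>("processor-dlt", record.key(), record.value()),
            (metadata, exception) -> {
                if (exception != null) {
                    exception.printStackTrace();
                }
            });
    }
    // make sure everything is on the wire before the offsets are committed
    dltProducer.flush();
}
```

Using a separate non-transactional producer keeps the DLT writes outside the aborted transaction, so the forwarded records survive even though the batch itself was rolled back.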

examples/src/main/java/kafka/examples/Producer.java (+1 -1)

```diff
@@ -89,7 +89,7 @@ public void run() {
                 sentRecords++;
             }
         } catch (Throwable e) {
-            Utils.printOut("Unhandled exception");
+            Utils.printErr("Unhandled exception");
             e.printStackTrace();
         }
         Utils.printOut("Sent %d records", sentRecords);
```
