Commit d29f1b9

[kafka] feat: Adding end_time functionality to kafka_consumer

This commit adds end_time functionality to the kafka_consumer function, making it more batch-processing friendly: bounding the read window with a fixed end timestamp lets repeated runs consume the same set of messages, allowing the user to achieve idempotency.

1 parent cd2f2c1 commit d29f1b9

3 files changed: +75, -13 lines
sources/kafka/__init__.py

Lines changed: 34 additions & 3 deletions
@@ -35,6 +35,7 @@ def kafka_consumer(
     batch_size: Optional[int] = 3000,
     batch_timeout: Optional[int] = 3,
     start_from: Optional[TAnyDateTime] = None,
+    end_time: Optional[TAnyDateTime] = None,
 ) -> Iterable[TDataItem]:
     """Extract recent messages from the given Kafka topics.

@@ -56,6 +57,8 @@ def kafka_consumer(
             consume, in seconds.
         start_from (Optional[TAnyDateTime]): A timestamp, at which to start
             reading. Older messages are ignored.
+        end_time (Optional[TAnyDateTime]): A timestamp, at which to stop
+            reading. Newer messages are ignored.

     Yields:
         Iterable[TDataItem]: Kafka messages.
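
For context, a minimal sketch of how the new parameter could be used from a dlt pipeline; the topic name, window, and pipeline settings are illustrative and not part of this commit:

    import dlt
    import pendulum

    from kafka import kafka_consumer  # the resource defined in sources/kafka/__init__.py

    # Read a fixed, reproducible window of messages. Re-running the
    # pipeline over the same window consumes the same offsets, which is
    # what makes the resource idempotent for batch processing.
    data = kafka_consumer(
        topics=["replay_topic"],  # illustrative topic name
        start_from=pendulum.datetime(2023, 12, 1, 0, 0, 0),
        end_time=pendulum.datetime(2023, 12, 1, 1, 0, 0),
    )

    pipeline = dlt.pipeline(
        pipeline_name="kafka_window",
        destination="duckdb",
        dataset_name="kafka_messages",
    )
    pipeline.run(data)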
@@ -78,7 +81,23 @@ def kafka_consumer(
     if start_from is not None:
         start_from = ensure_pendulum_datetime(start_from)

-    tracker = OffsetTracker(consumer, topics, dlt.current.resource_state(), start_from)
+    if end_time is not None:
+        end_time = ensure_pendulum_datetime(end_time)
+
+        if start_from is None:
+            raise ValueError("`start_from` must be provided if `end_time` is provided")
+
+        if start_from > end_time:
+            raise ValueError("`start_from` must be before `end_time`")
+
+        tracker = OffsetTracker(
+            consumer, topics, dlt.current.resource_state(), start_from, end_time
+        )
+
+    else:
+        tracker = OffsetTracker(
+            consumer, topics, dlt.current.resource_state(), start_from
+        )

     # read messages up to the maximum offsets,
     # not waiting for new messages
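
Both parameters are TAnyDateTime, so ISO strings, stdlib datetimes, and pendulum instances are all accepted; a small sketch of the normalization the guards above rely on, using dlt's ensure_pendulum_datetime helper (values are illustrative):

    from dlt.common.time import ensure_pendulum_datetime

    # Both values normalize to timezone-aware pendulum.DateTime objects,
    # so the `start_from > end_time` comparison is well defined.
    start = ensure_pendulum_datetime("2023-12-01T00:00:00+00:00")
    end = ensure_pendulum_datetime("2023-12-01T01:00:00+00:00")
    assert start < end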
@@ -97,7 +116,19 @@ def kafka_consumer(
             else:
                 raise err
         else:
-            batch.append(msg_processor(msg))
-            tracker.renew(msg)
+            topic = msg.topic()
+            partition = str(msg.partition())
+            current_offset = msg.offset()
+            max_offset = tracker[topic][partition]["max"]
+
+            # Only process the message if it's within the partition's max offset
+            if current_offset < max_offset:
+                batch.append(msg_processor(msg))
+                tracker.renew(msg)
+            else:
+                logger.info(
+                    f"Skipping message on {topic} partition {partition} at offset {current_offset} "
+                    f"- beyond max offset {max_offset}"
+                )

         yield batch
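
The consume loop now stops processing a partition's messages once they cross the tracker's precomputed "max" offset. Outside of dlt, the same bounded-read pattern looks roughly like this with plain confluent_kafka; the broker address, topic, and group id are placeholders:

    from confluent_kafka import Consumer, TopicPartition

    consumer = Consumer({
        "bootstrap.servers": "localhost:9092",  # placeholder broker
        "group.id": "bounded-reader",           # placeholder group id
        "auto.offset.reset": "earliest",
    })
    consumer.assign([TopicPartition("replay_topic", 0)])  # placeholder topic

    # Freeze the upper bound before reading, like the tracker's "max"
    # offset: anything produced after this point is ignored.
    _, max_offset = consumer.get_watermark_offsets(TopicPartition("replay_topic", 0))

    while True:
        msg = consumer.poll(timeout=3)
        if msg is None:
            break
        if msg.error():
            continue
        if msg.offset() >= max_offset:
            break  # beyond the frozen bound: stop, do not process
        print(msg.topic(), msg.partition(), msg.offset())

    consumer.close()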

sources/kafka/helpers.py

Lines changed: 41 additions & 10 deletions
@@ -1,7 +1,7 @@
 from typing import Any, Dict, List

 from confluent_kafka import Consumer, Message, TopicPartition  # type: ignore
-from confluent_kafka.admin import AdminClient, TopicMetadata  # type: ignore
+from confluent_kafka.admin import TopicMetadata  # type: ignore

 from dlt import config, secrets
 from dlt.common import pendulum
@@ -54,23 +54,26 @@ def default_msg_processor(msg: Message) -> Dict[str, Any]:
 class OffsetTracker(dict):  # type: ignore
     """Object to control offsets of the given topics.

-    Tracks all the partitions of the given topics with two params:
-    current offset and maximum offset (partition length).
+    Tracks all the partitions of the given topics with three params:
+    current offset, maximum offset (partition length), and an end time.

     Args:
         consumer (confluent_kafka.Consumer): Kafka consumer.
         topic_names (List): Names of topics to track.
         pl_state (DictStrAny): Pipeline current state.
         start_from (Optional[pendulum.DateTime]): A timestamp, after which messages
             are read. Older messages are ignored.
+        end_time (Optional[pendulum.DateTime]): A timestamp, before which messages
+            are read. Newer messages are ignored.
     """

     def __init__(
         self,
         consumer: Consumer,
         topic_names: List[str],
         pl_state: DictStrAny,
-        start_from: pendulum.DateTime = None,
+        start_from: Optional[pendulum.DateTime] = None,
+        end_time: Optional[pendulum.DateTime] = None,
     ):
         super().__init__()

@@ -82,7 +85,7 @@ def __init__(
             "offsets", {t_name: {} for t_name in topic_names}
         )

-        self._init_partition_offsets(start_from)
+        self._init_partition_offsets(start_from, end_time)

     def _read_topics(self, topic_names: List[str]) -> Dict[str, TopicMetadata]:
         """Read the given topics metadata from Kafka.
@@ -104,7 +107,11 @@ def _read_topics(self, topic_names: List[str]) -> Dict[str, TopicMetadata]:

         return tracked_topics

-    def _init_partition_offsets(self, start_from: pendulum.DateTime) -> None:
+    def _init_partition_offsets(
+        self,
+        start_from: Optional[pendulum.DateTime] = None,
+        end_time: Optional[pendulum.DateTime] = None,
+    ) -> None:
         """Designate current and maximum offsets for every partition.

         Current offsets are read from the state, if present. Set equal

@@ -113,6 +120,8 @@ def _init_partition_offsets(self, start_from: pendulum.DateTime) -> None:
         Args:
             start_from (pendulum.DateTime): A timestamp, at which to start
                 reading. Older messages are ignored.
+            end_time (pendulum.DateTime): A timestamp, before which messages
+                are read. Newer messages are ignored.
         """
         all_parts = []
         for t_name, topic in self._topics.items():
@@ -128,27 +137,49 @@ def _init_partition_offsets(self, start_from: pendulum.DateTime) -> None:
                 for part in topic.partitions
             ]

-            # get offsets for the timestamp, if given
-            if start_from is not None:
+            # get offsets for the timestamp ranges, if given
+            if start_from is not None and end_time is not None:
+                start_ts_offsets = self._consumer.offsets_for_times(parts)
+                end_ts_offsets = self._consumer.offsets_for_times(
+                    [
+                        TopicPartition(t_name, part, end_time.int_timestamp * 1000)
+                        for part in topic.partitions
+                    ]
+                )
+            elif start_from is not None:
                 ts_offsets = self._consumer.offsets_for_times(parts)

             # designate current and maximum offsets for every partition
             for i, part in enumerate(parts):
                 max_offset = self._consumer.get_watermark_offsets(part)[1]

-                if start_from is not None:
+                if start_from is not None and end_time is not None:
+                    if start_ts_offsets[i].offset != -1:
+                        cur_offset = start_ts_offsets[i].offset
+                    else:
+                        cur_offset = max_offset - 1
+                    if end_ts_offsets[i].offset != -1:
+                        end_offset = end_ts_offsets[i].offset
+                    else:
+                        end_offset = max_offset
+
+                elif start_from is not None:
                     if ts_offsets[i].offset != -1:
                         cur_offset = ts_offsets[i].offset
                     else:
                         cur_offset = max_offset - 1
+
+                    end_offset = max_offset
+
                 else:
                     cur_offset = (
                         self._cur_offsets[t_name].get(str(part.partition), -1) + 1
                     )
+                    end_offset = max_offset

                 self[t_name][str(part.partition)] = {
                     "cur": cur_offset,
-                    "max": max_offset,
+                    "max": end_offset,
                 }

                 parts[i].offset = cur_offset
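
The end bound is resolved with Consumer.offsets_for_times, which treats the offset field of each TopicPartition as a millisecond timestamp and returns the earliest offset at or after it, or -1 if no such message exists (hence the max_offset fallbacks above). A minimal standalone sketch, with broker, group id, and topic as placeholders:

    import pendulum
    from confluent_kafka import Consumer, TopicPartition

    consumer = Consumer({
        "bootstrap.servers": "localhost:9092",  # placeholder broker
        "group.id": "ts-lookup",                # placeholder group id
    })

    end_time = pendulum.datetime(2023, 12, 1, 1, 0, 0)

    # offsets_for_times expects the timestamp (ms since epoch) passed
    # through the `offset` field of each TopicPartition.
    query = [TopicPartition("replay_topic", 0, end_time.int_timestamp * 1000)]
    result = consumer.offsets_for_times(query, timeout=10)

    for tp in result:
        # offset == -1 means no message at/after the timestamp: fall back
        # to the high watermark, as _init_partition_offsets does.
        print(tp.topic, tp.partition, tp.offset)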

sources/kafka/sources/kafka/__init__.py

Whitespace-only changes.
