1
1
from typing import Any , Dict , List
2
2
3
3
from confluent_kafka import Consumer , Message , TopicPartition # type: ignore
4
- from confluent_kafka .admin import AdminClient , TopicMetadata # type: ignore
4
+ from confluent_kafka .admin import TopicMetadata # type: ignore
5
5
6
6
from dlt import config , secrets
7
7
from dlt .common import pendulum
@@ -54,23 +54,26 @@ def default_msg_processor(msg: Message) -> Dict[str, Any]:
54
54
class OffsetTracker (dict ): # type: ignore
55
55
"""Object to control offsets of the given topics.
56
56
57
- Tracks all the partitions of the given topics with two params:
58
- current offset and maximum offset (partition length).
57
+ Tracks all the partitions of the given topics with two params:
58
+ current offset and maximum offset (the partition length, or the offset found for the end time).
59
59
60
60
Args:
61
61
consumer (confluent_kafka.Consumer): Kafka consumer.
62
62
topic_names (List): Names of topics to track.
63
63
pl_state (DictStrAny): Pipeline current state.
64
64
start_from (Optional[pendulum.DateTime]): A timestamp, after which messages
65
65
are read. Older messages are ignored.
66
+ end_time (Optional[pendulum.DateTime]): A timestamp, before which messages
67
+ are read. Newer messages are ignored. Only applied when start_from is also given.
66
68
"""
67
69
68
70
def __init__ (
69
71
self ,
70
72
consumer : Consumer ,
71
73
topic_names : List [str ],
72
74
pl_state : DictStrAny ,
73
- start_from : pendulum .DateTime = None ,
75
+ start_from : Optional [pendulum .DateTime ] = None ,
76
+ end_time : Optional [pendulum .DateTime ] = None ,
74
77
):
75
78
super ().__init__ ()
76
79
@@ -82,7 +85,7 @@ def __init__(
82
85
"offsets" , {t_name : {} for t_name in topic_names }
83
86
)
84
87
85
- self ._init_partition_offsets (start_from )
88
+ self ._init_partition_offsets (start_from , end_time )
86
89
87
90
def _read_topics (self , topic_names : List [str ]) -> Dict [str , TopicMetadata ]:
88
91
"""Read the given topics metadata from Kafka.
@@ -104,7 +107,11 @@ def _read_topics(self, topic_names: List[str]) -> Dict[str, TopicMetadata]:
104
107
105
108
return tracked_topics
106
109
107
- def _init_partition_offsets (self , start_from : pendulum .DateTime ) -> None :
110
+ def _init_partition_offsets (
111
+ self ,
112
+ start_from : Optional [pendulum .DateTime ] = None ,
113
+ end_time : Optional [pendulum .DateTime ] = None ,
114
+ ) -> None :
108
115
"""Designate current and maximum offsets for every partition.
109
116
110
117
Current offsets are read from the state, if present. Set equal
@@ -113,6 +120,8 @@ def _init_partition_offsets(self, start_from: pendulum.DateTime) -> None:
113
120
Args:
114
121
start_from (pendulum.DateTime): A timestamp, at which to start
115
122
reading. Older messages are ignored.
123
+ end_time (pendulum.DateTime): A timestamp, before which messages
124
+ are read. Newer messages are ignored. Has no effect unless start_from is also given.
116
125
"""
117
126
all_parts = []
118
127
for t_name , topic in self ._topics .items ():
@@ -128,27 +137,49 @@ def _init_partition_offsets(self, start_from: pendulum.DateTime) -> None:
128
137
for part in topic .partitions
129
138
]
130
139
131
- # get offsets for the timestamp, if given
132
- if start_from is not None :
140
+ # get offsets for the timestamp ranges, if given
141
+ if start_from is not None and end_time is not None :
142
+ start_ts_offsets = self ._consumer .offsets_for_times (parts )
143
+ end_ts_offsets = self ._consumer .offsets_for_times (
144
+ [
145
+ TopicPartition (t_name , part , end_time .int_timestamp * 1000 )
146
+ for part in topic .partitions
147
+ ]
148
+ )
149
+ elif start_from is not None :
133
150
ts_offsets = self ._consumer .offsets_for_times (parts )
134
151
135
152
# designate current and maximum offsets for every partition
136
153
for i , part in enumerate (parts ):
137
154
max_offset = self ._consumer .get_watermark_offsets (part )[1 ]
138
155
139
- if start_from is not None :
156
+ if start_from is not None and end_time is not None :
157
+ if start_ts_offsets [i ].offset != - 1 :
158
+ cur_offset = start_ts_offsets [i ].offset
159
+ else :
160
+ cur_offset = max_offset - 1
161
+ if end_ts_offsets [i ].offset != - 1 :
162
+ end_offset = end_ts_offsets [i ].offset
163
+ else :
164
+ end_offset = max_offset
165
+
166
+ elif start_from is not None :
140
167
if ts_offsets [i ].offset != - 1 :
141
168
cur_offset = ts_offsets [i ].offset
142
169
else :
143
170
cur_offset = max_offset - 1
171
+
172
+ end_offset = max_offset
173
+
144
174
else :
145
175
cur_offset = (
146
176
self ._cur_offsets [t_name ].get (str (part .partition ), - 1 ) + 1
147
177
)
178
+ end_offset = max_offset
148
179
149
180
self [t_name ][str (part .partition )] = {
150
181
"cur" : cur_offset ,
151
- "max" : max_offset ,
182
+ "max" : end_offset ,
152
183
}
153
184
154
185
parts [i ].offset = cur_offset
0 commit comments