@@ -5,6 +5,7 @@
 import copy
 import logging
 import threading
+import time
 from collections import OrderedDict
 from copy import deepcopy
 from datetime import timedelta
@@ -58,7 +59,8 @@ class ConcurrentPerPartitionCursor(Cursor):
     CurrentPerPartitionCursor expects the state of the ConcurrentCursor to follow the format {cursor_field: cursor_value}.
     """
 
-    DEFAULT_MAX_PARTITIONS_NUMBER = 10000
+    DEFAULT_MAX_PARTITIONS_NUMBER = 25_000
+    SWITCH_TO_GLOBAL_LIMIT = 10_000
     _NO_STATE: Mapping[str, Any] = {}
     _NO_CURSOR_STATE: Mapping[str, Any] = {}
     _GLOBAL_STATE_KEY = "state"
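
This hunk raises the hard cap on cursors kept in memory to 25,000 and introduces a lower threshold of 10,000 at which the stream stops tracking state per partition and degrades to a single global cursor. The sketch below shows how two such thresholds can interact; the class and numbers mirror the diff, but it is an illustrative stand-in, not the ConcurrentPerPartitionCursor API.

# Illustrative sketch only: a switch-to-global threshold coexisting with a hard cap.
class PartitionBudget:
    SWITCH_TO_GLOBAL_LIMIT = 10_000         # stop per-partition tracking past this point
    DEFAULT_MAX_PARTITIONS_NUMBER = 25_000  # hard cap on cursors kept in memory

    def __init__(self) -> None:
        self.number_of_partitions = 0
        self.use_global_cursor = False

    def register_partition(self) -> None:
        self.number_of_partitions += 1
        if not self.use_global_cursor and self.number_of_partitions > self.SWITCH_TO_GLOBAL_LIMIT:
            # Degrade gracefully instead of growing per-partition state forever.
            self.use_global_cursor = True

budget = PartitionBudget()
for _ in range(10_001):
    budget.register_partition()
assert budget.use_global_cursor
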
@@ -99,9 +101,11 @@ def __init__(
         self._new_global_cursor: Optional[StreamState] = None
         self._lookback_window: int = 0
         self._parent_state: Optional[StreamState] = None
-        self._over_limit: int = 0
+        self._number_of_partitions: int = 0
         self._use_global_cursor: bool = False
         self._partition_serializer = PerPartitionKeySerializer()
+        # Track the last time a state message was emitted
+        self._last_emission_time: float = 0.0
 
         self._set_initial_state(stream_state)
 
@@ -141,21 +145,16 @@ def close_partition(self, partition: Partition) -> None:
             raise ValueError("stream_slice cannot be None")
 
         partition_key = self._to_partition_key(stream_slice.partition)
-        self._cursor_per_partition[partition_key].close_partition(partition=partition)
         with self._lock:
             self._semaphore_per_partition[partition_key].acquire()
-            cursor = self._cursor_per_partition[partition_key]
-            if (
-                partition_key in self._finished_partitions
-                and self._semaphore_per_partition[partition_key]._value == 0
-            ):
+            if not self._use_global_cursor:
+                self._cursor_per_partition[partition_key].close_partition(partition=partition)
+                cursor = self._cursor_per_partition[partition_key]
                 if (
-                    self._new_global_cursor is None
-                    or self._new_global_cursor[self.cursor_field.cursor_field_key]
-                    < cursor.state[self.cursor_field.cursor_field_key]
+                    partition_key in self._finished_partitions
+                    and self._semaphore_per_partition[partition_key]._value == 0
                 ):
-                    self._new_global_cursor = copy.deepcopy(cursor.state)
-            if not self._use_global_cursor:
+                    self._update_global_cursor(cursor.state[self.cursor_field.cursor_field_key])
                 self._emit_state_message()
 
     def ensure_at_least_one_state_emitted(self) -> None:
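
After this hunk, the per-partition bookkeeping in close_partition (closing the inner cursor, checking the slice semaphore, and folding the partition's cursor value into the global high-water mark) only runs while the stream is still in per-partition mode. Below is a hedged, standalone sketch of that flow with stand-in names and data; peeking at the semaphore's private _value mirrors the diff, the rest is illustrative and not the CDK implementation.

import threading

# A semaphore per partition counts outstanding slices; when the partition is
# finished and the count drops to zero, its cursor value is folded into a
# global high-water mark.
semaphores = {"partition-a": threading.Semaphore(0)}
finished_partitions = {"partition-a"}
per_partition_state = {"partition-a": {"updated_at": "2024-05-01"}}
new_global_cursor = None
use_global_cursor = False

def close_partition(partition_key: str) -> None:
    global new_global_cursor
    semaphores[partition_key].acquire()  # one slice of this partition completed
    if not use_global_cursor:
        cursor_value = per_partition_state[partition_key]["updated_at"]
        if partition_key in finished_partitions and semaphores[partition_key]._value == 0:
            if new_global_cursor is None or new_global_cursor < cursor_value:
                new_global_cursor = cursor_value

semaphores["partition-a"].release()  # a slice was handed out earlier
close_partition("partition-a")
print(new_global_cursor)  # 2024-05-01
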
@@ -169,9 +168,23 @@ def ensure_at_least_one_state_emitted(self) -> None:
             self._global_cursor = self._new_global_cursor
             self._lookback_window = self._timer.finish()
             self._parent_state = self._partition_router.get_stream_state()
-        self._emit_state_message()
+        self._emit_state_message(throttle=False)
 
-    def _emit_state_message(self) -> None:
+    def _throttle_state_message(self) -> Optional[float]:
+        """
+        Throttles the state message emission to once every 60 seconds.
+        """
+        current_time = time.time()
+        if current_time - self._last_emission_time <= 60:
+            return None
+        return current_time
+
+    def _emit_state_message(self, throttle: bool = True) -> None:
+        if throttle:
+            current_time = self._throttle_state_message()
+            if current_time is None:
+                return
+            self._last_emission_time = current_time
         self._connector_state_manager.update_state_for_stream(
             self._stream_name,
             self._stream_namespace,
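
The new _throttle_state_message/_emit_state_message(throttle=...) pair rate-limits checkpoint emission to at most once every 60 seconds, while ensure_at_least_one_state_emitted bypasses the throttle so the final state is never dropped. A minimal standalone sketch of the same pattern follows; the class and names are assumptions for illustration, not the CDK API.

import time
from typing import Optional

class ThrottledEmitter:
    def __init__(self, interval_seconds: float = 60.0) -> None:
        self._interval = interval_seconds
        # 0.0 means "never emitted", so the first call always passes the check.
        self._last_emission_time = 0.0

    def _throttle(self) -> Optional[float]:
        now = time.time()
        if now - self._last_emission_time <= self._interval:
            return None  # too soon, skip this emission
        return now

    def emit(self, state: dict, throttle: bool = True) -> bool:
        if throttle:
            now = self._throttle()
            if now is None:
                return False
            self._last_emission_time = now
        print(f"STATE: {state}")
        return True

emitter = ThrottledEmitter()
assert emitter.emit({"cursor": 1}) is True                   # first emission goes through
assert emitter.emit({"cursor": 2}) is False                  # throttled within the interval
assert emitter.emit({"cursor": 3}, throttle=False) is True   # forced flush, e.g. at stream end
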
@@ -202,6 +215,7 @@ def _generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[St
             self._lookback_window if self._global_cursor else 0,
         )
         with self._lock:
+            self._number_of_partitions += 1
             self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
             self._semaphore_per_partition[self._to_partition_key(partition.partition)] = (
                 threading.Semaphore(0)
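
Every generated partition now bumps _number_of_partitions under the lock, alongside its cursor and slice semaphore. Counting with a dedicated integer rather than len(self._cursor_per_partition) matters because the dict is pruned once it reaches the cap, so its length stops reflecting how many partitions were ever seen. A small illustrative sketch of that distinction (made-up names and cap):

from collections import OrderedDict

MAX_KEPT = 3
cursors = OrderedDict()
number_of_partitions = 0

for key in ["p1", "p2", "p3", "p4", "p5"]:
    number_of_partitions += 1
    cursors[key] = {"cursor": None}
    while len(cursors) > MAX_KEPT:
        cursors.popitem(last=False)  # drop the oldest entry once over the cap

print(len(cursors))          # 3 -> only what is still held in memory
print(number_of_partitions)  # 5 -> what was actually generated
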
@@ -232,9 +246,15 @@ def _ensure_partition_limit(self) -> None:
         - Logs a warning each time a partition is removed, indicating whether it was finished
           or removed due to being the oldest.
         """
+        if not self._use_global_cursor and self.limit_reached():
+            logger.info(
+                f"Exceeded the 'SWITCH_TO_GLOBAL_LIMIT' of {self.SWITCH_TO_GLOBAL_LIMIT}. "
+                f"Switching to global cursor for {self._stream_name}."
+            )
+            self._use_global_cursor = True
+
         with self._lock:
             while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
-                self._over_limit += 1
                 # Try removing finished partitions first
                 for partition_key in list(self._cursor_per_partition.keys()):
                     if (
@@ -245,7 +265,7 @@ def _ensure_partition_limit(self) -> None:
                             partition_key
                         )  # Remove the oldest partition
                         logger.warning(
-                            f"The maximum number of partitions has been reached. Dropping the oldest finished partition: {oldest_partition}. Over limit: {self._over_limit}."
+                            f"The maximum number of partitions has been reached. Dropping the oldest finished partition: {oldest_partition}. Over limit: {self._number_of_partitions - self.DEFAULT_MAX_PARTITIONS_NUMBER}."
                         )
                         break
                 else:
@@ -254,7 +274,7 @@ def _ensure_partition_limit(self) -> None:
                         1
                     ]  # Remove the oldest partition
                     logger.warning(
-                        f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
+                        f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._number_of_partitions - self.DEFAULT_MAX_PARTITIONS_NUMBER}."
                     )
 
     def _set_initial_state(self, stream_state: StreamState) -> None:
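
Taken together, these _ensure_partition_limit hunks first flip the stream into global-cursor mode once limit_reached() trips, then keep the cursor dict under DEFAULT_MAX_PARTITIONS_NUMBER by preferring to evict partitions that are already finished and only falling back to the oldest entry via OrderedDict.popitem(last=False); the warning now reports the overflow as _number_of_partitions minus the cap. A compact sketch of the for/else eviction preference, with made-up data rather than the CDK types:

from collections import OrderedDict

# Prefer evicting a finished partition; otherwise drop the oldest entry.
cursors = OrderedDict([("p1", "c1"), ("p2", "c2"), ("p3", "c3")])
finished = {"p2"}
MAX_PARTITIONS = 2

while len(cursors) > MAX_PARTITIONS - 1:
    for partition_key in list(cursors.keys()):
        if partition_key in finished:
            cursors.pop(partition_key)  # drop a finished partition first
            break
    else:
        cursors.popitem(last=False)  # no finished partition left: drop the oldest

print(list(cursors.keys()))  # ['p3'] -> p2 evicted as finished, then p1 as oldest
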
@@ -314,6 +334,7 @@ def _set_initial_state(self, stream_state: StreamState) -> None:
             self._lookback_window = int(stream_state.get("lookback_window", 0))
 
             for state in stream_state.get(self._PERPARTITION_STATE_KEY, []):
+                self._number_of_partitions += 1
                 self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
                     self._create_cursor(state["cursor"])
                 )
@@ -354,16 +375,26 @@ def _set_global_state(self, stream_state: Mapping[str, Any]) -> None:
         self._new_global_cursor = deepcopy(fixed_global_state)
 
     def observe(self, record: Record) -> None:
-        if not self._use_global_cursor and self.limit_reached():
-            self._use_global_cursor = True
-
         if not record.associated_slice:
             raise ValueError(
                 "Invalid state as stream slices that are emitted should refer to an existing cursor"
             )
-        self._cursor_per_partition[
-            self._to_partition_key(record.associated_slice.partition)
-        ].observe(record)
+
+        record_cursor = self._connector_state_converter.output_format(
+            self._connector_state_converter.parse_value(self._cursor_field.extract_value(record))
+        )
+        self._update_global_cursor(record_cursor)
+        if not self._use_global_cursor:
+            self._cursor_per_partition[
+                self._to_partition_key(record.associated_slice.partition)
+            ].observe(record)
+
+    def _update_global_cursor(self, value: Any) -> None:
+        if (
+            self._new_global_cursor is None
+            or self._new_global_cursor[self.cursor_field.cursor_field_key] < value
+        ):
+            self._new_global_cursor = {self.cursor_field.cursor_field_key: copy.deepcopy(value)}
 
     def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
         return self._partition_serializer.to_partition_key(partition)
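
observe now extracts and normalizes the record's cursor value and always feeds it into _update_global_cursor, which keeps _new_global_cursor as a monotonically increasing high-water mark; per-partition cursors are only updated while _use_global_cursor is still off. The sketch below shows only the high-water-mark update, with an assumed cursor field name and plain string values in place of the state-converter round-trip, so it is illustrative rather than the CDK implementation.

import copy
from typing import Any, Optional

CURSOR_FIELD_KEY = "updated_at"  # assumed cursor field name for the example
new_global_cursor: Optional[dict] = None

def update_global_cursor(value: Any) -> None:
    # Keep only the maximum cursor value seen so far across all partitions.
    global new_global_cursor
    if new_global_cursor is None or new_global_cursor[CURSOR_FIELD_KEY] < value:
        new_global_cursor = {CURSOR_FIELD_KEY: copy.deepcopy(value)}

for record_value in ["2024-01-01", "2024-03-01", "2024-02-01"]:
    update_global_cursor(record_value)

print(new_global_cursor)  # {'updated_at': '2024-03-01'} -> out-of-order records cannot move it backwards
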
@@ -397,4 +428,4 @@ def _get_cursor(self, record: Record) -> ConcurrentCursor:
         return cursor
 
     def limit_reached(self) -> bool:
-        return self._over_limit > self.DEFAULT_MAX_PARTITIONS_NUMBER
+        return self._number_of_partitions > self.SWITCH_TO_GLOBAL_LIMIT