@@ -44,16 +44,21 @@ class AsyncPartition:
     This bucket of api_jobs is a bit useless for this iteration but should become interesting when we will be able to split jobs
     """
 
-    _MAX_NUMBER_OF_ATTEMPTS = 3
+    _DEFAULT_MAX_JOB_RETRY = 3
 
-    def __init__(self, jobs: List[AsyncJob], stream_slice: StreamSlice) -> None:
+    def __init__(
+        self, jobs: List[AsyncJob], stream_slice: StreamSlice, job_max_retry: Optional[int] = None
+    ) -> None:
         self._attempts_per_job = {job: 1 for job in jobs}
         self._stream_slice = stream_slice
+        self._job_max_retry = (
+            job_max_retry if job_max_retry is not None else self._DEFAULT_MAX_JOB_RETRY
+        )
 
     def has_reached_max_attempt(self) -> bool:
         return any(
             map(
-                lambda attempt_count: attempt_count >= self._MAX_NUMBER_OF_ATTEMPTS,
+                lambda attempt_count: attempt_count >= self._job_max_retry,
                 self._attempts_per_job.values(),
             )
         )
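As a quick illustration of the hunk above, here is a minimal sketch of how the new `job_max_retry` argument selects the retry cutoff for a partition. The import path and the `Mock` stand-ins for `AsyncJob`/`StreamSlice` are assumptions made for the sake of a runnable snippet, not part of this diff:

```python
from unittest.mock import Mock

# Assumed import path; adjust to wherever AsyncPartition lives in your CDK version.
from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncPartition

job, stream_slice = Mock(), Mock()  # hypothetical stand-ins for AsyncJob / StreamSlice

# Omitting job_max_retry (or passing None) falls back to _DEFAULT_MAX_JOB_RETRY == 3.
default_partition = AsyncPartition([job], stream_slice)

# An explicit value overrides the default for this partition only.
patient_partition = AsyncPartition([job], stream_slice, job_max_retry=10)

# Every job starts with an attempt count of 1, so neither partition is exhausted yet.
assert not default_partition.has_reached_max_attempt()
assert not patient_partition.has_reached_max_attempt()
```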
@@ -62,7 +67,7 @@ def replace_job(self, job_to_replace: AsyncJob, new_jobs: List[AsyncJob]) -> Non
         current_attempt_count = self._attempts_per_job.pop(job_to_replace, None)
         if current_attempt_count is None:
             raise ValueError("Could not find job to replace")
-        elif current_attempt_count >= self._MAX_NUMBER_OF_ATTEMPTS:
+        elif current_attempt_count >= self._job_max_retry:
             raise ValueError(f"Max attempt reached for job in partition {self._stream_slice}")
 
         new_attempt_count = current_attempt_count + 1
@@ -155,6 +160,7 @@ def __init__(
         message_repository: MessageRepository,
         exceptions_to_break_on: Iterable[Type[Exception]] = tuple(),
         has_bulk_parent: bool = False,
+        job_max_retry: Optional[int] = None,
     ) -> None:
         """
         If the stream slices provided as a parameters relies on a async job streams that relies on the same JobTracker, `has_bulk_parent`
@@ -175,6 +181,7 @@ def __init__(
         self._message_repository = message_repository
         self._exceptions_to_break_on: Tuple[Type[Exception], ...] = tuple(exceptions_to_break_on)
         self._has_bulk_parent = has_bulk_parent
+        self._job_max_retry = job_max_retry
 
         self._non_breaking_exceptions: List[Exception] = []
 
@@ -214,7 +221,7 @@ def _start_jobs(self) -> None:
             for _slice in self._slice_iterator:
                 at_least_one_slice_consumed_from_slice_iterator_during_current_iteration = True
                 job = self._start_job(_slice)
-                self._running_partitions.append(AsyncPartition([job], _slice))
+                self._running_partitions.append(AsyncPartition([job], _slice, self._job_max_retry))
                 if self._has_bulk_parent and self._slice_iterator.has_next():
                     break
         except ConcurrentJobLimitReached:
@@ -359,14 +366,11 @@ def _process_running_partitions_and_yield_completed_ones(
                     self._process_partitions_with_errors(partition)
                 case _:
                     self._stop_timed_out_jobs(partition)
+                    # re-allocate FAILED jobs, but TIMEOUT jobs are not re-allocated
+                    self._reallocate_partition(current_running_partitions, partition)
 
-                    # job will be restarted in `_start_job`
-                    current_running_partitions.insert(0, partition)
-
-            for job in partition.jobs:
-                # We only remove completed jobs as we want failed/timed out jobs to be re-allocated in priority
-                if job.status() == AsyncJobStatus.COMPLETED:
-                    self._job_tracker.remove_job(job.api_job_id())
+            # We only remove completed / timed out jobs as we want failed jobs to be re-allocated in priority
+            self._remove_completed_jobs(partition)
 
         # update the referenced list with running partitions
         self._running_partitions = current_running_partitions
@@ -381,7 +385,6 @@ def _stop_partition(self, partition: AsyncPartition) -> None:
     def _stop_timed_out_jobs(self, partition: AsyncPartition) -> None:
         for job in partition.jobs:
             if job.status() == AsyncJobStatus.TIMED_OUT:
-                # we don't free allocation here because it is expected to retry the job
                 self._abort_job(job, free_job_allocation=False)
 
     def _abort_job(self, job: AsyncJob, free_job_allocation: bool = True) -> None:
@@ -392,6 +395,31 @@ def _abort_job(self, job: AsyncJob, free_job_allocation: bool = True) -> None:
         except Exception as exception:
             LOGGER.warning(f"Could not free budget for job {job.api_job_id()}: {exception}")
 
+    def _remove_completed_jobs(self, partition: AsyncPartition) -> None:
+        """
+        Remove completed or timed out jobs from the partition.
+
+        Args:
+            partition (AsyncPartition): The partition to process.
+        """
+        for job in partition.jobs:
+            if job.status() == AsyncJobStatus.COMPLETED:
+                self._job_tracker.remove_job(job.api_job_id())
+
+    def _reallocate_partition(
+        self,
+        current_running_partitions: List[AsyncPartition],
+        partition: AsyncPartition,
+    ) -> None:
+        """
+        Reallocate the partition by starting a new job for each job in the
+        partition.
+        Args:
+            current_running_partitions (list): The list of currently running partitions.
+            partition (AsyncPartition): The partition to reallocate.
+        """
+        current_running_partitions.insert(0, partition)
+
     def _process_partitions_with_errors(self, partition: AsyncPartition) -> None:
         """
         Process a partition with status errors (FAILED and TIMEOUT).
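To make the two new helpers concrete, here is a toy, self-contained sketch of the behaviour they implement, using hypothetical `Tracker`/`Job` stand-ins rather than the CDK classes: a partition containing failed jobs goes back to the front of the running list so it is retried first, while only COMPLETED jobs release their slot in the job tracker.

```python
from dataclasses import dataclass, field
from typing import List, Set


@dataclass
class Tracker:                       # stand-in for the CDK's JobTracker
    active: Set[str] = field(default_factory=set)

    def remove_job(self, job_id: str) -> None:
        self.active.discard(job_id)  # frees one slot of concurrency budget


@dataclass
class Job:                           # stand-in for AsyncJob
    job_id: str
    status: str                      # "COMPLETED", "FAILED", ...


def reallocate_partition(running: List[List[Job]], partition: List[Job]) -> None:
    # mirrors _reallocate_partition: re-queued at the front so it is retried in priority
    running.insert(0, partition)


def remove_completed_jobs(tracker: Tracker, partition: List[Job]) -> None:
    # mirrors _remove_completed_jobs: only COMPLETED jobs release tracker budget;
    # failed jobs keep theirs because they are about to be restarted
    for job in partition:
        if job.status == "COMPLETED":
            tracker.remove_job(job.job_id)


tracker = Tracker(active={"a", "b"})
partition = [Job("a", "COMPLETED"), Job("b", "FAILED")]
running: List[List[Job]] = []

reallocate_partition(running, partition)
remove_completed_jobs(tracker, partition)
assert running == [partition] and tracker.active == {"b"}
```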