Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
5110d63
docs: add implementation plan for resume mechanism
przemekboruta Apr 13, 2026
6afb638
feat(storage): add resume flag and clear_partial_results()
przemekboruta Apr 13, 2026
699f510
feat(batch-manager): add start_batch param to start()
przemekboruta Apr 13, 2026
866df6b
feat(builder): implement resume logic in DatasetBuilder
przemekboruta Apr 13, 2026
e0b22d5
feat(interface): expose resume on DataDesigner.create()
przemekboruta Apr 13, 2026
4b514b2
test: add tests for resume mechanism
przemekboruta Apr 13, 2026
1db497a
feat(builder): extend resume to async engine (DATA_DESIGNER_ASYNC_ENG…
przemekboruta Apr 13, 2026
812d7df
fix(builder): skip after-generation processors when resume finds data…
przemekboruta Apr 13, 2026
4054610
fix(builder): use filesystem count for initial_total_num_batches on a…
przemekboruta Apr 13, 2026
0bdf24a
feat(results): add export() method and --output-format CLI flag
przemekboruta Apr 13, 2026
4401e4b
fix(builder): handle resume when metadata.json missing (interrupted b…
przemekboruta Apr 14, 2026
c2d0a77
docs(interface): fix resume docstring — async engine is supported
przemekboruta Apr 14, 2026
4ffd7f3
fix(builder): derive initial_actual_num_records from filesystem in as…
przemekboruta Apr 14, 2026
b89f1a1
feat(resume): replace resume: bool with ResumeMode enum (NEVER/ALWAYS…
przemekboruta Apr 30, 2026
5a99f59
fix(resume): invalidate resolved_dataset_name cache when IF_POSSIBLE …
przemekboruta May 1, 2026
f69b3e7
fix(builder): move partial-completion warning before return in _build…
przemekboruta May 1, 2026
4daf48b
fix(builder): IF_POSSIBLE now starts fresh when no dataset directory …
przemekboruta May 4, 2026
69c3e55
fix(builder): use original target_num_records in async resume record …
przemekboruta May 4, 2026
487def2
fix(builder): IF_POSSIBLE starts fresh on empty dataset directory
przemekboruta May 4, 2026
a08d9cc
fix(builder): ALWAYS raises DatasetGenerationError on config fingerpr…
przemekboruta May 4, 2026
75fda14
Merge branch 'main' into main
nabinchha May 5, 2026
e7c0f95
fix(resume): address nabinchha review — drop export collision, add CL…
przemekboruta May 6, 2026
d38ac96
Merge origin/main into main
przemekboruta May 6, 2026
02821b8
fix(builder): replace stdlib StrEnum with project compat shim for Pyt…
przemekboruta May 6, 2026
b8c633c
fix(builder): guard extension row groups in initial_actual_num_record…
przemekboruta May 6, 2026
0fef8d4
fix(builder): pre-compute row-group list in _build_async to fix sizes…
przemekboruta May 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,14 @@ def reset(self, delete_files: bool = False) -> None:
except OSError as e:
raise DatasetBatchManagementError(f"🛑 Failed to delete directory {dir_path}: {e}")

def start(self, *, num_records: int, buffer_size: int) -> None:
def start(
self,
*,
num_records: int,
buffer_size: int,
start_batch: int = 0,
initial_actual_num_records: int = 0,
) -> None:
if num_records <= 0:
raise DatasetBatchManagementError("🛑 num_records must be positive.")
if buffer_size <= 0:
Expand All @@ -169,6 +176,8 @@ def start(self, *, num_records: int, buffer_size: int) -> None:
if remaining_records := num_records % buffer_size:
self._num_records_list.append(remaining_records)
self.reset()
self._current_batch_number = start_batch
self._actual_num_records = initial_actual_num_records

def write(self) -> Path | None:
"""Write the current batch to a parquet file.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,18 @@ class RowGroupBufferManager:
exclusively by the async scheduler.
"""

def __init__(
    self,
    artifact_storage: "ArtifactStorage",
    initial_actual_num_records: int = 0,
    initial_total_num_batches: int = 0,
) -> None:
    """Create a buffer manager, optionally seeded with prior progress.

    Args:
        artifact_storage: Storage backend that finished row groups are
            written through.
        initial_actual_num_records: Record count already produced by a
            previous (interrupted) run, so a resumed run keeps its running
            total accurate instead of restarting from zero.
        initial_total_num_batches: Batch count already written by a
            previous run, for the same reason.
    """
    self._buffers: dict[int, list[dict]] = {}   # row_group -> buffered rows
    self._row_group_sizes: dict[int, int] = {}  # row_group -> expected row count
    self._dropped: dict[int, set[int]] = {}     # row_group -> dropped row indices
    self._artifact_storage = artifact_storage
    # Seeded from the constructor so resume can continue prior counters.
    self._actual_num_records: int = initial_actual_num_records
    self._total_num_batches: int = initial_total_num_batches

def init_row_group(self, row_group: int, size: int) -> None:
"""Allocate a buffer for *row_group* with *size* empty rows."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ class BatchStage(StrEnum):
PROCESSORS_OUTPUTS = "processors_outputs_path"


class ResumeMode(StrEnum):
    """Controls whether generation reuses an existing dataset directory.

    NEVER: always start fresh; an existing non-empty dataset directory is
        left untouched and a timestamp-suffixed directory name is used
        for the new session instead.
    ALWAYS: resume into the existing dataset directory; it is an error if
        there is no non-empty dataset directory on disk to resume from.
    IF_POSSIBLE: resume when a non-empty dataset directory exists,
        otherwise start fresh under the configured name.
    """

    NEVER = "never"
    ALWAYS = "always"
    IF_POSSIBLE = "if_possible"


class ArtifactStorage(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)

Expand All @@ -47,6 +53,7 @@ class ArtifactStorage(BaseModel):
partial_results_folder_name: str = "tmp-partial-parquet-files"
dropped_columns_folder_name: str = "dropped-columns-parquet-files"
processors_outputs_folder_name: str = PROCESSORS_OUTPUTS_FOLDER_NAME
resume: ResumeMode = ResumeMode.NEVER
_media_storage: MediaStorage = PrivateAttr(default=None)

@property
Expand All @@ -67,12 +74,19 @@ def artifact_path_exists(self) -> bool:
def resolved_dataset_name(self) -> str:
    """Return the dataset directory name to use for this session.

    When the configured dataset directory already exists and is non-empty:

    * with ``resume`` set to ``ALWAYS`` or ``IF_POSSIBLE``, the existing
      name is reused so generation continues in place;
    * otherwise a fresh, timestamp-suffixed name is returned so the
      existing data is not overwritten (an info message records the
      redirect).

    When there is no non-empty dataset directory, ``ALWAYS`` is
    unsatisfiable; any other mode starts fresh under the configured name.

    Returns:
        The directory name (relative to ``artifact_path``) for this run.

    Raises:
        ArtifactStorageError: if ``resume == ResumeMode.ALWAYS`` but there
            is nothing on disk to resume from.
    """
    dataset_path = self.artifact_path / self.dataset_name
    # any() stops at the first entry instead of materializing the whole
    # directory listing just to test for non-emptiness.
    if dataset_path.exists() and any(dataset_path.iterdir()):
        if self.resume in (ResumeMode.ALWAYS, ResumeMode.IF_POSSIBLE):
            return self.dataset_name
        new_dataset_name = f"{self.dataset_name}_{datetime.now().strftime('%m-%d-%Y_%H%M%S')}"
        logger.info(
            f"📂 Dataset path {str(dataset_path)!r} already exists. Dataset from this session"
            f"\n\t\t will be saved to {str(self.artifact_path / new_dataset_name)!r} instead."
        )
        return new_dataset_name
    if self.resume == ResumeMode.ALWAYS:
        raise ArtifactStorageError(
            f"🛑 Cannot resume: no existing dataset found at {str(dataset_path)!r}. "
            "Run without resume=ResumeMode.ALWAYS to start a new generation."
        )
    return self.dataset_name

@property
Expand Down Expand Up @@ -204,6 +218,11 @@ def load_dataset_with_dropped_columns(self) -> pd.DataFrame:
df = lazy.pd.concat([df, df_dropped], axis=1)
return df

def clear_partial_results(self) -> None:
    """Delete leftover in-flight partial results from an interrupted run.

    A no-op when the partial-results directory does not exist.
    """
    partial_dir = self.partial_results_path
    if not partial_dir.exists():
        return
    shutil.rmtree(partial_dir)

def move_partial_result_to_final_file_path(self, batch_number: int) -> Path:
partial_result_path = self.create_batch_file_path(batch_number, batch_stage=BatchStage.PARTIAL_RESULT)
if not partial_result_path.exists():
Expand Down
Loading
Loading