Skip to content

Commit 9f3c9bd

Browse files
authored
fix: timestamp dataset name if there's a collision (#77)
* timestamp dataset name if there's a collision
1 parent 5399904 commit 9f3c9bd

3 files changed

Lines changed: 41 additions & 2 deletions

File tree

src/data_designer/engine/dataset_builders/artifact_storage.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

4+
from datetime import datetime
5+
from functools import cached_property
46
import json
57
import logging
68
from pathlib import Path
@@ -36,9 +38,21 @@ class ArtifactStorage(BaseModel):
3638
def artifact_path_exists(self) -> bool:
3739
return self.artifact_path.exists()
3840

41+
@cached_property
def resolved_dataset_name(self) -> str:
    """Return the dataset directory name to use for this session.

    The configured ``dataset_name`` is returned unchanged when
    ``artifact_path / dataset_name`` does not exist or is an empty directory.
    If that path is already occupied (a non-empty directory, or an entry that
    is not a directory at all, such as a regular file), the name is suffixed
    with the current local time formatted as ``%m-%d-%Y_%H%M%S``
    (for example ``dataset_01-01-2025_120304``) and the redirect is logged.

    The value is computed on first access and then cached on the instance, so
    later accesses keep returning the same name even after this session has
    started writing files into the directory.

    Returns:
        The directory name (not a full path) under ``artifact_path``.
    """
    dataset_path = self.artifact_path / self.dataset_name

    if not dataset_path.exists():
        return self.dataset_name

    # any() stops at the first entry instead of listing the whole directory,
    # which matters when a previous run left many files behind.
    if dataset_path.is_dir() and not any(dataset_path.iterdir()):
        return self.dataset_name

    # Reaching here means the path is a non-empty directory or a non-directory
    # entry; either way it cannot be reused, so pick a timestamped sibling.
    timestamp = datetime.now().strftime("%m-%d-%Y_%H%M%S")
    new_dataset_name = f"{self.dataset_name}_{timestamp}"
    logger.info(
        f"📂 Dataset path {str(dataset_path)!r} already exists. Dataset from this session"
        f"\n\t\t will be saved to {str(self.artifact_path / new_dataset_name)!r} instead."
    )
    return new_dataset_name
52+
3953
@property
4054
def base_dataset_path(self) -> Path:
41-
return self.artifact_path / self.dataset_name
55+
return self.artifact_path / self.resolved_dataset_name
4256

4357
@property
4458
def dropped_columns_dataset_path(self) -> Path:

src/data_designer/interface/data_designer.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,11 @@ def create(
173173
configuration (columns, constraints, seed data, etc.).
174174
num_records: Number of records to generate.
175175
dataset_name: Name of the dataset. This name will be used as the dataset
176-
folder name in the artifact path directory.
176+
folder name in the artifact path directory. If a non-empty directory with the
177+
same name already exists, the dataset will be saved to a new directory with
178+
a datetime stamp. For example, if the dataset name is "awesome_dataset" and a directory
179+
with the same name already exists, the dataset will be saved to a new directory
180+
with the name "awesome_dataset_01-01-2025_120000".
177181
178182
Returns:
179183
DatasetCreationResults object with methods for loading the generated dataset,

tests/engine/dataset_builders/test_artifact_storage.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

4+
from datetime import datetime
45
import json
6+
from unittest.mock import patch
57

68
import pandas as pd
79
from pyarrow import ArrowNotImplementedError
@@ -213,3 +215,22 @@ def test_artifact_storage_batch_numbering(stub_artifact_storage, batch_number):
213215
path = stub_artifact_storage.create_batch_file_path(batch_number, BatchStage.FINAL_RESULT)
214216
expected_name = f"batch_{batch_number:05d}.parquet"
215217
assert path.name == expected_name
218+
219+
220+
@patch("data_designer.engine.dataset_builders.artifact_storage.datetime")
def test_artifact_storage_resolved_dataset_name(mock_datetime, tmp_path):
    """Check that the dataset name is only timestamped on a real collision.

    ``datetime`` is patched inside the artifact_storage module so that the
    timestamp suffix is deterministic. A fresh ``ArtifactStorage`` is built for
    each scenario because ``resolved_dataset_name`` is cached per instance.
    """
    mock_datetime.now.return_value = datetime(2025, 1, 1, 12, 3, 4)

    # dataset path does not exist yet
    assert ArtifactStorage(artifact_path=tmp_path).resolved_dataset_name == "dataset"

    # dataset path exists but is empty
    af_storage = ArtifactStorage(artifact_path=tmp_path)
    (af_storage.artifact_path / af_storage.dataset_name).mkdir()
    assert af_storage.resolved_dataset_name == "dataset"

    # dataset path exists and is not empty
    af_storage = ArtifactStorage(artifact_path=tmp_path)
    (af_storage.artifact_path / af_storage.dataset_name / "stub_file.txt").touch()
    assert af_storage.resolved_dataset_name == "dataset_01-01-2025_120304"

0 commit comments

Comments
 (0)