Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions packages/data-designer/src/data_designer/cli/commands/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@

from __future__ import annotations

import click
import typer

from data_designer.cli.controllers.generation_controller import GenerationController
from data_designer.config.utils.constants import DEFAULT_NUM_RECORDS
from data_designer.interface.results import SUPPORTED_EXPORT_FORMATS


def create_command(
Expand Down Expand Up @@ -35,6 +37,17 @@ def create_command(
"-o",
help="Path where generated artifacts will be stored. Defaults to ./artifacts.",
),
output_format: str | None = typer.Option(
None,
"--output-format",
"-f",
click_type=click.Choice(list(SUPPORTED_EXPORT_FORMATS)),
help=(
"Export the dataset to a single file after generation. "
"Supported formats: jsonl, csv, parquet. "
"The file is written to <artifact-path>/<dataset-name>/dataset.<format>."
),
),
) -> None:
"""Create a full dataset and save results to disk.

Expand All @@ -60,4 +73,5 @@ def create_command(
num_records=num_records,
dataset_name=dataset_name,
artifact_path=artifact_path,
output_format=output_format,
)
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ def run_create(
num_records: int,
dataset_name: str,
artifact_path: str | None,
output_format: str | None = None,
) -> None:
"""Load config, create a full dataset, and save results to disk.

Expand All @@ -124,7 +125,17 @@ def run_create(
num_records: Number of records to generate.
dataset_name: Name for the generated dataset folder.
artifact_path: Path where generated artifacts will be stored, or None for default.
output_format: If set, export the dataset to a single file in this format after
generation. One of 'jsonl', 'csv', 'parquet'.
"""
from data_designer.interface.results import SUPPORTED_EXPORT_FORMATS

if output_format is not None and output_format not in SUPPORTED_EXPORT_FORMATS:
Comment thread
przemekboruta marked this conversation as resolved.
Outdated
print_error(
f"Unsupported export format: {output_format!r}. Choose one of: {', '.join(SUPPORTED_EXPORT_FORMATS)}."
)
raise typer.Exit(code=1)

config_builder = self._load_config(config_source)

resolved_artifact_path = Path(artifact_path) if artifact_path else Path.cwd() / "artifacts"
Expand All @@ -147,16 +158,27 @@ def run_create(
print_error(f"Dataset creation failed: {e}")
raise typer.Exit(code=1)

dataset = results.load_dataset()
num_records = len(results.load_dataset())

analysis = results.load_analysis()
if analysis is not None:
console.print()
analysis.to_report()

console.print()
print_success(f"Dataset created — {len(dataset)} record(s) generated")
console.print(f" Artifacts saved to: [bold]{results.artifact_storage.base_dataset_path}[/bold]")

if output_format is not None:
Comment thread
przemekboruta marked this conversation as resolved.
export_path = results.artifact_storage.base_dataset_path / f"dataset.{output_format}"
try:
results.export(export_path, format=output_format) # type: ignore[arg-type]
except Exception as e:
print_error(f"Export failed: {e}")
raise typer.Exit(code=1)
console.print(f" Exported to: [bold]{export_path}[/bold]")

console.print()
print_success(f"Dataset created — {num_records} record(s) generated")
console.print()

def _load_config(self, config_source: str) -> DataDesignerConfigBuilder:
Expand Down
42 changes: 41 additions & 1 deletion packages/data-designer/src/data_designer/interface/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@
from __future__ import annotations

from pathlib import Path
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Literal, get_args

from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
from data_designer.config.config_builder import DataDesignerConfigBuilder
from data_designer.config.dataset_metadata import DatasetMetadata
from data_designer.config.errors import InvalidFileFormatError
from data_designer.config.utils.visualization import WithRecordSamplerMixin
from data_designer.engine.dataset_builders.errors import ArtifactStorageError
from data_designer.engine.storage.artifact_storage import ArtifactStorage
Expand All @@ -19,6 +20,9 @@

from data_designer.engine.dataset_builders.utils.task_model import TaskTrace

# Closed set of file formats that DatasetCreationResults.export() can write.
ExportFormat = Literal["jsonl", "csv", "parquet"]
# Runtime mirror of ExportFormat, derived via get_args() so the static type
# and the runtime validation tuple can never drift apart.
SUPPORTED_EXPORT_FORMATS: tuple[str, ...] = get_args(ExportFormat)


class DatasetCreationResults(WithRecordSamplerMixin):
"""Results container for a Data Designer dataset creation run.
Expand Down Expand Up @@ -95,6 +99,42 @@ def get_path_to_processor_artifacts(self, processor_name: str) -> Path:
raise ArtifactStorageError(f"Processor {processor_name} has no artifacts.")
return self.artifact_storage.processors_outputs_path / processor_name

def export(self, path: Path | str, *, format: ExportFormat = "jsonl") -> Path:
    """Export the generated dataset to a single file.

    Args:
        path: Output file path. The extension is not inferred from *format* —
            the exact path is used as-is.
        format: Output format. One of ``'jsonl'``, ``'csv'``, or ``'parquet'``.
            Defaults to ``'jsonl'``.

    Returns:
        Path to the written file.

    Raises:
        InvalidFileFormatError: If an unsupported format is requested.

    Example:
        >>> results = data_designer.create(config, num_records=1000)
        >>> results.export("output.jsonl")
        PosixPath('output.jsonl')
        >>> results.export("output.csv", format="csv")
        PosixPath('output.csv')
    """
    # Validate up front so we fail before loading the (potentially large) dataset.
    if format not in SUPPORTED_EXPORT_FORMATS:
        raise InvalidFileFormatError(
            f"Unsupported export format: {format!r}. Choose one of: {', '.join(SUPPORTED_EXPORT_FORMATS)}."
        )
    path = Path(path)
    df = self.load_dataset()
    if format == "jsonl":
        # force_ascii=False keeps non-ASCII text human-readable;
        # ISO date formatting makes timestamps round-trip cleanly.
        df.to_json(path, orient="records", lines=True, force_ascii=False, date_format="iso")
    elif format == "csv":
        df.to_csv(path, index=False)
    elif format == "parquet":
        df.to_parquet(path, index=False)
    return path

def push_to_hub(
self,
repo_id: str,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,17 @@ def test_create_command_delegates_to_controller(mock_ctrl_cls: MagicMock) -> Non
mock_ctrl = MagicMock()
mock_ctrl_cls.return_value = mock_ctrl

create_command(config_source="config.yaml", num_records=10, dataset_name="dataset", artifact_path=None)
create_command(
config_source="config.yaml", num_records=10, dataset_name="dataset", artifact_path=None, output_format=None
)

mock_ctrl_cls.assert_called_once()
mock_ctrl.run_create.assert_called_once_with(
config_source="config.yaml",
num_records=10,
dataset_name="dataset",
artifact_path=None,
output_format=None,
)


Expand All @@ -40,13 +43,15 @@ def test_create_command_passes_custom_options(mock_ctrl_cls: MagicMock) -> None:
num_records=100,
dataset_name="my_data",
artifact_path="/custom/output",
output_format=None,
)

mock_ctrl.run_create.assert_called_once_with(
config_source="config.py",
num_records=100,
dataset_name="my_data",
artifact_path="/custom/output",
output_format=None,
)


Expand All @@ -56,11 +61,37 @@ def test_create_command_default_artifact_path_is_none(mock_ctrl_cls: MagicMock)
mock_ctrl = MagicMock()
mock_ctrl_cls.return_value = mock_ctrl

create_command(config_source="config.yaml", num_records=5, dataset_name="ds", artifact_path=None)
create_command(
config_source="config.yaml", num_records=5, dataset_name="ds", artifact_path=None, output_format=None
)

mock_ctrl.run_create.assert_called_once_with(
config_source="config.yaml",
num_records=5,
dataset_name="ds",
artifact_path=None,
output_format=None,
)


@patch("data_designer.cli.commands.create.GenerationController")
def test_create_command_passes_output_format(mock_ctrl_cls: MagicMock) -> None:
    """Test create_command forwards --output-format to the controller."""
    controller = MagicMock()
    mock_ctrl_cls.return_value = controller

    # The command should forward every option verbatim to run_create.
    call_kwargs = dict(
        config_source="config.yaml",
        num_records=10,
        dataset_name="dataset",
        artifact_path=None,
        output_format="jsonl",
    )
    create_command(**call_kwargs)

    controller.run_create.assert_called_once_with(**call_kwargs)
Original file line number Diff line number Diff line change
Expand Up @@ -772,3 +772,65 @@ def test_run_create_skips_report_when_analysis_is_none(mock_load_config: MagicMo
# load_analysis() returns None, so to_report() must not be called.
# If the code ignores the None check, an AttributeError propagates and the test fails.
mock_results.load_analysis.assert_called_once()


@patch(f"{_CTRL}.DataDesigner")
@patch(f"{_CTRL}.load_config_builder")
def test_run_create_with_output_format_happy_path(mock_load_config: MagicMock, mock_dd_cls: MagicMock) -> None:
    """export() is called with the correct path and format when --output-format is given."""
    mock_load_config.return_value = MagicMock(spec=DataDesignerConfigBuilder)
    data_designer = MagicMock()
    mock_dd_cls.return_value = data_designer
    results = _make_mock_create_results(5)
    data_designer.create.return_value = results

    GenerationController().run_create(
        config_source="config.yaml",
        num_records=5,
        dataset_name="dataset",
        artifact_path=None,
        output_format="jsonl",
    )

    # The export target lives under the dataset's artifact directory.
    expected_path = Path("/output/artifacts/dataset") / "dataset.jsonl"
    results.export.assert_called_once_with(expected_path, format="jsonl")


def test_run_create_invalid_output_format_exits() -> None:
    """Bad --output-format exits with code 1 before generation starts."""
    with pytest.raises(typer.Exit) as exc_info:
        GenerationController().run_create(
            config_source="config.yaml",
            num_records=10,
            dataset_name="dataset",
            artifact_path=None,
            output_format="xlsx",
        )

    assert exc_info.value.exit_code == 1


@patch(f"{_CTRL}.DataDesigner")
@patch(f"{_CTRL}.load_config_builder")
def test_run_create_export_failure_exits(mock_load_config: MagicMock, mock_dd_cls: MagicMock) -> None:
    """If export() raises, run_create exits with code 1."""
    mock_load_config.return_value = MagicMock(spec=DataDesignerConfigBuilder)
    data_designer = MagicMock()
    mock_dd_cls.return_value = data_designer
    results = _make_mock_create_results(5)
    # Simulate an I/O failure during the export step.
    results.export.side_effect = RuntimeError("disk full")
    data_designer.create.return_value = results

    with pytest.raises(typer.Exit) as exc_info:
        GenerationController().run_create(
            config_source="config.yaml",
            num_records=5,
            dataset_name="dataset",
            artifact_path=None,
            output_format="csv",
        )

    assert exc_info.value.exit_code == 1
1 change: 1 addition & 0 deletions packages/data-designer/tests/cli/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,5 @@ def test_app_dispatches_lazy_create_command(mock_controller_cls: Mock) -> None:
num_records=DEFAULT_NUM_RECORDS,
dataset_name="dataset",
artifact_path=None,
output_format=None,
)
64 changes: 64 additions & 0 deletions packages/data-designer/tests/interface/test_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

from __future__ import annotations

import json
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest
Expand All @@ -11,6 +13,7 @@
from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
from data_designer.config.config_builder import DataDesignerConfigBuilder
from data_designer.config.dataset_metadata import DatasetMetadata
from data_designer.config.errors import InvalidFileFormatError
from data_designer.config.preview_results import PreviewResults
from data_designer.config.utils.errors import DatasetSampleDisplayError
from data_designer.config.utils.visualization import display_sample_record as display_fn
Expand Down Expand Up @@ -259,6 +262,67 @@ def test_load_dataset_independent_of_record_sampler_cache(stub_dataset_creation_
stub_artifact_storage.load_dataset.assert_called_once()


@pytest.mark.parametrize("fmt", ["jsonl", "csv", "parquet"])
def test_export_writes_file(stub_dataset_creation_results, tmp_path, fmt):
    """export() writes a file in the requested format."""
    target = tmp_path / f"out.{fmt}"

    returned = stub_dataset_creation_results.export(target, format=fmt)

    assert returned == target
    # A real, non-empty file must exist at the requested path.
    assert target.exists()
    assert target.stat().st_size > 0


def test_export_jsonl_content(stub_dataset_creation_results, stub_dataframe, tmp_path):
    """JSONL export writes one JSON object per line."""
    target = tmp_path / "out.jsonl"
    stub_dataset_creation_results.export(target, format="jsonl")

    lines = target.read_text(encoding="utf-8").splitlines()
    assert len(lines) == len(stub_dataframe)
    # Every line must parse as a standalone JSON document.
    for record in lines:
        json.loads(record)


def test_export_csv_content(stub_dataset_creation_results, stub_dataframe, tmp_path):
    """CSV export has a header row and one data row per record."""
    target = tmp_path / "out.csv"
    stub_dataset_creation_results.export(target, format="csv")

    round_tripped = lazy.pd.read_csv(target)
    assert list(round_tripped.columns) == list(stub_dataframe.columns)
    assert len(round_tripped) == len(stub_dataframe)


def test_export_parquet_content(stub_dataset_creation_results, stub_dataframe, tmp_path):
    """Parquet export round-trips to the original DataFrame."""
    target = tmp_path / "out.parquet"
    stub_dataset_creation_results.export(target, format="parquet")

    round_tripped = lazy.pd.read_parquet(target)
    # Index is dropped on write, so compare with reset indices on both sides.
    lazy.pd.testing.assert_frame_equal(
        round_tripped.reset_index(drop=True), stub_dataframe.reset_index(drop=True)
    )


def test_export_default_format_is_jsonl(stub_dataset_creation_results, tmp_path):
    """export() defaults to JSONL when no format is given."""
    target = tmp_path / "out.jsonl"
    stub_dataset_creation_results.export(target)

    # Every line of the default output must be valid JSON (i.e. JSONL).
    for record in target.read_text(encoding="utf-8").splitlines():
        json.loads(record)


def test_export_unsupported_format_raises(stub_dataset_creation_results, tmp_path):
    """export() raises InvalidFileFormatError for unknown formats."""
    target = tmp_path / "out.xyz"
    with pytest.raises(InvalidFileFormatError, match="Unsupported export format"):
        stub_dataset_creation_results.export(target, format="xlsx")  # type: ignore[arg-type]


def test_export_returns_path_object(stub_dataset_creation_results, tmp_path):
    """export() returns a Path regardless of whether str or Path was passed."""
    target = tmp_path / "out.jsonl"

    # Pass a plain string; the return value must still be normalized to Path.
    returned = stub_dataset_creation_results.export(str(target))

    assert isinstance(returned, Path)


def test_preview_results_dataset_metadata() -> None:
"""Test that PreviewResults uses DatasetMetadata in display_sample_record."""
config_builder = MagicMock(spec=DataDesignerConfigBuilder)
Expand Down
Loading