2828from data_designer .config .utils .type_helpers import StrEnum
2929from data_designer .config .utils .warning_helpers import warn_at_caller
3030from data_designer .config .version import get_library_version
31+ from data_designer .engine import flags
3132from data_designer .engine .column_generators .generators .base import (
3233 ColumnGenerator ,
3334 ColumnGeneratorWithModel ,
3435 GenerationStrategy ,
3536)
36- from data_designer .engine .column_generators .utils .generator_classification import column_type_is_model_generated
3737from data_designer .engine .compiler import compile_data_designer_config
3838from data_designer .engine .context import current_row_group , current_row_group_start_offset
3939from data_designer .engine .dataset_builders .errors import DatasetGenerationError
5959 strip_skip_metadata_from_records ,
6060)
6161from data_designer .engine .dataset_builders .utils .sticky_progress_bar import StickyProgressBar
62+ from data_designer .engine .models .clients .adapters .http_model_client import ClientConcurrencyMode
6263from data_designer .engine .models .telemetry import InferenceEvent , NemoSourceEnum , TaskStatusEnum , TelemetryHandler
6364from data_designer .engine .processing .processors .base import Processor
6465from data_designer .engine .processing .processors .drop_columns import DropColumnsProcessor
66+ from data_designer .engine .readiness import run_readiness_check
6567from data_designer .engine .registry .data_designer_registry import DataDesignerRegistry
6668from data_designer .engine .resources .resource_provider import ResourceProvider
6769from data_designer .engine .storage .artifact_storage import (
8284
8385logger = logging .getLogger (__name__ )
8486
85- # Async engine is the default execution path. Set ``DATA_DESIGNER_ASYNC_ENGINE=0``
86- # to opt back into the legacy sync engine for one transitional release; the sync
87- # path is scheduled for removal afterwards.
88- DATA_DESIGNER_ASYNC_ENGINE = os . environ . get ( "DATA_DESIGNER_ASYNC_ENGINE" , "1" ) == "1"
87+ # The async-engine flag now lives in ``data_designer.engine.flags`` so the
88+ # engine, the public interface, and the readiness module can share one source
89+ # of truth. Always read ``flags.DATA_DESIGNER_ASYNC_ENGINE`` rather than caching
90+ # a local copy so monkeypatches in tests are visible.
8991
90- if DATA_DESIGNER_ASYNC_ENGINE :
92+ if flags . DATA_DESIGNER_ASYNC_ENGINE :
9193 import asyncio
9294
9395 from data_designer .engine .dataset_builders .async_scheduler import (
@@ -193,7 +195,7 @@ def __init__(
193195 self ._task_traces : list [TaskTrace ] = []
194196 self ._registry = registry or DataDesignerRegistry ()
195197 self ._graph : ExecutionGraph | None = None
196- self ._use_async : bool = DATA_DESIGNER_ASYNC_ENGINE
198+ self ._use_async : bool = flags . DATA_DESIGNER_ASYNC_ENGINE
197199 # Structured signal: set by _build_async if the scheduler hit early shutdown.
198200 # Stays at defaults for sync-engine and successful async runs. Reset at
199201 # the start of each public run path so reused builder instances don't
@@ -275,10 +277,6 @@ def single_column_configs(self) -> list[ColumnConfigT]:
275277 def single_column_config_by_name (self ) -> dict [str , ColumnConfigT ]:
276278 return {config .name : config for config in self .single_column_configs }
277279
278- @functools .cached_property
279- def llm_generated_column_configs (self ) -> list [ColumnConfigT ]:
280- return [config for config in self .single_column_configs if column_type_is_model_generated (config .column_type )]
281-
282280 def build (
283281 self ,
284282 * ,
@@ -314,9 +312,13 @@ def build(
314312 Path to the generated dataset directory.
315313 """
316314 self ._reset_run_state ()
315+ self ._use_async = flags .DATA_DESIGNER_ASYNC_ENGINE and self ._resolve_async_compatibility ()
317316
318- self ._run_model_health_check_if_needed ()
319- self ._run_mcp_tool_check_if_needed ()
317+ run_readiness_check (
318+ self .single_column_configs ,
319+ self ._resource_provider ,
320+ client_concurrency_mode = ClientConcurrencyMode .ASYNC if self ._use_async else ClientConcurrencyMode .SYNC ,
321+ )
320322
321323 # For IF_POSSIBLE and ALWAYS: check config compatibility before touching the artifact
322324 # directory. _check_resume_config_compatibility() must NOT access base_dataset_path
@@ -386,7 +388,6 @@ def build(
386388 "start a new generation run."
387389 )
388390
389- self ._use_async = DATA_DESIGNER_ASYNC_ENGINE and self ._resolve_async_compatibility ()
390391 if self ._use_async :
391392 self ._build_async (generators , num_records , buffer_size , on_batch_complete , resume = resume )
392393 elif resume == ResumeMode .ALWAYS :
@@ -657,8 +658,12 @@ def _build_with_resume(
657658
658659 def build_preview (self , * , num_records : int ) -> pd .DataFrame :
659660 self ._reset_run_state ()
660- self ._run_model_health_check_if_needed ()
661- self ._run_mcp_tool_check_if_needed ()
661+ self ._use_async = flags .DATA_DESIGNER_ASYNC_ENGINE and self ._resolve_async_compatibility ()
662+ run_readiness_check (
663+ self .single_column_configs ,
664+ self ._resource_provider ,
665+ client_concurrency_mode = ClientConcurrencyMode .ASYNC if self ._use_async else ClientConcurrencyMode .SYNC ,
666+ )
662667
663668 # Set media storage to DATAFRAME mode for preview - base64 stored directly in DataFrame
664669 if self ._has_image_columns ():
@@ -667,7 +672,6 @@ def build_preview(self, *, num_records: int) -> pd.DataFrame:
667672 generators , self ._graph = self ._initialize_generators_and_graph ()
668673 start_time = time .perf_counter ()
669674
670- self ._use_async = DATA_DESIGNER_ASYNC_ENGINE and self ._resolve_async_compatibility ()
671675 if self ._use_async :
672676 dataset = self ._build_async_preview (generators , num_records )
673677 else :
@@ -1407,38 +1411,6 @@ def _merge_skipped_and_generated(
14071411 batch .append (gen_result )
14081412 return batch
14091413
1410- def _run_model_health_check_if_needed (self ) -> None :
1411- model_aliases : set [str ] = set ()
1412- for config in self .single_column_configs :
1413- model_aliases .update (config .get_model_aliases ())
1414-
1415- if not model_aliases :
1416- return
1417-
1418- if DATA_DESIGNER_ASYNC_ENGINE :
1419- loop = ensure_async_engine_loop ()
1420- future = asyncio .run_coroutine_threadsafe (
1421- self ._resource_provider .model_registry .arun_health_check (list (model_aliases )),
1422- loop ,
1423- )
1424- try :
1425- future .result (timeout = 180 )
1426- except TimeoutError :
1427- future .cancel ()
1428- raise
1429- else :
1430- self ._resource_provider .model_registry .run_health_check (list (model_aliases ))
1431-
1432- def _run_mcp_tool_check_if_needed (self ) -> None :
1433- tool_aliases = sorted (
1434- {config .tool_alias for config in self .llm_generated_column_configs if getattr (config , "tool_alias" , None )}
1435- )
1436- if not tool_aliases :
1437- return
1438- if self ._resource_provider .mcp_registry is None :
1439- raise DatasetGenerationError (f"Tool alias(es) { tool_aliases !r} specified but no MCPRegistry configured." )
1440- self ._resource_provider .mcp_registry .run_health_check (tool_aliases )
1441-
14421414 def _setup_fan_out (
14431415 self ,
14441416 generator : ColumnGeneratorWithModelRegistry ,
0 commit comments