GoogleCloudPlatform
diff --git a/‎datasets/mcp_readability/run_config.yaml‎
Lines changed: 7 additions & 0 deletions b/‎datasets/mcp_readability/run_config.yaml‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎evalbench/dataset/dataset.py‎
Lines changed: 4 additions & 0 deletions b/‎evalbench/dataset/dataset.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎evalbench/eval_service.py‎
Lines changed: 6 additions & 3 deletions b/‎evalbench/eval_service.py‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎evalbench/evaluator/__init__.py‎
Lines changed: 5 additions & 0 deletions b/‎evalbench/evaluator/__init__.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎evalbench/evaluator/mcp_readability/__init__.py‎
Lines changed: 3 additions & 0 deletions b/‎evalbench/evaluator/mcp_readability/__init__.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎evalbench/evaluator/mcp_readability/enums.py‎
Lines changed: 72 additions & 0 deletions b/‎evalbench/evaluator/mcp_readability/enums.py‎
Lines changed: 72 additions & 0 deletions
diff --git a/‎evalbench/evaluator/mcp_readability/exceptions.py‎
Lines changed: 71 additions & 0 deletions b/‎evalbench/evaluator/mcp_readability/exceptions.py‎
Lines changed: 71 additions & 0 deletions
@@ -12,6 +12,13 @@
 
 orchestrator: mcp_readability
 
+# Metric that the shared analyzer aggregates for the scores / summary reports.
+# The orchestrator emits one `style_compliance` score row per endpoint (pass =
+# no P0 findings) and does its own scoring, so this entry only labels the
+# metric for the analyzer. It uses the standard mapping form (scorer name -> config).
+scorers:
+  style_compliance:
+
 # Inputs (paths relative to the repo root / run cwd).
 endpoints_config: datasets/mcp_readability/endpoints.yaml
 exceptions_config: datasets/mcp_readability/exceptions.yaml          # optional
 
@@ -242,6 +242,10 @@ def load_json(json_file_path):
 
 
 def load_dataset_from_json(json_file_path, config):
+    # No dataset path (e.g. orchestrators driven by their run config rather than a
+    # prompt dataset): nothing to load. flatten_dataset({}) yields [].
+    if not json_file_path:
+        return {}
     input_items = {}
     dataset_format = config.get("dataset_format", "evalbench-standard-format")
     if dataset_format == "bird-interact-format":
 
@@ -126,7 +126,9 @@ async def ListEvalInputs(
         session = SESSIONMANAGER.get_session(rpc_id_var.get())
         logging.info("Retrieving Evals for: %s.", rpc_id_var.get())
         experiment_config = session["config"]
-        dataset_config_json = experiment_config["dataset_config"]
+        # dataset_config is optional: some orchestrators drive their work from
+        # the run config itself, in which case load_dataset_from_json returns {}.
+        dataset_config_json = experiment_config.get("dataset_config")
         dataset = load_dataset_from_json(
             dataset_config_json, experiment_config)
         for _, eval_inputs in dataset.items():
@@ -316,8 +318,9 @@ async def Interact(
             context.set_details(error_msg)
             return
 
-        # Load dataset and instantiate the Orchestrator
-        dataset_config_json = config["dataset_config"]
+        # Load dataset and instantiate the Orchestrator. dataset_config is
+        # optional (datasetless orchestrators drive from the run config).
+        dataset_config_json = config.get("dataset_config")
         dataset_dict = load_dataset_from_json(dataset_config_json, config)
 
         dataset = []
 
@@ -8,6 +8,7 @@
 from evaluator.dataengineeringagentorchestrator import (
     DataEngineeringAgentOrchestrator,
 )
+from evaluator.mcp_readability import McpReadabilityOrchestrator
 import logging
 
 
@@ -27,6 +28,10 @@ def get_orchestrator(config, db_configs, setup_config, report_progress=False):
         return DataEngineeringAgentOrchestrator(
             config, db_configs, setup_config, report_progress
         )
+    elif orchestrator_type == "mcp_readability":
+        return McpReadabilityOrchestrator(
+            config, db_configs, setup_config, report_progress
+        )
     else:
         return Orchestrator(config, db_configs, setup_config, report_progress)
 
 
@@ -0,0 +1,3 @@
+from evaluator.mcp_readability.orchestrator import McpReadabilityOrchestrator
+
+__all__ = ["McpReadabilityOrchestrator"]
@@ -0,0 +1,72 @@
+"""Enums for the MCP style-guide readability/compliance check.
+
+They are defined as plain Python enums here so the feature is self-contained;
+values can be aligned to an external schema later without touching call sites.
+All enums are written to the results CSV using their ``.name``.
+"""
+
+from enum import Enum
+
+
+class EndpointType(Enum):
+    """Deployment channel / reporting metadata for an endpoint.
+
+    Used both to label result rows and (optionally) to filter which endpoints
+    are checked via ``endpoint_types`` in the run config.
+    """
+
+    ENDPOINT_TYPE_UNSPECIFIED = 0
+    PROD = 1
+    AUTOPUSH = 2
+    STAGING = 3
+    DEV = 4
+
+
+class CheckStatus(Enum):
+    """Whether the compliance check *ran* successfully (not its findings).
+
+    Compliance results are captured separately via the p0/p1/p2 issue counts and
+    compliance_score; this status only reflects whether the eval completed.
+
+    - ``SUCCESS``: the eval ran end-to-end.
+    - ``FETCH_ERROR``: failed to retrieve tools data from the endpoint.
+    - ``ANALYSIS_ERROR``: error during LLM analysis or result parsing.
+    - ``INTERNAL_ERROR``: other script/system error.
+    """
+
+    CHECK_STATUS_UNSPECIFIED = 0
+    SUCCESS = 1
+    FETCH_ERROR = 2
+    ANALYSIS_ERROR = 3
+    INTERNAL_ERROR = 4
+
+
+def _coerce(enum_cls, value, default):
+    """Coerce a config value (str / int / enum / None) into ``enum_cls``.
+
+    Unknown values fall back to ``default`` rather than raising, so a typo in a
+    config file degrades gracefully instead of aborting the whole run.
+    """
+    if value is None:
+        return default
+    if isinstance(value, enum_cls):
+        return value
+    if isinstance(value, int):
+        try:
+            return enum_cls(value)
+        except ValueError:
+            return default
+    name = str(value).strip().upper()
+    # Allow both short ("PROD") and fully-qualified ("ENDPOINT_TYPE_PROD")
+    # spellings, plus the raw member name.
+    members = enum_cls.__members__
+    if name in members:
+        return members[name]
+    for member_name, member in members.items():
+        if name.endswith("_" + member_name):
+            return member
+    return default
+
+
+def coerce_endpoint_type(value) -> EndpointType:
+    return _coerce(EndpointType, value, EndpointType.ENDPOINT_TYPE_UNSPECIFIED)
@@ -0,0 +1,71 @@
+"""Loading and matching of per-endpoint style-rule exceptions (waivers).
+
+An exceptions file lets an operator waive a specific style requirement for a
+specific endpoint when it legitimately cannot comply. Waived rules are passed to
+the scorer so they are excluded from the P0/P1/P2 counts and surfaced separately
+in the feedback.
+
+File schema (see ``datasets/mcp_readability/exceptions.yaml``)::
+
+    exceptions:
+      - product_name: "Cloud SQL"     # any of product_name / endpoint_type
+        rule_id: "tool-names"
+        reason: "..."
+      - endpoint_type: AUTOPUSH       # "*" or omitted field = match-all
+        rule_id: "use-enums"
+        reason: "..."
+"""
+
+import logging
+from util.config import load_yaml_config
+
+
+def load_exceptions(path: str) -> list[dict]:
+    """Load the exceptions list from a YAML file. Missing/empty -> []."""
+    if not path:
+        return []
+    parsed = load_yaml_config(path)
+    if not parsed:
+        return []
+    exceptions = parsed.get("exceptions") or []
+    if not isinstance(exceptions, list):
+        logging.warning(
+            "mcp_readability: 'exceptions' in %s is not a list; ignoring.", path
+        )
+        return []
+    return exceptions
+
+
+def _matches(field_value, exception_value) -> bool:
+    """A matcher field matches when it is absent, '*', or equal (case-insensitive)."""
+    if exception_value is None or exception_value == "*":
+        return True
+    if field_value is None:
+        return False
+    return str(field_value).strip().lower() == str(exception_value).strip().lower()
+
+
+def applicable_exceptions(endpoint: dict, all_exceptions: list[dict]) -> list[dict]:
+    """Return exceptions whose matchers all apply to ``endpoint``.
+
+    Matchers considered: ``product_name`` and ``endpoint_type`` (the endpoint's
+    identity in #469's ``endpoints.yaml``). Each exception keeps its ``rule_id``
+    and ``reason`` for the scorer prompt.
+    """
+    matched = []
+    for exc in all_exceptions:
+        if not isinstance(exc, dict):
+            continue
+        if not exc.get("rule_id"):
+            continue
+        if (
+            _matches(endpoint.get("product_name"), exc.get("product_name"))
+            and _matches(endpoint.get("endpoint_type"), exc.get("endpoint_type"))
+        ):
+            matched.append(
+                {
+                    "rule_id": exc.get("rule_id"),
+                    "reason": exc.get("reason", ""),
+                }
+            )
+    return matched
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from evaluator.mcp_readability.orchestrator import McpReadabilityOrchestrator`
	`2`	`+`
	`3`	`+__all__ = ["McpReadabilityOrchestrator"]`