Skip to content

Commit 6d83453

Browse files
author
Akangsha Goel
committed
feat(mcp-readability): compliance orchestrator, LLM judge, and metrics scorer
The evaluation half of the MCP-readability work, on top of the mcp_tools generator. For each endpoint the orchestrator fetches tools (rendered as a man page), computes deterministic size metrics, gathers applicable waivers, and judges the man page against the style guide with an LLM. One result row per endpoint is emitted through the shared EvalBench reporters (CSV/BigQuery). - McpReadabilityOrchestrator (orchestrator: mcp_readability), driven entirely by datasets/mcp_readability/run_config.yaml. - McpToolMetricsScorer: deterministic tool count / estimated tokens / token-budget usage. - McpStyleComplianceScorer: LLM judge scoring the man page vs the style guide (P0/P1/P2 findings, compliance score, waived rules), JSON output. - enums + exceptions helpers, aligned to the endpoints/exceptions schema and the readability_judge run-config block. Reuses the standard evalbench.py report path with NO changes to evalbench.py: - process() emits a real scores_tf (one standard style_compliance score row per endpoint; pass = SUCCESS with no P0 findings), so the run takes the existing results+scores branch and the shared analyzer produces a P0-clean compliance rate. run_config declares scorers: [style_compliance]. - dataset_config is made optional in the shared helpers: set_session_configs always sets it (default None) and load_dataset_from_json returns {} for a falsy path, so the datasetless orchestrator loads an empty dataset. - CsvReporter.store no-ops on None/empty frames (mirroring BigQueryReporter), so subset-only report writes are safe. Testing: pytest evalbench/test/mcp_readability_test.py evalbench/test/mcp_tool_metrics_test.py evalbench/test/evalbench_test.py — 31 passing, including an offline end-to-end orchestrator run, an analyzer-integration test proving the emitted scores aggregate correctly, and the existing evalbench.py tests (unchanged).
1 parent ab55d7e commit 6d83453

17 files changed

Lines changed: 1454 additions & 20 deletions

File tree

datasets/mcp_readability/run_config.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,13 @@
1212

1313
orchestrator: mcp_readability
1414

15+
# Metric the shared analyzer aggregates for the scores / summary reports. The
16+
# orchestrator emits one `style_compliance` score row per endpoint (pass = no P0
17+
# findings); it does its own scoring, so this entry is only a label for the
18+
# analyzer. Uses the standard mapping form (scorer name -> config).
19+
scorers:
20+
style_compliance:
21+
1522
# Inputs (paths relative to the repo root / run cwd).
1623
endpoints_config: datasets/mcp_readability/endpoints.yaml
1724
exceptions_config: datasets/mcp_readability/exceptions.yaml # optional

evalbench/dataset/dataset.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,10 @@ def load_json(json_file_path):
242242

243243

244244
def load_dataset_from_json(json_file_path, config):
245+
# No dataset path (e.g. orchestrators driven by their run config rather than a
246+
# prompt dataset): nothing to load. flatten_dataset({}) yields [].
247+
if not json_file_path:
248+
return {}
245249
input_items = {}
246250
dataset_format = config.get("dataset_format", "evalbench-standard-format")
247251
if dataset_format == "bird-interact-format":

evalbench/eval_service.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,9 @@ async def ListEvalInputs(
126126
session = SESSIONMANAGER.get_session(rpc_id_var.get())
127127
logging.info("Retrieving Evals for: %s.", rpc_id_var.get())
128128
experiment_config = session["config"]
129-
dataset_config_json = experiment_config["dataset_config"]
129+
# dataset_config is optional: some orchestrators drive their work from
130+
# the run config itself, in which case load_dataset_from_json returns {}.
131+
dataset_config_json = experiment_config.get("dataset_config")
130132
dataset = load_dataset_from_json(
131133
dataset_config_json, experiment_config)
132134
for _, eval_inputs in dataset.items():
@@ -316,8 +318,9 @@ async def Interact(
316318
context.set_details(error_msg)
317319
return
318320

319-
# Load dataset and instantiate the Orchestrator
320-
dataset_config_json = config["dataset_config"]
321+
# Load dataset and instantiate the Orchestrator. dataset_config is
322+
# optional (datasetless orchestrators drive from the run config).
323+
dataset_config_json = config.get("dataset_config")
321324
dataset_dict = load_dataset_from_json(dataset_config_json, config)
322325

323326
dataset = []

evalbench/evaluator/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from evaluator.dataengineeringagentorchestrator import (
99
DataEngineeringAgentOrchestrator,
1010
)
11+
from evaluator.mcp_readability import McpReadabilityOrchestrator
1112
import logging
1213

1314

@@ -27,6 +28,10 @@ def get_orchestrator(config, db_configs, setup_config, report_progress=False):
2728
return DataEngineeringAgentOrchestrator(
2829
config, db_configs, setup_config, report_progress
2930
)
31+
elif orchestrator_type == "mcp_readability":
32+
return McpReadabilityOrchestrator(
33+
config, db_configs, setup_config, report_progress
34+
)
3035
else:
3136
return Orchestrator(config, db_configs, setup_config, report_progress)
3237

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from evaluator.mcp_readability.orchestrator import McpReadabilityOrchestrator
2+
3+
__all__ = ["McpReadabilityOrchestrator"]
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
"""Enums for the MCP style-guide readability/compliance check.
2+
3+
They are defined as plain Python enums here so the feature is self-contained;
4+
values can be aligned to an external schema later without touching call sites.
5+
All enums are written to the results CSV using their ``.name``.
6+
"""
7+
8+
from enum import Enum
9+
10+
11+
class EndpointType(Enum):
12+
"""Deployment channel / reporting metadata for an endpoint.
13+
14+
Used both to label result rows and (optionally) to filter which endpoints
15+
are checked via ``endpoint_types`` in the run config.
16+
"""
17+
18+
ENDPOINT_TYPE_UNSPECIFIED = 0
19+
PROD = 1
20+
AUTOPUSH = 2
21+
STAGING = 3
22+
DEV = 4
23+
24+
25+
class CheckStatus(Enum):
26+
"""Whether the compliance check *ran* successfully (not its findings).
27+
28+
Compliance results are captured separately via the p0/p1/p2 issue counts and
29+
compliance_score; this status only reflects whether the eval completed.
30+
31+
- ``SUCCESS``: the eval ran end-to-end.
32+
- ``FETCH_ERROR``: failed to retrieve tools data from the endpoint.
33+
- ``ANALYSIS_ERROR``: error during LLM analysis or result parsing.
34+
- ``INTERNAL_ERROR``: other script/system error.
35+
"""
36+
37+
CHECK_STATUS_UNSPECIFIED = 0
38+
SUCCESS = 1
39+
FETCH_ERROR = 2
40+
ANALYSIS_ERROR = 3
41+
INTERNAL_ERROR = 4
42+
43+
44+
def _coerce(enum_cls, value, default):
45+
"""Coerce a config value (str / int / enum / None) into ``enum_cls``.
46+
47+
Unknown values fall back to ``default`` rather than raising, so a typo in a
48+
config file degrades gracefully instead of aborting the whole run.
49+
"""
50+
if value is None:
51+
return default
52+
if isinstance(value, enum_cls):
53+
return value
54+
if isinstance(value, int):
55+
try:
56+
return enum_cls(value)
57+
except ValueError:
58+
return default
59+
name = str(value).strip().upper()
60+
# Allow both short ("PROD") and fully-qualified ("ENDPOINT_TYPE_PROD")
61+
# spellings, plus the raw member name.
62+
members = enum_cls.__members__
63+
if name in members:
64+
return members[name]
65+
for member_name, member in members.items():
66+
if name.endswith("_" + member_name):
67+
return member
68+
return default
69+
70+
71+
def coerce_endpoint_type(value) -> EndpointType:
72+
return _coerce(EndpointType, value, EndpointType.ENDPOINT_TYPE_UNSPECIFIED)
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
"""Loading and matching of per-endpoint style-rule exceptions (waivers).
2+
3+
An exceptions file lets an operator waive a specific style requirement for a
4+
specific endpoint when it legitimately cannot comply. Waived rules are passed to
5+
the scorer so they are excluded from the P0/P1/P2 counts and surfaced separately
6+
in the feedback.
7+
8+
File schema (see ``datasets/mcp_readability/exceptions.yaml``)::
9+
10+
exceptions:
11+
- product_name: "Cloud SQL" # any of product_name / endpoint_type
12+
rule_id: "tool-names"
13+
reason: "..."
14+
- endpoint_type: AUTOPUSH # "*" or omitted field = match-all
15+
rule_id: "use-enums"
16+
reason: "..."
17+
"""
18+
19+
import logging
20+
from util.config import load_yaml_config
21+
22+
23+
def load_exceptions(path: str) -> list[dict]:
24+
"""Load the exceptions list from a YAML file. Missing/empty -> []."""
25+
if not path:
26+
return []
27+
parsed = load_yaml_config(path)
28+
if not parsed:
29+
return []
30+
exceptions = parsed.get("exceptions") or []
31+
if not isinstance(exceptions, list):
32+
logging.warning(
33+
"mcp_readability: 'exceptions' in %s is not a list; ignoring.", path
34+
)
35+
return []
36+
return exceptions
37+
38+
39+
def _matches(field_value, exception_value) -> bool:
40+
"""A matcher field matches when it is absent, '*', or equal (case-insensitive)."""
41+
if exception_value is None or exception_value == "*":
42+
return True
43+
if field_value is None:
44+
return False
45+
return str(field_value).strip().lower() == str(exception_value).strip().lower()
46+
47+
48+
def applicable_exceptions(endpoint: dict, all_exceptions: list[dict]) -> list[dict]:
49+
"""Return exceptions whose matchers all apply to ``endpoint``.
50+
51+
Matchers considered: ``product_name`` and ``endpoint_type`` (the endpoint's
52+
identity in #469's ``endpoints.yaml``). Each exception keeps its ``rule_id``
53+
and ``reason`` for the scorer prompt.
54+
"""
55+
matched = []
56+
for exc in all_exceptions:
57+
if not isinstance(exc, dict):
58+
continue
59+
if not exc.get("rule_id"):
60+
continue
61+
if (
62+
_matches(endpoint.get("product_name"), exc.get("product_name"))
63+
and _matches(endpoint.get("endpoint_type"), exc.get("endpoint_type"))
64+
):
65+
matched.append(
66+
{
67+
"rule_id": exc.get("rule_id"),
68+
"reason": exc.get("reason", ""),
69+
}
70+
)
71+
return matched

0 commit comments

Comments
 (0)