diff --git a/src/sktime_mcp/registry/interface.py b/src/sktime_mcp/registry/interface.py index a701b338..11369390 100644 --- a/src/sktime_mcp/registry/interface.py +++ b/src/sktime_mcp/registry/interface.py @@ -12,6 +12,37 @@ logger = logging.getLogger(__name__) +# Maximum characters of docstring to expose to LLMs in to_dict() output. +# Earlier versions truncated at 500 chars, but sktime's numpydoc-style +# docstrings put the "Parameters" section, the most useful information +# for an agent reasoning about hyperparameters, past the first 500 +# chars in many estimators (issue #335). Bumping the cap to 4000 keeps +# the response compact enough for an MCP tool result while preserving +# the Parameters section for the vast majority of sktime estimators. +_DOCSTRING_MAX_CHARS = 4000 + + +def _truncate_docstring(docstring: str | None) -> str | None: + """Return *docstring* truncated to a length that preserves the + Parameters section when possible. + + The function: + - returns ``None`` for a missing docstring; + - returns the full docstring if it is already short enough; + - otherwise returns the first ``_DOCSTRING_MAX_CHARS`` characters, + with a trailing ellipsis when content was dropped. + + Per #335 the cap is large enough to capture the numpydoc + ``Parameters`` block of every estimator we have looked at without + embedding the full docstring (which can run several pages on + composite forecasters). + """ + if docstring is None: + return None + if len(docstring) <= _DOCSTRING_MAX_CHARS: + return docstring + return docstring[:_DOCSTRING_MAX_CHARS].rstrip() + "..." 
class TestDocstringTruncation:
    """Regression tests for issue #335.

    The docstring truncation cap was raised so that the numpydoc
    ``Parameters`` section, which appears past the first 500 chars in
    many sktime estimators, survives ``EstimatorNode.to_dict``
    serialisation.
    """

    def test_short_docstring_unchanged(self):
        from sktime_mcp.registry.interface import _truncate_docstring

        assert _truncate_docstring("hi") == "hi"

    def test_none_passes_through(self):
        from sktime_mcp.registry.interface import _truncate_docstring

        assert _truncate_docstring(None) is None

    def test_docstring_at_cap_unchanged(self):
        """A docstring of exactly the cap length must not be altered."""
        from sktime_mcp.registry.interface import (
            _DOCSTRING_MAX_CHARS,
            _truncate_docstring,
        )

        doc = "x" * _DOCSTRING_MAX_CHARS
        assert _truncate_docstring(doc) == doc

    def test_long_docstring_truncated_with_ellipsis(self):
        from sktime_mcp.registry.interface import (
            _DOCSTRING_MAX_CHARS,
            _truncate_docstring,
        )

        long_doc = "x" * (_DOCSTRING_MAX_CHARS + 50)
        out = _truncate_docstring(long_doc)
        # Pin the exact result: a bare upper bound on the length would
        # also pass if the helper dropped far more than it should.
        assert out == "x" * _DOCSTRING_MAX_CHARS + "..."

    def test_cap_preserves_parameters_section_past_500_chars(self):
        """The whole point of #335: Parameters section appears past 500
        chars in many sktime docstrings, and the new cap must keep it."""
        from sktime_mcp.registry.interface import _truncate_docstring

        intro = "x" * 600
        doc = (
            intro
            + "\n\nParameters\n----------\nn_neighbors : int, default=5\n"
        )
        out = _truncate_docstring(doc)
        assert out is not None
        assert "Parameters" in out
        assert "n_neighbors" in out