diff --git a/packages/sample-app/sample_app/experiment/made_by_traceloop/agent_tool_trajectory.py b/packages/sample-app/sample_app/experiment/made_by_traceloop/agent_tool_trajectory.py
new file mode 100644
index 0000000000..34a73dd68b
--- /dev/null
+++ b/packages/sample-app/sample_app/experiment/made_by_traceloop/agent_tool_trajectory.py
@@ -0,0 +1,85 @@
+"""
+Agent Tool Trajectory Experiment
+
+This example demonstrates Traceloop's agent tool trajectory evaluator:
+- Agent Tool Trajectory: Compares the tool calls an agent actually executed against the expected tool calls
+
+This evaluator helps ensure your AI agents follow the expected tool trajectory.
+"""
+
+import asyncio
+from traceloop.sdk import Traceloop
+from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop
+
+# Initialize Traceloop
+client = Traceloop.init()
+
+
+def agent_evaluators_task(row):
+    executed_tool_calls = row.get("actual", "")
+    default_expected = (
+        "[{'name': 'search', 'input': {'query': 'weather'}}, "
+        "{'name': 'book_flight', 'input': {'flight': 'NYC to Paris'}}, "
+        "{'name': 'get_confirmation', 'input': {'confirmation': 'flight booked'}}]"
+    )
+    expected_tool_calls = row.get("expected", default_expected)
+
+    return {
+        "executed_tool_calls": executed_tool_calls,
+        "expected_tool_calls": expected_tool_calls,
+    }
+
+
+async def run_agent_tool_trajectory_experiment():
+    print("\n" + "="*80)
+    print("AGENT TOOL TRAJECTORY EXPERIMENT")
+    print("="*80 + "\n")
+    print("This experiment scores each dataset row with the agent tool trajectory evaluator:\n")
+    print("1. Agent Tool Trajectory - Compares executed tool calls against expected tool calls")
+    print("\n" + "-"*80 + "\n")
+
+    # Configure agent evaluators
+    evaluators = [
+        EvaluatorMadeByTraceloop.agent_tool_trajectory(
+            input_params_sensitive=True,
+            mismatch_sensitive=False,
+            order_sensitive=False,
+            threshold=0.7,
+        ),
+    ]
+
+    print("Running experiment with evaluators:")
+    for evaluator in evaluators:
+        print(f" - {evaluator.slug}")
+
+    print("\n" + "-"*80 + "\n")
+
+    # Run the experiment
+    # Note: You'll need to create a dataset with appropriate test cases for agents
+    results, errors = await client.experiment.run(
+        dataset_slug="agent-tool-trajectory", # Use a dataset slug that exists in the Traceloop platform
+        dataset_version="v1",
+        task=agent_evaluators_task,
+        evaluators=evaluators,
+        experiment_slug="agent-tool-trajectory-exp",
+        stop_on_error=False,
+        wait_for_results=True,
+    )
+
+    print("\n" + "="*80)
+    print("Agent tool trajectory experiment completed!")
+    print("="*80 + "\n")
+
+    print("Results summary:")
+    print(f" - Total rows processed: {len(results) if results else 0}")
+    print(f" - Errors encountered: {len(errors) if errors else 0}")
+
+    if errors:
+        print("\nErrors:")
+        for error in errors:
+            print(f" - {error}")
+
+if __name__ == "__main__":
+    print("\nAgent Tool Trajectory Experiment\n")
+
+    asyncio.run(run_agent_tool_trajectory_experiment())
diff --git a/packages/traceloop-sdk/traceloop/sdk/evaluator/__init__.py b/packages/traceloop-sdk/traceloop/sdk/evaluator/__init__.py
index 618c701c3c..c9f36629c0 100644
--- a/packages/traceloop-sdk/traceloop/sdk/evaluator/__init__.py
+++ b/packages/traceloop-sdk/traceloop/sdk/evaluator/__init__.py
@@ -1,10 +1,9 @@
 from .evaluator import Evaluator
 from .config import EvaluatorDetails
-from .evaluators_made_by_traceloop import EvaluatorMadeByTraceloop, create_evaluator
+from ..generated.evaluators.definitions import EvaluatorMadeByTraceloop
 
 __all__ = [
     "Evaluator",
     "EvaluatorDetails",
     "EvaluatorMadeByTraceloop",
-
"create_evaluator", ] diff --git a/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluator.py b/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluator.py index 6fa72d0f3d..06c8ef43ee 100644 --- a/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluator.py +++ b/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluator.py @@ -28,7 +28,8 @@ def _validate_evaluator_input(slug: str, input: Dict[str, str]) -> None: request_model = get_request_model(slug) if request_model: try: - request_model(**input) + # Request models expect data nested under 'input' field + request_model(input=input) except ValidationError as e: raise ValueError(f"Invalid input for '{slug}': {e}") from e diff --git a/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluators_made_by_traceloop.py b/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluators_made_by_traceloop.py deleted file mode 100644 index 75c2da1931..0000000000 --- a/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluators_made_by_traceloop.py +++ /dev/null @@ -1,120 +0,0 @@ -""" -Factory class for creating Traceloop evaluators with proper configuration. - -This module dynamically generates factory methods from the generated.evaluators registry. -""" - -from typing import Any, List - -from ..generated.evaluators import REQUEST_MODELS -from .config import EvaluatorDetails - - -def _get_required_fields(slug: str) -> List[str]: - """Get required input fields for an evaluator from its request model.""" - model = REQUEST_MODELS.get(slug) - if not model: - return [] - return [name for name, field in model.model_fields.items() if field.is_required()] - - -def _get_config_fields(slug: str) -> dict: - """Get config fields (non-required) with their defaults from the request model.""" - model = REQUEST_MODELS.get(slug) - if not model: - return {} - config_fields = {} - for name, field in model.model_fields.items(): - if not field.is_required(): - config_fields[name] = field.default - return config_fields - - -def _slug_to_method_name(slug: str) -> str: - """Convert slug like 'pii-detector' to method name like 'pii_detector'.""" - return slug.replace("-", "_") - - -def _method_name_to_slug(method_name: str) -> str: - """Convert method name like 'pii_detector' to slug like 'pii-detector'.""" - return method_name.replace("_", "-") - - -def create_evaluator(slug: str, **config: Any) -> EvaluatorDetails: - """Create an EvaluatorDetails for the given slug with optional config. - - Args: - slug: The evaluator slug (e.g., "pii-detector") - **config: Configuration options for the evaluator - - Returns: - EvaluatorDetails configured for the specified evaluator - - Example: - >>> from traceloop.sdk.evaluator import create_evaluator - >>> evaluator = create_evaluator("pii-detector", probability_threshold=0.8) - """ - if slug not in REQUEST_MODELS: - available = ", ".join(sorted(REQUEST_MODELS.keys())) - raise ValueError(f"Unknown evaluator slug: '{slug}'. 
Available: {available}") - - # Remove None values from config - config = {k: v for k, v in config.items() if v is not None} - return EvaluatorDetails( - slug=slug, - version=None, - config=config, - required_input_fields=_get_required_fields(slug), - ) - - -class _EvaluatorMadeByTraceloopMeta(type): - """Metaclass that dynamically generates evaluator factory methods.""" - - def __getattr__(cls, name: str) -> Any: - """Dynamically create factory methods for any evaluator slug.""" - slug = _method_name_to_slug(name) - if slug in REQUEST_MODELS: - - def factory(**config: Any) -> EvaluatorDetails: - return create_evaluator(slug, **config) - - factory.__name__ = name - config_fields = list(_get_config_fields(slug).keys()) or "none" - factory.__doc__ = f"Create {slug} evaluator. Config fields: {config_fields}" - return factory - raise AttributeError(f"'{cls.__name__}' has no attribute '{name}'") - - def __dir__(cls) -> List[str]: - """List all available evaluator methods.""" - methods = list(super().__dir__()) - for slug in REQUEST_MODELS: - methods.append(_slug_to_method_name(slug)) - return methods - - -class EvaluatorMadeByTraceloop(metaclass=_EvaluatorMadeByTraceloopMeta): - """ - Factory class for creating Traceloop evaluators with proper configuration. - - All evaluator slugs from the registry are available as methods. - Methods are dynamically generated from REQUEST_MODELS. - - Example: - >>> from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop - >>> - >>> evaluators = [ - ... EvaluatorMadeByTraceloop.pii_detector(probability_threshold=0.8), - ... EvaluatorMadeByTraceloop.toxicity_detector(threshold=0.7), - ... EvaluatorMadeByTraceloop.faithfulness(), - ... ] - - Available evaluators (auto-generated from registry): - - pii_detector, toxicity_detector, prompt_injection - - regex_validator, json_validator, sql_validator - - faithfulness, answer_relevancy, context_relevance - - agent_goal_accuracy, agent_efficiency, agent_flow_quality - - and more... 
(use dir(EvaluatorMadeByTraceloop) to see all) - """ - - pass diff --git a/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/__init__.py b/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/__init__.py index 5241cf6102..ce544faecd 100644 --- a/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/__init__.py +++ b/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/__init__.py @@ -5,41 +5,91 @@ # ./scripts/generate-models.sh /path/to/swagger.json from .request import ( + AgentEfficiencyInput, AgentEfficiencyRequest, + AgentFlowQualityConfigRequest, + AgentFlowQualityInput, AgentFlowQualityRequest, + AgentGoalAccuracyInput, AgentGoalAccuracyRequest, + AgentGoalCompletenessConfigRequest, + AgentGoalCompletenessInput, AgentGoalCompletenessRequest, + AgentToolErrorDetectorInput, AgentToolErrorDetectorRequest, + AgentToolTrajectoryConfigRequest, + AgentToolTrajectoryInput, + AgentToolTrajectoryRequest, + AnswerCompletenessInput, AnswerCompletenessRequest, + AnswerCorrectnessInput, AnswerCorrectnessRequest, + AnswerRelevancyInput, AnswerRelevancyRequest, + CharCountInput, + CharCountRatioInput, CharCountRatioRequest, CharCountRequest, + ContextRelevanceConfigRequest, + ContextRelevanceInput, ContextRelevanceRequest, + ConversationQualityInput, ConversationQualityRequest, + FaithfulnessInput, FaithfulnessRequest, + HtmlComparisonInput, + HtmlComparisonRequest, + InstructionAdherenceInput, InstructionAdherenceRequest, + IntentChangeInput, IntentChangeRequest, + JSONValidatorConfigRequest, + JSONValidatorInput, JSONValidatorRequest, + PIIDetectorConfigRequest, + PIIDetectorInput, PIIDetectorRequest, + PerplexityInput, PerplexityRequest, + PlaceholderRegexConfigRequest, + PlaceholderRegexInput, PlaceholderRegexRequest, + ProfanityDetectorInput, ProfanityDetectorRequest, + PromptInjectionConfigRequest, + PromptInjectionInput, PromptInjectionRequest, + PromptPerplexityInput, PromptPerplexityRequest, + RegexValidatorConfigRequest, + RegexValidatorInput, RegexValidatorRequest, + SQLValidatorInput, SQLValidatorRequest, + SecretsDetectorInput, SecretsDetectorRequest, + SemanticSimilarityInput, SemanticSimilarityRequest, + SexismDetectorConfigRequest, + SexismDetectorInput, SexismDetectorRequest, + ToneDetectionInput, ToneDetectionRequest, + TopicAdherenceInput, TopicAdherenceRequest, + ToxicityDetectorConfigRequest, + ToxicityDetectorInput, ToxicityDetectorRequest, + UncertaintyDetectorInput, UncertaintyDetectorRequest, + WordCountInput, + WordCountRatioInput, WordCountRatioRequest, WordCountRequest, ) +from .definitions import EvaluatorMadeByTraceloop + from .registry import ( REQUEST_MODELS, RESPONSE_MODELS, @@ -53,6 +103,7 @@ AgentGoalAccuracyResponse, AgentGoalCompletenessResponse, AgentToolErrorDetectorResponse, + AgentToolTrajectoryResponse, AnswerCompletenessResponse, AnswerCorrectnessResponse, AnswerRelevancyResponse, @@ -62,6 +113,7 @@ ConversationQualityResponse, ErrorResponse, FaithfulnessResponse, + HtmlComparisonResponse, InstructionAdherenceResponse, IntentChangeResponse, JSONValidatorResponse, @@ -85,43 +137,93 @@ ) __all__ = [ + # Factory class + "EvaluatorMadeByTraceloop", # Registry functions "REQUEST_MODELS", "RESPONSE_MODELS", "get_request_model", "get_response_model", # Evaluator request models + "AgentEfficiencyInput", "AgentEfficiencyRequest", + "AgentFlowQualityConfigRequest", + "AgentFlowQualityInput", "AgentFlowQualityRequest", + "AgentGoalAccuracyInput", "AgentGoalAccuracyRequest", + "AgentGoalCompletenessConfigRequest", + "AgentGoalCompletenessInput", 
"AgentGoalCompletenessRequest", + "AgentToolErrorDetectorInput", "AgentToolErrorDetectorRequest", + "AgentToolTrajectoryConfigRequest", + "AgentToolTrajectoryInput", + "AgentToolTrajectoryRequest", + "AnswerCompletenessInput", "AnswerCompletenessRequest", + "AnswerCorrectnessInput", "AnswerCorrectnessRequest", + "AnswerRelevancyInput", "AnswerRelevancyRequest", + "CharCountInput", + "CharCountRatioInput", "CharCountRatioRequest", "CharCountRequest", + "ContextRelevanceConfigRequest", + "ContextRelevanceInput", "ContextRelevanceRequest", + "ConversationQualityInput", "ConversationQualityRequest", + "FaithfulnessInput", "FaithfulnessRequest", + "HtmlComparisonInput", + "HtmlComparisonRequest", + "InstructionAdherenceInput", "InstructionAdherenceRequest", + "IntentChangeInput", "IntentChangeRequest", + "JSONValidatorConfigRequest", + "JSONValidatorInput", "JSONValidatorRequest", + "PIIDetectorConfigRequest", + "PIIDetectorInput", "PIIDetectorRequest", + "PerplexityInput", "PerplexityRequest", + "PlaceholderRegexConfigRequest", + "PlaceholderRegexInput", "PlaceholderRegexRequest", + "ProfanityDetectorInput", "ProfanityDetectorRequest", + "PromptInjectionConfigRequest", + "PromptInjectionInput", "PromptInjectionRequest", + "PromptPerplexityInput", "PromptPerplexityRequest", + "RegexValidatorConfigRequest", + "RegexValidatorInput", "RegexValidatorRequest", + "SQLValidatorInput", "SQLValidatorRequest", + "SecretsDetectorInput", "SecretsDetectorRequest", + "SemanticSimilarityInput", "SemanticSimilarityRequest", + "SexismDetectorConfigRequest", + "SexismDetectorInput", "SexismDetectorRequest", + "ToneDetectionInput", "ToneDetectionRequest", + "TopicAdherenceInput", "TopicAdherenceRequest", + "ToxicityDetectorConfigRequest", + "ToxicityDetectorInput", "ToxicityDetectorRequest", + "UncertaintyDetectorInput", "UncertaintyDetectorRequest", + "WordCountInput", + "WordCountRatioInput", "WordCountRatioRequest", "WordCountRequest", # Evaluator response models @@ -130,6 +232,7 @@ "AgentGoalAccuracyResponse", "AgentGoalCompletenessResponse", "AgentToolErrorDetectorResponse", + "AgentToolTrajectoryResponse", "AnswerCompletenessResponse", "AnswerCorrectnessResponse", "AnswerRelevancyResponse", @@ -139,6 +242,7 @@ "ConversationQualityResponse", "ErrorResponse", "FaithfulnessResponse", + "HtmlComparisonResponse", "InstructionAdherenceResponse", "IntentChangeResponse", "JSONValidatorResponse", diff --git a/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/definitions.py b/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/definitions.py new file mode 100644 index 0000000000..56689a6bf9 --- /dev/null +++ b/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/definitions.py @@ -0,0 +1,548 @@ +""" +Factory methods for creating Traceloop evaluators. + +Provides type-safe factory methods with IDE autocomplete support. + +DO NOT EDIT MANUALLY - Regenerate with: + ./scripts/generate-models.sh /path/to/swagger.json +""" +from __future__ import annotations + +from ...evaluator.config import EvaluatorDetails + + +class EvaluatorMadeByTraceloop: + """ + Factory class for creating Traceloop evaluators with type-safe configuration. + + Each method creates an EvaluatorDetails instance for a specific evaluator, + with properly typed configuration parameters. + + Example: + >>> from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop + >>> + >>> evaluators = [ + ... EvaluatorMadeByTraceloop.pii_detector(probability_threshold=0.8), + ... EvaluatorMadeByTraceloop.toxicity_detector(threshold=0.7), + ... 
EvaluatorMadeByTraceloop.faithfulness(), + ... ] + """ + + @staticmethod + def agent_efficiency() -> EvaluatorDetails: + """Create agent-efficiency evaluator. + + Required input fields: trajectory_completions, trajectory_prompts + """ + return EvaluatorDetails( + slug="agent-efficiency", + required_input_fields=['trajectory_completions', 'trajectory_prompts'], + ) + + @staticmethod + def agent_flow_quality( + conditions: list[str], + threshold: float, + ) -> EvaluatorDetails: + """Create agent-flow-quality evaluator. + + Args: + conditions: list[str] + threshold: float + + Required input fields: trajectory_completions, trajectory_prompts + """ + config = { + k: v for k, v in {"conditions": conditions, "threshold": threshold}.items() + if v is not None + } + return EvaluatorDetails( + slug="agent-flow-quality", + config=config if config else None, + required_input_fields=['trajectory_completions', 'trajectory_prompts'], + ) + + @staticmethod + def agent_goal_accuracy() -> EvaluatorDetails: + """Create agent-goal-accuracy evaluator. + + Required input fields: completion, question, reference + """ + return EvaluatorDetails( + slug="agent-goal-accuracy", + required_input_fields=['completion', 'question', 'reference'], + ) + + @staticmethod + def agent_goal_completeness( + threshold: float, + ) -> EvaluatorDetails: + """Create agent-goal-completeness evaluator. + + Args: + threshold: float + + Required input fields: trajectory_completions, trajectory_prompts + """ + config = { + k: v for k, v in {"threshold": threshold}.items() + if v is not None + } + return EvaluatorDetails( + slug="agent-goal-completeness", + config=config if config else None, + required_input_fields=['trajectory_completions', 'trajectory_prompts'], + ) + + @staticmethod + def agent_tool_error_detector() -> EvaluatorDetails: + """Create agent-tool-error-detector evaluator. + + Required input fields: tool_input, tool_output + """ + return EvaluatorDetails( + slug="agent-tool-error-detector", + required_input_fields=['tool_input', 'tool_output'], + ) + + @staticmethod + def agent_tool_trajectory( + input_params_sensitive: bool | None = None, + mismatch_sensitive: bool | None = None, + order_sensitive: bool | None = None, + threshold: float | None = None, + ) -> EvaluatorDetails: + """Create agent-tool-trajectory evaluator. + + Args: + input_params_sensitive: bool + mismatch_sensitive: bool + order_sensitive: bool + threshold: float + + Required input fields: executed_tool_calls, expected_tool_calls + """ + config = { + k: v for k, v in {"input_params_sensitive": input_params_sensitive, "mismatch_sensitive": mismatch_sensitive, "order_sensitive": order_sensitive, "threshold": threshold}.items() + if v is not None + } + return EvaluatorDetails( + slug="agent-tool-trajectory", + config=config if config else None, + required_input_fields=['executed_tool_calls', 'expected_tool_calls'], + ) + + @staticmethod + def answer_completeness() -> EvaluatorDetails: + """Create answer-completeness evaluator. + + Required input fields: completion, context, question + """ + return EvaluatorDetails( + slug="answer-completeness", + required_input_fields=['completion', 'context', 'question'], + ) + + @staticmethod + def answer_correctness() -> EvaluatorDetails: + """Create answer-correctness evaluator. 
+ + Required input fields: completion, ground_truth, question + """ + return EvaluatorDetails( + slug="answer-correctness", + required_input_fields=['completion', 'ground_truth', 'question'], + ) + + @staticmethod + def answer_relevancy() -> EvaluatorDetails: + """Create answer-relevancy evaluator. + + Required input fields: answer, question + """ + return EvaluatorDetails( + slug="answer-relevancy", + required_input_fields=['answer', 'question'], + ) + + @staticmethod + def char_count() -> EvaluatorDetails: + """Create char-count evaluator. + + Required input fields: text + """ + return EvaluatorDetails( + slug="char-count", + required_input_fields=['text'], + ) + + @staticmethod + def char_count_ratio() -> EvaluatorDetails: + """Create char-count-ratio evaluator. + + Required input fields: denominator_text, numerator_text + """ + return EvaluatorDetails( + slug="char-count-ratio", + required_input_fields=['denominator_text', 'numerator_text'], + ) + + @staticmethod + def context_relevance( + model: str | None = None, + ) -> EvaluatorDetails: + """Create context-relevance evaluator. + + Args: + model: str + + Required input fields: context, query + """ + config = { + k: v for k, v in {"model": model}.items() + if v is not None + } + return EvaluatorDetails( + slug="context-relevance", + config=config if config else None, + required_input_fields=['context', 'query'], + ) + + @staticmethod + def conversation_quality() -> EvaluatorDetails: + """Create conversation-quality evaluator. + + Required input fields: completions, prompts + """ + return EvaluatorDetails( + slug="conversation-quality", + required_input_fields=['completions', 'prompts'], + ) + + @staticmethod + def faithfulness() -> EvaluatorDetails: + """Create faithfulness evaluator. + + Required input fields: completion, context, question + """ + return EvaluatorDetails( + slug="faithfulness", + required_input_fields=['completion', 'context', 'question'], + ) + + @staticmethod + def html_comparison() -> EvaluatorDetails: + """Create html-comparison evaluator. + + Required input fields: html1, html2 + """ + return EvaluatorDetails( + slug="html-comparison", + required_input_fields=['html1', 'html2'], + ) + + @staticmethod + def instruction_adherence() -> EvaluatorDetails: + """Create instruction-adherence evaluator. + + Required input fields: instructions, response + """ + return EvaluatorDetails( + slug="instruction-adherence", + required_input_fields=['instructions', 'response'], + ) + + @staticmethod + def intent_change() -> EvaluatorDetails: + """Create intent-change evaluator. + + Required input fields: completions, prompts + """ + return EvaluatorDetails( + slug="intent-change", + required_input_fields=['completions', 'prompts'], + ) + + @staticmethod + def json_validator( + enable_schema_validation: bool | None = None, + schema_string: str | None = None, + ) -> EvaluatorDetails: + """Create json-validator evaluator. + + Args: + enable_schema_validation: bool + schema_string: str + + Required input fields: text + """ + config = { + k: v for k, v in {"enable_schema_validation": enable_schema_validation, "schema_string": schema_string}.items() + if v is not None + } + return EvaluatorDetails( + slug="json-validator", + config=config if config else None, + required_input_fields=['text'], + ) + + @staticmethod + def perplexity() -> EvaluatorDetails: + """Create perplexity evaluator. 
+ + Required input fields: logprobs + """ + return EvaluatorDetails( + slug="perplexity", + required_input_fields=['logprobs'], + ) + + @staticmethod + def pii_detector( + probability_threshold: float | None = None, + ) -> EvaluatorDetails: + """Create pii-detector evaluator. + + Args: + probability_threshold: float + + Required input fields: text + """ + config = { + k: v for k, v in {"probability_threshold": probability_threshold}.items() + if v is not None + } + return EvaluatorDetails( + slug="pii-detector", + config=config if config else None, + required_input_fields=['text'], + ) + + @staticmethod + def placeholder_regex( + case_sensitive: bool | None = None, + dot_include_nl: bool | None = None, + multi_line: bool | None = None, + should_match: bool | None = None, + ) -> EvaluatorDetails: + """Create placeholder-regex evaluator. + + Args: + case_sensitive: bool + dot_include_nl: bool + multi_line: bool + should_match: bool + + Required input fields: placeholder_value, text + """ + config = { + k: v for k, v in {"case_sensitive": case_sensitive, "dot_include_nl": dot_include_nl, "multi_line": multi_line, "should_match": should_match}.items() + if v is not None + } + return EvaluatorDetails( + slug="placeholder-regex", + config=config if config else None, + required_input_fields=['placeholder_value', 'text'], + ) + + @staticmethod + def profanity_detector() -> EvaluatorDetails: + """Create profanity-detector evaluator. + + Required input fields: text + """ + return EvaluatorDetails( + slug="profanity-detector", + required_input_fields=['text'], + ) + + @staticmethod + def prompt_injection( + threshold: float | None = None, + ) -> EvaluatorDetails: + """Create prompt-injection evaluator. + + Args: + threshold: float + + Required input fields: prompt + """ + config = { + k: v for k, v in {"threshold": threshold}.items() + if v is not None + } + return EvaluatorDetails( + slug="prompt-injection", + config=config if config else None, + required_input_fields=['prompt'], + ) + + @staticmethod + def prompt_perplexity() -> EvaluatorDetails: + """Create prompt-perplexity evaluator. + + Required input fields: prompt + """ + return EvaluatorDetails( + slug="prompt-perplexity", + required_input_fields=['prompt'], + ) + + @staticmethod + def regex_validator( + case_sensitive: bool | None = None, + dot_include_nl: bool | None = None, + multi_line: bool | None = None, + regex: str | None = None, + should_match: bool | None = None, + ) -> EvaluatorDetails: + """Create regex-validator evaluator. + + Args: + case_sensitive: bool + dot_include_nl: bool + multi_line: bool + regex: str + should_match: bool + + Required input fields: text + """ + config = { + k: v for k, v in {"case_sensitive": case_sensitive, "dot_include_nl": dot_include_nl, "multi_line": multi_line, "regex": regex, "should_match": should_match}.items() + if v is not None + } + return EvaluatorDetails( + slug="regex-validator", + config=config if config else None, + required_input_fields=['text'], + ) + + @staticmethod + def secrets_detector() -> EvaluatorDetails: + """Create secrets-detector evaluator. + + Required input fields: text + """ + return EvaluatorDetails( + slug="secrets-detector", + required_input_fields=['text'], + ) + + @staticmethod + def semantic_similarity() -> EvaluatorDetails: + """Create semantic-similarity evaluator. 
+ + Required input fields: completion, reference + """ + return EvaluatorDetails( + slug="semantic-similarity", + required_input_fields=['completion', 'reference'], + ) + + @staticmethod + def sexism_detector( + threshold: float | None = None, + ) -> EvaluatorDetails: + """Create sexism-detector evaluator. + + Args: + threshold: float + + Required input fields: text + """ + config = { + k: v for k, v in {"threshold": threshold}.items() + if v is not None + } + return EvaluatorDetails( + slug="sexism-detector", + config=config if config else None, + required_input_fields=['text'], + ) + + @staticmethod + def sql_validator() -> EvaluatorDetails: + """Create sql-validator evaluator. + + Required input fields: text + """ + return EvaluatorDetails( + slug="sql-validator", + required_input_fields=['text'], + ) + + @staticmethod + def tone_detection() -> EvaluatorDetails: + """Create tone-detection evaluator. + + Required input fields: text + """ + return EvaluatorDetails( + slug="tone-detection", + required_input_fields=['text'], + ) + + @staticmethod + def topic_adherence() -> EvaluatorDetails: + """Create topic-adherence evaluator. + + Required input fields: completion, question, reference_topics + """ + return EvaluatorDetails( + slug="topic-adherence", + required_input_fields=['completion', 'question', 'reference_topics'], + ) + + @staticmethod + def toxicity_detector( + threshold: float | None = None, + ) -> EvaluatorDetails: + """Create toxicity-detector evaluator. + + Args: + threshold: float + + Required input fields: text + """ + config = { + k: v for k, v in {"threshold": threshold}.items() + if v is not None + } + return EvaluatorDetails( + slug="toxicity-detector", + config=config if config else None, + required_input_fields=['text'], + ) + + @staticmethod + def uncertainty_detector() -> EvaluatorDetails: + """Create uncertainty-detector evaluator. + + Required input fields: prompt + """ + return EvaluatorDetails( + slug="uncertainty-detector", + required_input_fields=['prompt'], + ) + + @staticmethod + def word_count() -> EvaluatorDetails: + """Create word-count evaluator. + + Required input fields: text + """ + return EvaluatorDetails( + slug="word-count", + required_input_fields=['text'], + ) + + @staticmethod + def word_count_ratio() -> EvaluatorDetails: + """Create word-count-ratio evaluator. 
+ + Required input fields: denominator_text, numerator_text + """ + return EvaluatorDetails( + slug="word-count-ratio", + required_input_fields=['denominator_text', 'numerator_text'], + ) diff --git a/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/registry.py b/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/registry.py index 56a02362e9..c30767a827 100644 --- a/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/registry.py +++ b/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/registry.py @@ -16,6 +16,7 @@ AgentGoalAccuracyRequest, AgentGoalCompletenessRequest, AgentToolErrorDetectorRequest, + AgentToolTrajectoryRequest, AnswerCompletenessRequest, AnswerCorrectnessRequest, AnswerRelevancyRequest, @@ -24,6 +25,7 @@ ContextRelevanceRequest, ConversationQualityRequest, FaithfulnessRequest, + HtmlComparisonRequest, InstructionAdherenceRequest, IntentChangeRequest, JSONValidatorRequest, @@ -52,6 +54,7 @@ AgentGoalAccuracyResponse, AgentGoalCompletenessResponse, AgentToolErrorDetectorResponse, + AgentToolTrajectoryResponse, AnswerCompletenessResponse, AnswerCorrectnessResponse, AnswerRelevancyResponse, @@ -60,6 +63,7 @@ ContextRelevanceResponse, ConversationQualityResponse, FaithfulnessResponse, + HtmlComparisonResponse, InstructionAdherenceResponse, IntentChangeResponse, JSONValidatorResponse, @@ -90,6 +94,7 @@ "agent-goal-accuracy": AgentGoalAccuracyRequest, "agent-goal-completeness": AgentGoalCompletenessRequest, "agent-tool-error-detector": AgentToolErrorDetectorRequest, + "agent-tool-trajectory": AgentToolTrajectoryRequest, "answer-completeness": AnswerCompletenessRequest, "answer-correctness": AnswerCorrectnessRequest, "answer-relevancy": AnswerRelevancyRequest, @@ -98,6 +103,7 @@ "context-relevance": ContextRelevanceRequest, "conversation-quality": ConversationQualityRequest, "faithfulness": FaithfulnessRequest, + "html-comparison": HtmlComparisonRequest, "instruction-adherence": InstructionAdherenceRequest, "intent-change": IntentChangeRequest, "json-validator": JSONValidatorRequest, @@ -127,6 +133,7 @@ "agent-goal-accuracy": AgentGoalAccuracyResponse, "agent-goal-completeness": AgentGoalCompletenessResponse, "agent-tool-error-detector": AgentToolErrorDetectorResponse, + "agent-tool-trajectory": AgentToolTrajectoryResponse, "answer-completeness": AnswerCompletenessResponse, "answer-correctness": AnswerCorrectnessResponse, "answer-relevancy": AnswerRelevancyResponse, @@ -135,6 +142,7 @@ "context-relevance": ContextRelevanceResponse, "conversation-quality": ConversationQualityResponse, "faithfulness": FaithfulnessResponse, + "html-comparison": HtmlComparisonResponse, "instruction-adherence": InstructionAdherenceResponse, "intent-change": IntentChangeResponse, "json-validator": JSONValidatorResponse, diff --git a/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/request.py b/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/request.py index b869224ce6..7e9f9b2e87 100644 --- a/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/request.py +++ b/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/request.py @@ -1,14 +1,12 @@ # generated by datamodel-codegen: -# filename: tmpvqz8m01b.json +# filename: tmpslpb59ih.json from __future__ import annotations -from typing import Optional - from pydantic import BaseModel, Field -class AgentEfficiencyRequest(BaseModel): +class AgentEfficiencyInput(BaseModel): trajectory_completions: str = Field( ..., examples=['["User found", "Email updated", "Changes saved"]'] ) @@ -17,11 +15,18 @@ class 
AgentEfficiencyRequest(BaseModel): ) -class AgentFlowQualityRequest(BaseModel): +class AgentEfficiencyRequest(BaseModel): + input: AgentEfficiencyInput + + +class AgentFlowQualityConfigRequest(BaseModel): conditions: list[str] = Field( ..., examples=[['no tools called', 'agent completed task']] ) threshold: float = Field(..., examples=[0.5]) + + +class AgentFlowQualityInput(BaseModel): trajectory_completions: str = Field( ..., examples=['["Found 5 flights", "Selected $299 flight", "Booking confirmed"]'], @@ -34,7 +39,12 @@ class AgentFlowQualityRequest(BaseModel): ) -class AgentGoalAccuracyRequest(BaseModel): +class AgentFlowQualityRequest(BaseModel): + config: AgentFlowQualityConfigRequest + input: AgentFlowQualityInput + + +class AgentGoalAccuracyInput(BaseModel): completion: str = Field( ..., examples=[ @@ -47,8 +57,15 @@ class AgentGoalAccuracyRequest(BaseModel): reference: str = Field(..., examples=['Flight booked: NYC to LA, Monday departure']) -class AgentGoalCompletenessRequest(BaseModel): +class AgentGoalAccuracyRequest(BaseModel): + input: AgentGoalAccuracyInput + + +class AgentGoalCompletenessConfigRequest(BaseModel): threshold: float = Field(..., examples=[0.5]) + + +class AgentGoalCompletenessInput(BaseModel): trajectory_completions: str = Field( ..., examples=['["Account created", "Preferences saved", "Notifications enabled"]'], @@ -59,7 +76,12 @@ class AgentGoalCompletenessRequest(BaseModel): ) -class AgentToolErrorDetectorRequest(BaseModel): +class AgentGoalCompletenessRequest(BaseModel): + config: AgentGoalCompletenessConfigRequest | None = None + input: AgentGoalCompletenessInput + + +class AgentToolErrorDetectorInput(BaseModel): tool_input: str = Field( ..., examples=['{"action": "search", "query": "flights to Paris"}'] ) @@ -71,59 +93,115 @@ class AgentToolErrorDetectorRequest(BaseModel): ) -class AnswerCompletenessRequest(BaseModel): +class AgentToolErrorDetectorRequest(BaseModel): + input: AgentToolErrorDetectorInput + + +class AgentToolTrajectoryConfigRequest(BaseModel): + input_params_sensitive: bool | None = Field(None, examples=[True]) + mismatch_sensitive: bool | None = Field(None, examples=[False]) + order_sensitive: bool | None = Field(None, examples=[False]) + threshold: float | None = Field(None, examples=[0.5]) + + +class AgentToolTrajectoryInput(BaseModel): + executed_tool_calls: str = Field( + ..., examples=['[{"name": "search", "input": {"query": "weather"}}]'] + ) + expected_tool_calls: str = Field( + ..., examples=['[{"name": "search", "input": {"query": "weather"}}]'] + ) + + +class AgentToolTrajectoryRequest(BaseModel): + config: AgentToolTrajectoryConfigRequest | None = None + input: AgentToolTrajectoryInput + + +class AnswerCompletenessInput(BaseModel): completion: str = Field(..., examples=['Paris.']) context: str = Field(..., examples=['The capital of France is Paris.']) question: str = Field(..., examples=['What is the capital of France?']) -class AnswerCorrectnessRequest(BaseModel): +class AnswerCompletenessRequest(BaseModel): + input: AnswerCompletenessInput + + +class AnswerCorrectnessInput(BaseModel): completion: str = Field(..., examples=['World War II ended in 1945.']) ground_truth: str = Field(..., examples=['1945']) question: str = Field(..., examples=['What year did World War II end?']) -class AnswerRelevancyRequest(BaseModel): +class AnswerCorrectnessRequest(BaseModel): + input: AnswerCorrectnessInput + + +class AnswerRelevancyInput(BaseModel): answer: str = Field(..., examples=['The capital of France is Paris.']) question: str = 
Field(..., examples=['What is the capital of France?']) -class CharCountRatioRequest(BaseModel): +class AnswerRelevancyRequest(BaseModel): + input: AnswerRelevancyInput + + +class CharCountInput(BaseModel): + text: str = Field(..., examples=['Hello, world! This is a sample text.']) + + +class CharCountRatioInput(BaseModel): denominator_text: str = Field( ..., examples=['This is a longer text for comparison'] ) numerator_text: str = Field(..., examples=['Short text']) +class CharCountRatioRequest(BaseModel): + input: CharCountRatioInput + + class CharCountRequest(BaseModel): - text: str = Field(..., examples=['Hello, world! This is a sample text.']) + input: CharCountInput -class ContextRelevanceRequest(BaseModel): +class ContextRelevanceConfigRequest(BaseModel): + model: str | None = Field(None, examples=['gpt-4o']) + + +class ContextRelevanceInput(BaseModel): context: str = Field( ..., examples=[ 'Our store is open Monday to Friday from 9am to 6pm, and Saturday from 10am to 4pm. We are closed on Sundays.' ], ) - model: Optional[str] = Field(None, examples=['gpt-4o']) query: str = Field(..., examples=['What are the business hours?']) -class ConversationQualityRequest(BaseModel): +class ContextRelevanceRequest(BaseModel): + config: ContextRelevanceConfigRequest | None = None + input: ContextRelevanceInput + + +class ConversationQualityInput(BaseModel): completions: str = Field( ..., examples=[ '["Hi! I\'d be happy to assist you today.", "We offer consulting, development, and support services."]' ], ) - model: Optional[str] = Field(None, examples=['gpt-4o']) prompts: str = Field( ..., examples=['["Hello, how can I help?", "What services do you offer?"]'] ) -class FaithfulnessRequest(BaseModel): +class ConversationQualityRequest(BaseModel): + input: ConversationQualityInput + + +class FaithfulnessInput(BaseModel): completion: str = Field( ..., examples=['The Eiffel Tower is located in Paris and was built in 1889.'] ) @@ -136,7 +214,24 @@ class FaithfulnessRequest(BaseModel): question: str = Field(..., examples=['When was the Eiffel Tower built?']) -class InstructionAdherenceRequest(BaseModel): +class FaithfulnessRequest(BaseModel): + input: FaithfulnessInput + + +class HtmlComparisonInput(BaseModel): + html1: str = Field( + ..., examples=['