diff --git a/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluator.py b/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluator.py index 6fa72d0f3d..c8b8339bb9 100644 --- a/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluator.py +++ b/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluator.py @@ -16,19 +16,11 @@ def _validate_evaluator_input(slug: str, input: Dict[str, str]) -> None: - """Validate input against the evaluator's request model if available. - - Args: - slug: The evaluator slug (e.g., "pii-detector") - input: Dictionary of input field names to values - - Raises: - ValueError: If input fails validation against the request model - """ + """Validate input against the evaluator's request model if available.""" request_model = get_request_model(slug) if request_model: try: - request_model(**input) + request_model(input=input) except ValidationError as e: raise ValueError(f"Invalid input for '{slug}': {e}") from e diff --git a/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluators_made_by_traceloop.py b/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluators_made_by_traceloop.py index 75c2da1931..3dbf704658 100644 --- a/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluators_made_by_traceloop.py +++ b/packages/traceloop-sdk/traceloop/sdk/evaluator/evaluators_made_by_traceloop.py @@ -15,18 +15,45 @@ def _get_required_fields(slug: str) -> List[str]: model = REQUEST_MODELS.get(slug) if not model: return [] - return [name for name, field in model.model_fields.items() if field.is_required()] + + input_field = model.model_fields.get("input") + if not input_field or not input_field.annotation: + return [] + + input_model = input_field.annotation + if not hasattr(input_model, "model_fields"): + return [] + + return [ + name for name, field in input_model.model_fields.items() + if field.is_required() + ] def _get_config_fields(slug: str) -> dict: - """Get config fields (non-required) with their defaults from the request model.""" + """Get config fields with their defaults from the request model.""" model = REQUEST_MODELS.get(slug) if not model: return {} + + config_field = model.model_fields.get("config") + if not config_field or not config_field.annotation: + return {} + + config_annotation = config_field.annotation + origin = getattr(config_annotation, "__origin__", None) + if origin is not None: + args = getattr(config_annotation, "__args__", ()) + config_model = next((a for a in args if a is not type(None)), None) + else: + config_model = config_annotation + + if not config_model or not hasattr(config_model, "model_fields"): + return {} + config_fields = {} - for name, field in model.model_fields.items(): - if not field.is_required(): - config_fields[name] = field.default + for name, field in config_model.model_fields.items(): + config_fields[name] = field.default return config_fields diff --git a/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/__init__.py b/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/__init__.py index 5241cf6102..42c255a601 100644 --- a/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/__init__.py +++ b/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/__init__.py @@ -5,37 +5,82 @@ # ./scripts/generate-models.sh /path/to/swagger.json from .request import ( + AgentEfficiencyInput, AgentEfficiencyRequest, + AgentFlowQualityConfigRequest, + AgentFlowQualityInput, AgentFlowQualityRequest, + AgentGoalAccuracyInput, AgentGoalAccuracyRequest, + AgentGoalCompletenessConfigRequest, + AgentGoalCompletenessInput, AgentGoalCompletenessRequest, + AgentToolErrorDetectorInput, AgentToolErrorDetectorRequest, + AnswerCompletenessInput, AnswerCompletenessRequest, + AnswerCorrectnessInput, AnswerCorrectnessRequest, + AnswerRelevancyInput, AnswerRelevancyRequest, + CharCountInput, + CharCountRatioInput, CharCountRatioRequest, CharCountRequest, + ContextRelevanceConfigRequest, + ContextRelevanceInput, ContextRelevanceRequest, + ConversationQualityConfigRequest, + ConversationQualityInput, ConversationQualityRequest, + FaithfulnessInput, FaithfulnessRequest, + InstructionAdherenceInput, InstructionAdherenceRequest, + IntentChangeConfigRequest, + IntentChangeInput, IntentChangeRequest, + JSONValidatorConfigRequest, + JSONValidatorInput, JSONValidatorRequest, + PIIDetectorConfigRequest, + PIIDetectorInput, PIIDetectorRequest, + PerplexityInput, PerplexityRequest, + PlaceholderRegexConfigRequest, + PlaceholderRegexInput, PlaceholderRegexRequest, + ProfanityDetectorInput, ProfanityDetectorRequest, + PromptInjectionConfigRequest, + PromptInjectionInput, PromptInjectionRequest, + PromptPerplexityInput, PromptPerplexityRequest, + RegexValidatorConfigRequest, + RegexValidatorInput, RegexValidatorRequest, + SQLValidatorInput, SQLValidatorRequest, + SecretsDetectorInput, SecretsDetectorRequest, + SemanticSimilarityInput, SemanticSimilarityRequest, + SexismDetectorConfigRequest, + SexismDetectorInput, SexismDetectorRequest, + ToneDetectionInput, ToneDetectionRequest, + TopicAdherenceInput, TopicAdherenceRequest, + ToxicityDetectorConfigRequest, + ToxicityDetectorInput, ToxicityDetectorRequest, + UncertaintyDetectorInput, UncertaintyDetectorRequest, + WordCountInput, + WordCountRatioInput, WordCountRatioRequest, WordCountRequest, ) @@ -91,37 +136,82 @@ "get_request_model", "get_response_model", # Evaluator request models + "AgentEfficiencyInput", "AgentEfficiencyRequest", + "AgentFlowQualityConfigRequest", + "AgentFlowQualityInput", "AgentFlowQualityRequest", + "AgentGoalAccuracyInput", "AgentGoalAccuracyRequest", + "AgentGoalCompletenessConfigRequest", + "AgentGoalCompletenessInput", "AgentGoalCompletenessRequest", + "AgentToolErrorDetectorInput", "AgentToolErrorDetectorRequest", + "AnswerCompletenessInput", "AnswerCompletenessRequest", + "AnswerCorrectnessInput", "AnswerCorrectnessRequest", + "AnswerRelevancyInput", "AnswerRelevancyRequest", + "CharCountInput", + "CharCountRatioInput", "CharCountRatioRequest", "CharCountRequest", + "ContextRelevanceConfigRequest", + "ContextRelevanceInput", "ContextRelevanceRequest", + "ConversationQualityConfigRequest", + "ConversationQualityInput", "ConversationQualityRequest", + "FaithfulnessInput", "FaithfulnessRequest", + "InstructionAdherenceInput", "InstructionAdherenceRequest", + "IntentChangeConfigRequest", + "IntentChangeInput", "IntentChangeRequest", + "JSONValidatorConfigRequest", + "JSONValidatorInput", "JSONValidatorRequest", + "PIIDetectorConfigRequest", + "PIIDetectorInput", "PIIDetectorRequest", + "PerplexityInput", "PerplexityRequest", + "PlaceholderRegexConfigRequest", + "PlaceholderRegexInput", "PlaceholderRegexRequest", + "ProfanityDetectorInput", "ProfanityDetectorRequest", + "PromptInjectionConfigRequest", + "PromptInjectionInput", "PromptInjectionRequest", + "PromptPerplexityInput", "PromptPerplexityRequest", + "RegexValidatorConfigRequest", + "RegexValidatorInput", "RegexValidatorRequest", + "SQLValidatorInput", "SQLValidatorRequest", + "SecretsDetectorInput", "SecretsDetectorRequest", + "SemanticSimilarityInput", "SemanticSimilarityRequest", + "SexismDetectorConfigRequest", + "SexismDetectorInput", "SexismDetectorRequest", + "ToneDetectionInput", "ToneDetectionRequest", + "TopicAdherenceInput", "TopicAdherenceRequest", + "ToxicityDetectorConfigRequest", + "ToxicityDetectorInput", "ToxicityDetectorRequest", + "UncertaintyDetectorInput", "UncertaintyDetectorRequest", + "WordCountInput", + "WordCountRatioInput", "WordCountRatioRequest", "WordCountRequest", # Evaluator response models diff --git a/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/request.py b/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/request.py index b869224ce6..2b434e679b 100644 --- a/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/request.py +++ b/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/request.py @@ -1,5 +1,5 @@ # generated by datamodel-codegen: -# filename: tmpvqz8m01b.json +# filename: tmpp6g09qxu.json from __future__ import annotations @@ -8,7 +8,7 @@ from pydantic import BaseModel, Field -class AgentEfficiencyRequest(BaseModel): +class AgentEfficiencyInput(BaseModel): trajectory_completions: str = Field( ..., examples=['["User found", "Email updated", "Changes saved"]'] ) @@ -17,11 +17,18 @@ class AgentEfficiencyRequest(BaseModel): ) -class AgentFlowQualityRequest(BaseModel): +class AgentEfficiencyRequest(BaseModel): + input: AgentEfficiencyInput + + +class AgentFlowQualityConfigRequest(BaseModel): conditions: list[str] = Field( ..., examples=[['no tools called', 'agent completed task']] ) threshold: float = Field(..., examples=[0.5]) + + +class AgentFlowQualityInput(BaseModel): trajectory_completions: str = Field( ..., examples=['["Found 5 flights", "Selected $299 flight", "Booking confirmed"]'], @@ -34,7 +41,12 @@ class AgentFlowQualityRequest(BaseModel): ) -class AgentGoalAccuracyRequest(BaseModel): +class AgentFlowQualityRequest(BaseModel): + config: AgentFlowQualityConfigRequest + input: AgentFlowQualityInput + + +class AgentGoalAccuracyInput(BaseModel): completion: str = Field( ..., examples=[ @@ -47,8 +59,15 @@ class AgentGoalAccuracyRequest(BaseModel): reference: str = Field(..., examples=['Flight booked: NYC to LA, Monday departure']) -class AgentGoalCompletenessRequest(BaseModel): +class AgentGoalAccuracyRequest(BaseModel): + input: AgentGoalAccuracyInput + + +class AgentGoalCompletenessConfigRequest(BaseModel): threshold: float = Field(..., examples=[0.5]) + + +class AgentGoalCompletenessInput(BaseModel): trajectory_completions: str = Field( ..., examples=['["Account created", "Preferences saved", "Notifications enabled"]'], @@ -59,7 +78,12 @@ class AgentGoalCompletenessRequest(BaseModel): ) -class AgentToolErrorDetectorRequest(BaseModel): +class AgentGoalCompletenessRequest(BaseModel): + config: Optional[AgentGoalCompletenessConfigRequest] = None + input: AgentGoalCompletenessInput + + +class AgentToolErrorDetectorInput(BaseModel): tool_input: str = Field( ..., examples=['{"action": "search", "query": "flights to Paris"}'] ) @@ -71,59 +95,99 @@ class AgentToolErrorDetectorRequest(BaseModel): ) -class AnswerCompletenessRequest(BaseModel): +class AgentToolErrorDetectorRequest(BaseModel): + input: AgentToolErrorDetectorInput + + +class AnswerCompletenessInput(BaseModel): completion: str = Field(..., examples=['Paris.']) context: str = Field(..., examples=['The capital of France is Paris.']) question: str = Field(..., examples=['What is the capital of France?']) -class AnswerCorrectnessRequest(BaseModel): +class AnswerCompletenessRequest(BaseModel): + input: AnswerCompletenessInput + + +class AnswerCorrectnessInput(BaseModel): completion: str = Field(..., examples=['World War II ended in 1945.']) ground_truth: str = Field(..., examples=['1945']) question: str = Field(..., examples=['What year did World War II end?']) -class AnswerRelevancyRequest(BaseModel): +class AnswerCorrectnessRequest(BaseModel): + input: AnswerCorrectnessInput + + +class AnswerRelevancyInput(BaseModel): answer: str = Field(..., examples=['The capital of France is Paris.']) question: str = Field(..., examples=['What is the capital of France?']) -class CharCountRatioRequest(BaseModel): +class AnswerRelevancyRequest(BaseModel): + input: AnswerRelevancyInput + + +class CharCountInput(BaseModel): + text: str = Field(..., examples=['Hello, world! This is a sample text.']) + + +class CharCountRatioInput(BaseModel): denominator_text: str = Field( ..., examples=['This is a longer text for comparison'] ) numerator_text: str = Field(..., examples=['Short text']) +class CharCountRatioRequest(BaseModel): + input: CharCountRatioInput + + class CharCountRequest(BaseModel): - text: str = Field(..., examples=['Hello, world! This is a sample text.']) + input: CharCountInput -class ContextRelevanceRequest(BaseModel): +class ContextRelevanceConfigRequest(BaseModel): + model: Optional[str] = Field(None, examples=['gpt-4o']) + + +class ContextRelevanceInput(BaseModel): context: str = Field( ..., examples=[ 'Our store is open Monday to Friday from 9am to 6pm, and Saturday from 10am to 4pm. We are closed on Sundays.' ], ) - model: Optional[str] = Field(None, examples=['gpt-4o']) query: str = Field(..., examples=['What are the business hours?']) -class ConversationQualityRequest(BaseModel): +class ContextRelevanceRequest(BaseModel): + config: Optional[ContextRelevanceConfigRequest] = None + input: ContextRelevanceInput + + +class ConversationQualityConfigRequest(ContextRelevanceConfigRequest): + pass + + +class ConversationQualityInput(BaseModel): completions: str = Field( ..., examples=[ '["Hi! I\'d be happy to assist you today.", "We offer consulting, development, and support services."]' ], ) - model: Optional[str] = Field(None, examples=['gpt-4o']) prompts: str = Field( ..., examples=['["Hello, how can I help?", "What services do you offer?"]'] ) -class FaithfulnessRequest(BaseModel): +class ConversationQualityRequest(BaseModel): + config: Optional[ConversationQualityConfigRequest] = None + input: ConversationQualityInput + + +class FaithfulnessInput(BaseModel): completion: str = Field( ..., examples=['The Eiffel Tower is located in Paris and was built in 1889.'] ) @@ -136,7 +200,11 @@ class FaithfulnessRequest(BaseModel): question: str = Field(..., examples=['When was the Eiffel Tower built?']) -class InstructionAdherenceRequest(BaseModel): +class FaithfulnessRequest(BaseModel): + input: FaithfulnessInput + + +class InstructionAdherenceInput(BaseModel): instructions: str = Field( ..., examples=['Respond in exactly 3 bullet points and use formal language.'] ) @@ -148,28 +216,51 @@ class InstructionAdherenceRequest(BaseModel): ) -class IntentChangeRequest(BaseModel): +class InstructionAdherenceRequest(BaseModel): + input: InstructionAdherenceInput + + +class IntentChangeConfigRequest(ContextRelevanceConfigRequest): + pass + + +class IntentChangeInput(BaseModel): completions: str = Field( ..., examples=[ '["Sure, I can help with hotel booking", "No problem, let me search for flights"]' ], ) - model: Optional[str] = Field(None, examples=['gpt-4o']) prompts: str = Field( ..., examples=['["I want to book a hotel", "Actually, I need a flight instead"]'], ) -class JSONValidatorRequest(BaseModel): +class IntentChangeRequest(BaseModel): + config: Optional[IntentChangeConfigRequest] = None + input: IntentChangeInput + + +class JSONValidatorConfigRequest(BaseModel): enable_schema_validation: Optional[bool] = Field(None, examples=[True]) schema_string: Optional[str] = Field(None, examples=['{}']) + + +class JSONValidatorInput(BaseModel): text: str = Field(..., examples=['{"name": "John", "age": 30}']) -class PIIDetectorRequest(BaseModel): +class JSONValidatorRequest(BaseModel): + config: Optional[JSONValidatorConfigRequest] = None + input: JSONValidatorInput + + +class PIIDetectorConfigRequest(BaseModel): probability_threshold: Optional[float] = Field(None, examples=[0.8]) + + +class PIIDetectorInput(BaseModel): text: str = Field( ..., examples=[ @@ -178,71 +269,136 @@ class PIIDetectorRequest(BaseModel): ) -class PerplexityRequest(BaseModel): +class PIIDetectorRequest(BaseModel): + config: Optional[PIIDetectorConfigRequest] = None + input: PIIDetectorInput + + +class PerplexityInput(BaseModel): logprobs: str = Field(..., examples=['[-2.3, -1.5, -0.8, -1.2, -0.5]']) -class PlaceholderRegexRequest(BaseModel): +class PerplexityRequest(BaseModel): + input: PerplexityInput + + +class PlaceholderRegexConfigRequest(BaseModel): case_sensitive: Optional[bool] = Field(None, examples=[True]) dot_include_nl: Optional[bool] = Field(None, examples=[True]) multi_line: Optional[bool] = Field(None, examples=[True]) + should_match: Optional[bool] = Field(None, examples=[True]) + + +class PlaceholderRegexInput(BaseModel): placeholder_value: str = Field( ..., examples=['[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}'] ) - should_match: Optional[bool] = Field(None, examples=[True]) text: str = Field(..., examples=['user@example.com']) -class ProfanityDetectorRequest(BaseModel): +class PlaceholderRegexRequest(BaseModel): + config: Optional[PlaceholderRegexConfigRequest] = None + input: PlaceholderRegexInput + + +class ProfanityDetectorInput(BaseModel): text: str = Field(..., examples=['This is a clean and professional message.']) -class PromptInjectionRequest(BaseModel): - prompt: str = Field(..., examples=['What is the weather like today?']) +class ProfanityDetectorRequest(BaseModel): + input: ProfanityDetectorInput + + +class PromptInjectionConfigRequest(BaseModel): threshold: Optional[float] = Field(None, examples=[0.5]) -class PromptPerplexityRequest(BaseModel): +class PromptInjectionInput(BaseModel): + prompt: str = Field(..., examples=['What is the weather like today?']) + + +class PromptInjectionRequest(BaseModel): + config: Optional[PromptInjectionConfigRequest] = None + input: PromptInjectionInput + + +class PromptPerplexityInput(BaseModel): prompt: str = Field(..., examples=['What is the capital of France?']) -class RegexValidatorRequest(BaseModel): +class PromptPerplexityRequest(BaseModel): + input: PromptPerplexityInput + + +class RegexValidatorConfigRequest(BaseModel): case_sensitive: Optional[bool] = Field(None, examples=[True]) dot_include_nl: Optional[bool] = Field(None, examples=[True]) multi_line: Optional[bool] = Field(None, examples=[True]) regex: Optional[str] = Field(None, examples=['.*']) should_match: Optional[bool] = Field(None, examples=[True]) + + +class RegexValidatorInput(BaseModel): text: str = Field(..., examples=['user@example.com']) -class SQLValidatorRequest(BaseModel): +class RegexValidatorRequest(BaseModel): + config: Optional[RegexValidatorConfigRequest] = None + input: RegexValidatorInput + + +class SQLValidatorInput(BaseModel): text: str = Field(..., examples=['SELECT * FROM users WHERE id = 1;']) -class SecretsDetectorRequest(BaseModel): +class SQLValidatorRequest(BaseModel): + input: SQLValidatorInput + + +class SecretsDetectorInput(BaseModel): text: str = Field( ..., examples=['Here is some text without any API keys or passwords.'] ) -class SemanticSimilarityRequest(BaseModel): +class SecretsDetectorRequest(BaseModel): + input: SecretsDetectorInput + + +class SemanticSimilarityInput(BaseModel): completion: str = Field(..., examples=['The cat sat on the mat.']) reference: str = Field(..., examples=['A feline was resting on the rug.']) -class SexismDetectorRequest(BaseModel): +class SemanticSimilarityRequest(BaseModel): + input: SemanticSimilarityInput + + +class SexismDetectorConfigRequest(PromptInjectionConfigRequest): + pass + + +class SexismDetectorInput(BaseModel): text: str = Field( ..., examples=['All team members should be treated equally regardless of gender.'], ) - threshold: Optional[float] = Field(None, examples=[0.5]) -class ToneDetectionRequest(BaseModel): +class SexismDetectorRequest(BaseModel): + config: Optional[SexismDetectorConfigRequest] = None + input: SexismDetectorInput + + +class ToneDetectionInput(BaseModel): text: str = Field(..., examples=['The capital of France is Paris.']) -class TopicAdherenceRequest(BaseModel): +class ToneDetectionRequest(BaseModel): + input: ToneDetectionInput + + +class TopicAdherenceInput(BaseModel): completion: str = Field( ..., examples=[ @@ -255,23 +411,47 @@ class TopicAdherenceRequest(BaseModel): ) -class ToxicityDetectorRequest(BaseModel): +class TopicAdherenceRequest(BaseModel): + input: TopicAdherenceInput + + +class ToxicityDetectorConfigRequest(PromptInjectionConfigRequest): + pass + + +class ToxicityDetectorInput(BaseModel): text: str = Field(..., examples=['Thank you for your help with this project.']) - threshold: Optional[float] = Field(None, examples=[0.5]) -class UncertaintyDetectorRequest(BaseModel): +class ToxicityDetectorRequest(BaseModel): + config: Optional[ToxicityDetectorConfigRequest] = None + input: ToxicityDetectorInput + + +class UncertaintyDetectorInput(BaseModel): prompt: str = Field( ..., examples=['I am not sure, I think the capital of France is Paris.'] ) -class WordCountRatioRequest(BaseModel): +class UncertaintyDetectorRequest(BaseModel): + input: UncertaintyDetectorInput + + +class WordCountInput(BaseModel): + text: str = Field(..., examples=['This is a sample text with several words.']) + + +class WordCountRatioInput(BaseModel): denominator_text: str = Field( ..., examples=['This is a longer input text for comparison'] ) numerator_text: str = Field(..., examples=['Short response']) +class WordCountRatioRequest(BaseModel): + input: WordCountRatioInput + + class WordCountRequest(BaseModel): - text: str = Field(..., examples=['This is a sample text with several words.']) + input: WordCountInput diff --git a/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/response.py b/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/response.py index 0b75dd715d..9d4a083bb2 100644 --- a/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/response.py +++ b/packages/traceloop-sdk/traceloop/sdk/generated/evaluators/response.py @@ -1,5 +1,5 @@ # generated by datamodel-codegen: -# filename: tmpvqz8m01b.json +# filename: tmpp6g09qxu.json from __future__ import annotations