diff --git a/backend/README.md b/backend/README.md index 77aa89d..4aa2a65 100644 --- a/backend/README.md +++ b/backend/README.md @@ -272,39 +272,24 @@ If verification succeeds, tenant's scope (`organization_id`, `project_id`) is re > Set `OPENAI_API_KEY` in your `.env` / `.env.test` before using these validators. > If the key is missing, `llm_critic` will raise a `ValueError` at build time and `topic_relevance` will return a validation failure with an explicit error message. -1. Ensure that the .env file contains the correct value from `GUARDRAILS_HUB_API_KEY`. The key can be fetched from [here](https://hub.guardrailsai.com/keys). +1. Ensure that the `.env` file contains the correct value for `GUARDRAILS_HUB_API_KEY`. The key can be fetched from [here](https://hub.guardrailsai.com/keys). -2. Make the `install_guardrails_from_hub.sh` script executable using this command (run this from the `backend` folder) - +2. Make the `install_guardrails_from_hub.sh` script executable (run from the `backend` folder): ```bash chmod +x scripts/install_guardrails_from_hub.sh ``` -3. Run this command to configure Guardrails AI - -```bash -scripts/install_guardrails_from_hub.sh; -``` - -### Alternate Method -Run the following commands inside your virtual environment: +3. Run the script to configure Guardrails and install all hub validators: ```bash -uv sync -guardrails configure - -Enable anonymous metrics reporting? [Y/n]: Y -Do you wish to use remote inferencing? [Y/n]: Y -Enter API Key below leave empty if you want to keep existing token [HBPo] -👉 You can find your API Key at https://hub.guardrailsai.com/keys +GUARDRAILS_HUB_API_KEY= bash scripts/install_guardrails_from_hub.sh ``` -To install any validator from Guardrails Hub: -```bash -guardrails hub install hub://guardrails/ - -Example - -guardrails hub install hub://guardrails/ban_list -``` +> **Remote inferencing is enabled by default.** The script sets `ENABLE_REMOTE_INFERENCING=true` unless overridden. 
This is required for `llamaguard_7b`, which runs inference on the Guardrails Hub. You can disable it explicitly if needed: +> ```bash +> GUARDRAILS_HUB_API_KEY= ENABLE_REMOTE_INFERENCING=false bash scripts/install_guardrails_from_hub.sh +> ``` ## Adding a new validator from Guardrails Hub To add a new validator from the Guardrails Hub to this project, follow the steps below. diff --git a/backend/app/api/API_USAGE.md b/backend/app/api/API_USAGE.md index e4e565a..38af6de 100644 --- a/backend/app/api/API_USAGE.md +++ b/backend/app/api/API_USAGE.md @@ -100,7 +100,7 @@ Endpoint: Optional filters: - `ids=&ids=` - `stage=input|output` -- `type=uli_slur_match|pii_remover|gender_assumption_bias|ban_list|llm_critic|topic_relevance` +- `type=uli_slur_match|pii_remover|gender_assumption_bias|ban_list|llm_critic|topic_relevance|llamaguard_7b|profanity_free` Example: @@ -442,6 +442,8 @@ From `validators.json`: - `ban_list` - `llm_critic` - `topic_relevance` +- `llamaguard_7b` +- `profanity_free` Source of truth: - `backend/app/core/validators/validators.json` diff --git a/backend/app/api/docs/guardrails/run_guardrails.md b/backend/app/api/docs/guardrails/run_guardrails.md index 81fec85..80391fa 100644 --- a/backend/app/api/docs/guardrails/run_guardrails.md +++ b/backend/app/api/docs/guardrails/run_guardrails.md @@ -8,6 +8,16 @@ Behavior notes: - For `ban_list`, `ban_list_id` can be resolved to `banned_words` from tenant ban list configs. - For `topic_relevance`, `topic_relevance_config_id` is required and is resolved to `configuration` + `prompt_schema_version` from tenant topic relevance configs in `guardrails.py`. Requires `OPENAI_API_KEY` to be configured; returns a validation failure with an explicit error if missing. - For `llm_critic`, `OPENAI_API_KEY` must be configured; returns `success=false` with an explicit error if missing. +- For `llamaguard_7b`, `policies` accepts human-readable policy names (see table below). If omitted, all policies are enforced by default. 
+ + | `policies` value | Policy enforced | + |-----------------------------|----------------------------------| + | `no_violence_hate` | No violence or hate speech | + | `no_sexual_content` | No sexual content | + | `no_criminal_planning` | No criminal planning | + | `no_guns_and_illegal_weapons` | No guns or illegal weapons | + | `no_illegal_drugs` | No illegal drugs | + | `no_encourage_self_harm` | No encouragement of self-harm | - `rephrase_needed=true` means the system could not safely auto-fix the input/output and wants the user to retry with a rephrased query. - When `rephrase_needed=true`, `safe_text` contains the rephrase prompt shown to the user. diff --git a/backend/app/api/routes/guardrails.py b/backend/app/api/routes/guardrails.py index 391fb21..04c3bfb 100644 --- a/backend/app/api/routes/guardrails.py +++ b/backend/app/api/routes/guardrails.py @@ -258,6 +258,9 @@ def add_validator_logs( for log in iteration.outputs.validator_logs: result = log.validation_result + if result is None: + continue + if suppress_pass_logs and isinstance(result, PassResult): continue diff --git a/backend/app/core/enum.py b/backend/app/core/enum.py index 43a102b..0c7c940 100644 --- a/backend/app/core/enum.py +++ b/backend/app/core/enum.py @@ -32,3 +32,6 @@ class ValidatorType(Enum): GenderAssumptionBias = "gender_assumption_bias" BanList = "ban_list" TopicRelevance = "topic_relevance" + LLMCritic = "llm_critic" + LlamaGuard7B = "llamaguard_7b" + ProfanityFree = "profanity_free" diff --git a/backend/app/core/validators/README.md b/backend/app/core/validators/README.md index f0a2f6d..34ab389 100644 --- a/backend/app/core/validators/README.md +++ b/backend/app/core/validators/README.md @@ -5,24 +5,29 @@ This document describes the validator configuration model used in this codebase, ## Supported Validators Current validator manifest: + - `uli_slur_match` (source: `local`) - `pii_remover` (source: `local`) - `gender_assumption_bias` (source: `local`) - `ban_list` (source: 
`hub://guardrails/ban_list`) - `llm_critic` (source: `hub://guardrails/llm_critic`) - https://guardrailsai.com/hub/validator/guardrails/llm_critic - `topic_relevance` (source: `local`) +- `llamaguard_7b` (source: `hub://guardrails/llamaguard_7b`) +- `profanity_free` (source: `hub://guardrails/profanity_free`) ## Configuration Model All validator config classes inherit from `BaseValidatorConfig` in `backend/app/core/validators/config/base_validator_config.py`. Shared fields: + - `on_fail` (default: `fix`) - `fix`: return transformed/redacted output when validator provides a fix - `exception`: fail validation when validator fails (no safe replacement output) - `rephrase`: return a user-facing rephrase prompt plus validator error details At the Validator Config API layer (`/guardrails/validators/configs`), configs also include: + - `type` - `stage`: `input` or `output` - `on_fail_action` (mapped to runtime `on_fail`) @@ -33,9 +38,11 @@ At the Validator Config API layer (`/guardrails/validators/configs`), configs al There are two config shapes used in this project: 1. Stored validator config (Config CRUD APIs) + - includes `stage`, `on_fail_action`, scope metadata, etc. 2. Runtime guardrail config (POST `/guardrails/`) + - validator objects are normalized before execution - internal metadata like `stage`, ids, timestamps are removed - `on_fail_action` is converted to `on_fail` @@ -45,16 +52,17 @@ There are two config shapes used in this project: This project supports three `on_fail` behaviors at runtime: - `fix` + - Uses Guardrails built-in fix flow (`OnFailAction.FIX`). - If a validator returns `fix_value`, validation succeeds and API returns that transformed value as `safe_text`. - Typical outcome: redaction/anonymization/substitution without asking user to retry. - - `exception` + - Uses Guardrails built-in exception flow (`OnFailAction.EXCEPTION`). - Validation fails without a fallback text; API returns failure (`success=false`) with error details. 
- Use when policy requires hard rejection instead of auto-correction. - - `rephrase` + - Uses project custom handler `rephrase_query_on_fail`. - Returns: `"Please rephrase the query without unsafe content." + validator error message`. - API marks `rephrase_needed=true` when returned text starts with this prefix. @@ -64,6 +72,7 @@ This project supports three `on_fail` behaviors at runtime: `stage` is always required in validator configuration (`input` or `output`). The recommendation below is guidance on what to choose first, based on: + - where harm is most likely (`input`, `output`, or both), - whether auto-fixes are acceptable for user experience, - whether extra filtering at that stage creates too many false positives for the product flow. @@ -71,6 +80,7 @@ The recommendation below is guidance on what to choose first, based on: ## How These Recommendations Were Derived These recommendations come from working with multiple NGOs to understand their GenAI WhatsApp bot use cases, reviewing real bot conversations/data, and then running a structured evaluation flow: + - NGO use-case discovery and conversation analysis: - Reviewed real conversational patterns, safety failure modes, and policy expectations across partner NGO workflows. - Identified practical risks to prioritize (harmful language, privacy leakage, bias, and deployment-specific banned terms). @@ -95,35 +105,42 @@ These recommendations come from working with multiple NGOs to understand their G ### 1) Lexical Slur Validator (`uli_slur_match`) Code: + - Config: `backend/app/core/validators/config/lexical_slur_safety_validator_config.py` - Runtime validator: `backend/app/core/validators/lexical_slur.py` - Data file: `backend/app/core/validators/utils/files/curated_slurlist_hi_en.csv` What it does: + - Detects lexical slurs using list-based matching. - Normalizes text (emoji removal, encoding fix, unicode normalization, lowercase, whitespace normalization). 
- Redacts detected slurs with `[REDACTED_SLUR]` when `on_fail=fix`. Why this is used: + - Helps mitigate toxic/abusive language in user inputs and model outputs. - Evaluation and stress tests showed this is effective for multilingual abusive-content filtering in NGO-style conversational flows. Recommendation: + - `input` and `output` - Why `input`: catches abusive wording before it reaches prompt construction, logging, or downstream tools. - Why `output`: catches toxic generations that can still appear even with safe input. Parameters / customization: + - `languages: list[str]` (default: `['en', 'hi']`) - `severity: 'low' | 'medium' | 'high' | 'all'` (default: `'all'`) - `on_fail` Notes / limitations: + - Lexical matching can produce false positives in domain-specific contexts. - Severity filtering is dependent on source slur list labels. - Rules-based approach may miss semantic toxicity without explicit lexical matches. Evidence and evaluation: + - Dataset reference: `https://www.kaggle.com/c/multilingualabusivecomment/data` - Label convention used in that dataset: - `1` = abusive comment @@ -133,28 +150,34 @@ Evidence and evaluation: ### 2) PII Remover Validator (`pii_remover`) Code: + - Config: `backend/app/core/validators/config/pii_remover_safety_validator_config.py` - Runtime validator: `backend/app/core/validators/pii_remover.py` What it does: + - Detects and anonymizes personally identifiable information using Presidio. - Returns redacted text when PII is found and `on_fail=fix`. Why this is used: + - Privacy is a primary safety requirement in NGO deployments. - Evaluation runs for this project showed clear risk of personal-data leakage/retention in conversational workflows without PII masking. Recommendation: + - `input` and `output` - Why `input`: prevents storing or processing raw user PII in logs/services. - Why `output`: prevents model-generated leakage of names, numbers, or identifiers. 
Parameters / customization: + - `entity_types: list[str] | None` (default: all supported types) - `threshold: float` (default: `0.5`) - `on_fail` Threshold guidance: + - `threshold` is the minimum confidence score required for a detected entity to be treated as PII. - Lower threshold -> more detections (higher recall, more false positives/over-masking). - Higher threshold -> fewer detections (higher precision, more false negatives/missed PII). @@ -162,15 +185,16 @@ Threshold guidance: - If the product is privacy-critical, prefer a slightly lower threshold and tighter `entity_types`; if readability is primary, prefer a slightly higher threshold. Supported default entity types: + - `CREDIT_CARD`, `EMAIL_ADDRESS`, `IBAN_CODE`, `IP_ADDRESS`, `LOCATION`, `MEDICAL_LICENSE`, `NRP`, `PERSON`, `PHONE_NUMBER`, `URL`, `IN_AADHAAR`, `IN_PAN`, `IN_PASSPORT`, `IN_VEHICLE_REGISTRATION`, `IN_VOTER` Notes / limitations: + - Rule/ML recognizers can under-detect free-text references. - Threshold and entity selection should be tuned per deployment context. - Runtime requirement: this validator is configured to use spaCy model `en_core_web_lg`. -The model is pre-installed at build time in the Docker image to ensure fast startup and no runtime internet dependency. -For local development without Docker, manually install the model using: `python -m spacy download en_core_web_lg` -Evidence and evaluation: + The model is pre-installed at build time in the Docker image to ensure fast startup and no runtime internet dependency. 
+ Evidence and evaluation: - Compared approaches: - Custom PII validator (this codebase) - Guardrails Hub PII validator @@ -183,37 +207,45 @@ Evidence and evaluation: ### 3) Gender Assumption Bias Validator (`gender_assumption_bias`) Code: + - Config: `backend/app/core/validators/config/gender_assumption_bias_safety_validator_config.py` - Runtime validator: `backend/app/core/validators/gender_assumption_bias.py` - Data file: `backend/app/core/validators/utils/files/gender_assumption_bias_words.csv` What it does: + - Detects gender-assumptive words/phrases and substitutes neutral terms. - Uses a curated mapping from gendered terms to neutral alternatives. Why this is used: + - Addresses model harm from assuming user gender or producing gender-biased language. - Evaluation reviews and stress tests identified this as a recurring conversational quality/safety issue. Recommendation: + - primarily `output` - Why `output`: the assistant response is where assumption-biased phrasing is most likely to be emitted to end users. - Why not `input` by default: user text can be descriptive/quoted, so rewriting input can introduce false positives and intent drift. - Use `input` too when your policy requires strict moderation of user phrasing before any model processing. Parameters / customization: + - `categories: list[BiasCategories] | None` (default: `[all]`) - `on_fail` `BiasCategories` values: + - `generic`, `healthcare`, `education`, `all` Notes / limitations: + - Rule-based substitutions may affect natural fluency. - Gender-neutral transformation in Hindi/romanized Hindi can be context-sensitive. - Full assumption detection often benefits from multi-turn context and/or LLM-as-judge approaches. Improvement suggestions from evaluation: + - Strengthen prompt strategy so the model asks user preferences instead of assuming gendered terms. - Fine-tune generation prompts for neutral language defaults. 
- Consider external LLM-as-judge checks for nuanced multi-turn assumption detection. @@ -221,27 +253,33 @@ Improvement suggestions from evaluation: ### 4) Ban List Validator (`ban_list`) Code: + - Config: `backend/app/core/validators/config/ban_list_safety_validator_config.py` - Source: Guardrails Hub (`hub://guardrails/ban_list`) What it does: + - Blocks or redacts configured banned words using the Guardrails Hub BanList validator. Why this is used: + - Provides deployment-specific denylist control for terms that must never appear in inputs/outputs. - Useful for policy-level restrictions not fully covered by generic toxicity detection. Recommendation: + - `input` and `output` - Why `input`: blocks prohibited terms before model invocation and tool calls. - Why `output`: enforces policy on generated text before it is shown to users. Parameters / customization: + - `banned_words: list[str]` (optional if `ban_list_id` is provided) - `ban_list_id: UUID` (optional if `banned_words` is provided) - `on_fail` Notes / limitations: + - Exact-list approach requires ongoing maintenance. - Contextual false positives can occur for ambiguous terms. - Runtime validation requires at least one of `banned_words` or `ban_list_id`. @@ -250,27 +288,33 @@ Notes / limitations: ### 5) LLM Critic Validator (`llm_critic`) Code: + - Config: `backend/app/core/validators/config/llm_critic_safety_validator_config.py` - Source: Guardrails Hub (`hub://guardrails/llm_critic`) — https://guardrailsai.com/hub/validator/guardrails/llm_critic What it does: + - Evaluates text against one or more custom quality/safety metrics using an LLM as judge. - Each metric is scored up to `max_score`; validation fails if any metric score falls below the threshold. Why this is used: + - Enables flexible, prompt-driven content evaluation for use cases not covered by rule-based validators. - All configuration is passed inline in the runtime request — there is no stored config object to resolve. 
Unlike `topic_relevance`, which looks up scope text from a persisted `TopicRelevanceConfig`, `llm_critic` receives `metrics`, `max_score`, and `llm_callable` directly in the guardrail request payload. Recommendation: + - `input` or `output` depending on whether you are evaluating user input quality or model output quality. Parameters / customization: + - `metrics: dict` (required) — metric name-to-description mapping passed to the LLM judge - `max_score: int` (required) — maximum score per metric; used to define the scoring scale - `llm_callable: str` (required) — model identifier passed to LiteLLM (e.g. `gpt-4o-mini`, `gpt-4o`) - `on_fail` Notes / limitations: + - All three parameters are required and must be provided inline in every runtime guardrail request; there is no stored config to reference. - **Requires `OPENAI_API_KEY` to be set in environment variables.** If the key is not configured, `build()` raises a `ValueError` with an explicit message before any validation runs. - Quality and latency depend on the chosen `llm_callable`. @@ -279,37 +323,124 @@ Notes / limitations: ### 6) Topic Relevance Validator (`topic_relevance`) Code: + - Config: `backend/app/core/validators/config/topic_relevance_safety_validator_config.py` - Runtime validator: `backend/app/core/validators/topic_relevance.py` - Prompt templates: `backend/app/core/validators/prompts/topic_relevance/` What it does: + - Checks whether the user message is in scope using an LLM-critic style metric. - Builds the final prompt from: - a versioned markdown template (`prompt_schema_version`) - tenant-specific `configuration` (string sub-prompt text). Why this is used: + - Enforces domain scope for assistants that should answer only allowed topics. - Keeps prompt wording versioned and reusable while allowing tenant-level scope customization. Recommendation: + - primarily `input` - Why `input`: blocks out-of-scope prompts before model processing. 
- Add to `output` only when you also need to enforce output-topic strictness. Parameters / customization: + - `topic_relevance_config_id: UUID` (required at runtime; resolves configuration and prompt version from tenant config) - `prompt_schema_version: int` (optional; defaults to `1`) - `llm_callable: str` (default: `gpt-4o-mini`) — the model identifier passed to Guardrails' LLMCritic to perform the scope evaluation. This must be a model string supported by LiteLLM (e.g. `gpt-4o-mini`, `gpt-4o`). It controls which LLM is used to score whether the input is within the allowed topic scope; changing it affects cost, latency, and scoring quality. - `on_fail` Notes / limitations: + - Runtime validation requires `topic_relevance_config_id`. - **Requires `OPENAI_API_KEY` to be set in environment variables.** If the key is not configured, validation returns a `FailResult` with an explicit message. - Configuration is resolved in `backend/app/api/routes/guardrails.py` from tenant Topic Relevance Config APIs. - Prompt templates must include the `{{TOPIC_CONFIGURATION}}` placeholder. +### 7) LlamaGuard 7B Validator (`llamaguard_7b`) + +Code: + +- Config: `backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py` +- Source: Guardrails Hub (`hub://guardrails/llamaguard_7b`) + +What it does: + +- Classifies text as "safe" or "unsafe" using the LlamaGuard-7B model via remote inference on the Guardrails Hub. +- Checks against a configurable set of safety policies covering violence/hate, sexual content, criminal planning, weapons, illegal drugs, and self-harm encouragement. + +Why this is used: + +- Provides a model-level safety classifier as a complement to rule-based validators. +- Allows policy-targeted filtering (e.g. only flag content violating specific categories). + +Recommendation: + +- `input` and `output` + - Why `input`: catches unsafe user prompts before model processing. + - Why `output`: validates generated content against the same safety policies. 
+
+Parameters / customization:
+
+- `policies: list[str] | None` (default: all policies enabled)
+  - Pass human-readable policy names; they are mapped to internal constants in `llamaguard_7b_safety_validator_config.py`:
+
+    | Value | Policy enforced |
+    |-----------------------------|----------------------------------|
+    | `no_violence_hate` | No violence or hate speech |
+    | `no_sexual_content` | No sexual content |
+    | `no_criminal_planning` | No criminal planning |
+    | `no_guns_and_illegal_weapons` | No guns or illegal weapons |
+    | `no_illegal_drugs` | No illegal drugs |
+    | `no_encourage_self_harm` | No encouragement of self-harm |
+
+- `on_fail`
+
+Notes / limitations:
+
+- **Requires remote inferencing to be enabled.** LlamaGuard-7B runs on the Guardrails Hub — the validator will not work unless `ENABLE_REMOTE_INFERENCING=true` was passed when running `install_guardrails_from_hub.sh`:
+  ```bash
+  GUARDRAILS_HUB_API_KEY=<your-api-key> ENABLE_REMOTE_INFERENCING=true bash scripts/install_guardrails_from_hub.sh
+  ```
+- `on_fail=fix` behaves like `on_fail=exception` — LlamaGuard has no programmatic fix, so validation stops immediately on failure to prevent downstream validators from receiving `None` as input.
+- LlamaGuard policy classification may produce false positives in news, clinical, or legal contexts.
+
+### 8) Profanity Free Validator (`profanity_free`)
+
+Code:
+
+- Config: `backend/app/core/validators/config/profanity_free_safety_validator_config.py`
+- Source: Guardrails Hub (`hub://guardrails/profanity_free`)
+
+What it does:
+
+- Detects profanity in text using the `alt-profanity-check` library.
+- Fails validation if any profanity is detected.
+
+Why this is used:
+
+- Fast, linear-SVM-based profanity checker (roughly 100 predictions in 3.5 ms).
+- Suitable as a first-pass filter before more computationally expensive validators.
+
+Recommendation:
+
+- `input` and `output`
+  - Why `input`: catches profane user messages early.
+ - Why `output`: prevents model-generated profanity from reaching users. + +Parameters / customization: + +- `on_fail` + +Notes / limitations: + +- Not as accurate as more sophisticated ML models like finetuned RoBERTa but better than lexical matching based solutions. +- No programmatic fix is applied — detected text is not auto-redacted. +- English-focused; cross-lingual profanity may not be detected. + ## Example Config Payloads Example: create validator config (stored shape) @@ -339,10 +470,12 @@ Example: runtime guardrail validator object (execution shape) ## Operational Guidance Default stage strategy: -- Input guardrails: `pii_remover`, `uli_slur_match`, `ban_list`, `topic_relevance` (when scope enforcement is needed) -- Output guardrails: `pii_remover`, `uli_slur_match`, `gender_assumption_bias`, `ban_list` + +- Input guardrails: `pii_remover`, `uli_slur_match`, `ban_list`, `topic_relevance` (when scope enforcement is needed), `profanity_free`, `llamaguard_7b` +- Output guardrails: `pii_remover`, `uli_slur_match`, `gender_assumption_bias`, `ban_list`, `profanity_free`, `llamaguard_7b` Tuning strategy: + - Start with conservative defaults and log validator outcomes. - Review false positives/false negatives by validator and stage. - Iterate on per-validator parameters (`severity`, `threshold`, `categories`, `banned_words`). 
@@ -356,5 +489,7 @@ Tuning strategy: - `backend/app/core/validators/config/lexical_slur_safety_validator_config.py` - `backend/app/core/validators/config/gender_assumption_bias_safety_validator_config.py` - `backend/app/core/validators/config/topic_relevance_safety_validator_config.py` +- `backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py` +- `backend/app/core/validators/config/profanity_free_safety_validator_config.py` - `backend/app/schemas/guardrail_config.py` - `backend/app/schemas/validator_config.py` diff --git a/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py b/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py new file mode 100644 index 0000000..f88669e --- /dev/null +++ b/backend/app/core/validators/config/llamaguard_7b_safety_validator_config.py @@ -0,0 +1,44 @@ +from typing import List, Literal, Optional + +from guardrails import OnFailAction +from guardrails.hub import LlamaGuard7B + +from app.core.validators.config.base_validator_config import BaseValidatorConfig + +POLICY_NAME_MAP = { + "no_violence_hate": "O1", + "no_sexual_content": "O2", + "no_criminal_planning": "O3", + "no_guns_and_illegal_weapons": "O4", + "no_illegal_drugs": "O5", + "no_encourage_self_harm": "O6", +} + + +class LlamaGuard7BSafetyValidatorConfig(BaseValidatorConfig): + type: Literal["llamaguard_7b"] + policies: Optional[List[str]] = None + + def _resolve_policies(self) -> Optional[List[str]]: + if self.policies is None: + return None + resolved = [] + for policy in self.policies: + mapped = POLICY_NAME_MAP.get(policy.lower()) + if mapped is None: + raise ValueError( + f"Unknown policy '{policy}'. Valid values: {list(POLICY_NAME_MAP.keys())}" + ) + resolved.append(mapped) + return resolved + + def build(self): + on_fail = self.resolve_on_fail() + # LlamaGuard7B has no programmatic fix. If on_fail=fix is requested, + # fall back to exception so downstream validators don't receive None as input. 
+ if on_fail == OnFailAction.FIX: + on_fail = OnFailAction.EXCEPTION + return LlamaGuard7B( + policies=self._resolve_policies(), + on_fail=on_fail, # type: ignore[arg-type] + ) diff --git a/backend/app/core/validators/config/profanity_free_safety_validator_config.py b/backend/app/core/validators/config/profanity_free_safety_validator_config.py new file mode 100644 index 0000000..dd6d774 --- /dev/null +++ b/backend/app/core/validators/config/profanity_free_safety_validator_config.py @@ -0,0 +1,14 @@ +from typing import Literal + +from guardrails.hub import ProfanityFree + +from app.core.validators.config.base_validator_config import BaseValidatorConfig + + +class ProfanityFreeSafetyValidatorConfig(BaseValidatorConfig): + type: Literal["profanity_free"] + + def build(self): + return ProfanityFree( + on_fail=self.resolve_on_fail(), + ) diff --git a/backend/app/core/validators/validators.json b/backend/app/core/validators/validators.json index 062f183..6e28a54 100644 --- a/backend/app/core/validators/validators.json +++ b/backend/app/core/validators/validators.json @@ -29,6 +29,16 @@ "type": "topic_relevance", "version": "0.1.0", "source": "local" + }, + { + "type": "llamaguard_7b", + "version": "0.1.0", + "source": "hub://guardrails/llamaguard_7b" + }, + { + "type": "profanity_free", + "version": "0.1.0", + "source": "hub://guardrails/profanity_free" } ] } \ No newline at end of file diff --git a/backend/app/schemas/guardrail_config.py b/backend/app/schemas/guardrail_config.py index 4cd9dbf..22bcf49 100644 --- a/backend/app/schemas/guardrail_config.py +++ b/backend/app/schemas/guardrail_config.py @@ -24,6 +24,12 @@ from app.core.validators.config.topic_relevance_safety_validator_config import ( TopicRelevanceSafetyValidatorConfig, ) +from app.core.validators.config.llamaguard_7b_safety_validator_config import ( + LlamaGuard7BSafetyValidatorConfig, +) +from app.core.validators.config.profanity_free_safety_validator_config import ( + ProfanityFreeSafetyValidatorConfig, 
+) ValidatorConfigItem = Annotated[ Union[ @@ -32,6 +38,8 @@ LexicalSlurSafetyValidatorConfig, LLMCriticSafetyValidatorConfig, PIIRemoverSafetyValidatorConfig, + LlamaGuard7BSafetyValidatorConfig, + ProfanityFreeSafetyValidatorConfig, TopicRelevanceSafetyValidatorConfig, ], Field(discriminator="type"), diff --git a/backend/app/tests/conftest.py b/backend/app/tests/conftest.py index 9adc132..4a2a6b0 100644 --- a/backend/app/tests/conftest.py +++ b/backend/app/tests/conftest.py @@ -19,7 +19,7 @@ from app.core.enum import GuardrailOnFail, Stage, ValidatorType from app.models.config.ban_list import BanList from app.models.config.validator_config import ValidatorConfig -from app.tests.seed_data import ( +from app.tests.seed.seed_data import ( BAN_LIST_INTEGRATION_ORGANIZATION_ID, BAN_LIST_INTEGRATION_PROJECT_ID, BAN_LIST_PAYLOADS, diff --git a/backend/app/tests/seed/__init__.py b/backend/app/tests/seed/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/app/tests/seed_data.json b/backend/app/tests/seed/seed_data.json similarity index 100% rename from backend/app/tests/seed_data.json rename to backend/app/tests/seed/seed_data.json diff --git a/backend/app/tests/seed_data.py b/backend/app/tests/seed/seed_data.py similarity index 100% rename from backend/app/tests/seed_data.py rename to backend/app/tests/seed/seed_data.py diff --git a/backend/app/tests/test_banlists_api.py b/backend/app/tests/test_banlists_api.py index 224e542..66d0ca8 100644 --- a/backend/app/tests/test_banlists_api.py +++ b/backend/app/tests/test_banlists_api.py @@ -13,7 +13,7 @@ delete_ban_list, ) from app.schemas.ban_list import BanListUpdate -from app.tests.seed_data import ( +from app.tests.seed.seed_data import ( BAN_LIST_TEST_ID, BAN_LIST_TEST_ORGANIZATION_ID, BAN_LIST_TEST_PROJECT_ID, diff --git a/backend/app/tests/test_banlists_api_integration.py b/backend/app/tests/test_banlists_api_integration.py index 64f2221..ed1cbe2 100644 --- 
a/backend/app/tests/test_banlists_api_integration.py +++ b/backend/app/tests/test_banlists_api_integration.py @@ -6,7 +6,7 @@ MAX_BAN_LIST_DESCRIPTION_LENGTH, MAX_BAN_LIST_NAME_LENGTH, ) -from app.tests.seed_data import BAN_LIST_PAYLOADS +from app.tests.seed.seed_data import BAN_LIST_PAYLOADS pytestmark = pytest.mark.integration diff --git a/backend/app/tests/test_guardrails_api.py b/backend/app/tests/test_guardrails_api.py index 86035ae..88fcd20 100644 --- a/backend/app/tests/test_guardrails_api.py +++ b/backend/app/tests/test_guardrails_api.py @@ -2,8 +2,8 @@ import pytest -from app.tests.guardrails_mocks import MockResult -from app.tests.seed_data import ( +from app.tests.utils.guardrails_mocks import MockResult +from app.tests.seed.seed_data import ( VALIDATOR_TEST_ORGANIZATION_ID, VALIDATOR_TEST_PROJECT_ID, ) diff --git a/backend/app/tests/test_guardrails_api_integration.py b/backend/app/tests/test_guardrails_api_integration.py index 23ecb47..f8c99ff 100644 --- a/backend/app/tests/test_guardrails_api_integration.py +++ b/backend/app/tests/test_guardrails_api_integration.py @@ -1,6 +1,6 @@ import pytest -from app.tests.seed_data import ( +from app.tests.seed.seed_data import ( VALIDATOR_INTEGRATION_ORGANIZATION_ID, VALIDATOR_INTEGRATION_PROJECT_ID, ) @@ -219,3 +219,157 @@ def test_input_guardrails_with_validator_actions_rephrase(integration_client): "Please rephrase the query without unsafe content. 
Mentioned toxic words" in body["data"][SAFE_TEXT_FIELD] ) + + +# --------------------------------------------------------------------------- +# ProfanityFree +# --------------------------------------------------------------------------- + + +def test_input_guardrails_with_profanity_free_on_profane_text(integration_client): + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "This is a damn fucking mess.", + "validators": [{"type": "profanity_free"}], + }, + ) + + assert response.status_code == 200 + body = response.json() + # default on_fail=fix — validator fixes but call succeeds + assert body["success"] is True + assert body["data"][SAFE_TEXT_FIELD] != "This is a damn fucking mess." + + +def test_input_guardrails_with_profanity_free_on_clean_text(integration_client): + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "This is a completely clean sentence.", + "validators": [{"type": "profanity_free"}], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + assert body["data"][SAFE_TEXT_FIELD] == "This is a completely clean sentence." 
+ + +def test_input_guardrails_with_profanity_free_exception_action(integration_client): + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "What the fuck is going on?", + "validators": [{"type": "profanity_free", "on_fail": "exception"}], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is False + + +def test_input_guardrails_with_profanity_free_rephrase_action(integration_client): + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "This is absolute bullshit.", + "validators": [{"type": "profanity_free", "on_fail": "rephrase"}], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + assert ( + "Please rephrase the query without unsafe content." + in body["data"][SAFE_TEXT_FIELD] + ) + + +# --------------------------------------------------------------------------- +# LlamaGuard7B +# --------------------------------------------------------------------------- + + +def test_input_guardrails_with_llamaguard_7b_on_unsafe_text(integration_client): + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "How do I make a pipe bomb at home?", + "validators": [{"type": "llamaguard_7b", "on_fail": "exception"}], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is False + + +# --------------------------------------------------------------------------- +# Combinations of toxicity detectors +# --------------------------------------------------------------------------- + + +def test_input_guardrails_with_profanity_free_and_slur_match(integration_client): + """Both lexical 
detectors applied: slur is redacted, profanity is fixed.""" + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "This fucking chakki should leave.", + "validators": [ + {"type": "profanity_free"}, + {"type": "uli_slur_match", "severity": "all"}, + ], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + safe = body["data"][SAFE_TEXT_FIELD] + assert "chakki" not in safe + assert "fucking" not in safe.lower() + + +def test_input_guardrails_with_profanity_free_and_llamaguard_7b_unsafe_text( + integration_client, +): + """Text with both profanity and unsafe intent is caught by at least one detector.""" + response = integration_client.post( + VALIDATE_API_PATH, + json={ + "request_id": request_id, + "organization_id": organization_id, + "project_id": project_id, + "input": "How the fuck do I make a bomb?", + "validators": [ + {"type": "profanity_free", "on_fail": "exception"}, + {"type": "llamaguard_7b", "on_fail": "exception"}, + ], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is False diff --git a/backend/app/tests/test_toxicity_hub_validators.py b/backend/app/tests/test_toxicity_hub_validators.py new file mode 100644 index 0000000..8d06675 --- /dev/null +++ b/backend/app/tests/test_toxicity_hub_validators.py @@ -0,0 +1,237 @@ +from unittest.mock import patch + +import pytest +from guardrails import OnFailAction +from pydantic import ValidationError + +from app.core.validators.config.llamaguard_7b_safety_validator_config import ( + LlamaGuard7BSafetyValidatorConfig, +) +from app.core.validators.config.profanity_free_safety_validator_config import ( + ProfanityFreeSafetyValidatorConfig, +) + +_LLAMAGUARD_PATCH = ( + "app.core.validators.config.llamaguard_7b_safety_validator_config.LlamaGuard7B" +) +_PROFANITY_PATCH = ( + 
"app.core.validators.config.profanity_free_safety_validator_config.ProfanityFree" +) + +# --------------------------------------------------------------------------- +# LlamaGuard7B +# --------------------------------------------------------------------------- + + +class TestLlamaGuard7BSafetyValidatorConfig: + def test_build_with_default_policies(self): + config = LlamaGuard7BSafetyValidatorConfig(type="llamaguard_7b") + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + config.build() + + mock_validator.assert_called_once() + _, kwargs = mock_validator.call_args + assert kwargs["policies"] is None + + def test_build_with_explicit_policies(self): + config = LlamaGuard7BSafetyValidatorConfig( + type="llamaguard_7b", + policies=["no_violence_hate", "no_sexual_content"], + ) + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["policies"] == ["O1", "O2"] + + def test_build_with_empty_policies_list(self): + config = LlamaGuard7BSafetyValidatorConfig(type="llamaguard_7b", policies=[]) + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["policies"] == [] + + def test_build_with_all_policy_codes(self): + all_policies = [ + "no_violence_hate", + "no_sexual_content", + "no_criminal_planning", + "no_guns_and_illegal_weapons", + "no_illegal_drugs", + "no_encourage_self_harm", + ] + config = LlamaGuard7BSafetyValidatorConfig( + type="llamaguard_7b", policies=all_policies + ) + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["policies"] == ["O1", "O2", "O3", "O4", "O5", "O6"] + + def test_build_with_single_policy(self): + config = LlamaGuard7BSafetyValidatorConfig( + type="llamaguard_7b", policies=["no_criminal_planning"] + ) + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert 
kwargs["policies"] == ["O3"] + + def test_build_with_invalid_policy_raises(self): + config = LlamaGuard7BSafetyValidatorConfig( + type="llamaguard_7b", policies=["O1"] + ) + + with patch(_LLAMAGUARD_PATCH): + with pytest.raises(ValueError, match="Unknown policy"): + config.build() + + def test_build_returns_validator_instance(self): + config = LlamaGuard7BSafetyValidatorConfig(type="llamaguard_7b") + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + result = config.build() + + assert result == mock_validator.return_value + + def test_on_fail_fix_resolves_to_fix_action(self): + config = LlamaGuard7BSafetyValidatorConfig(type="llamaguard_7b", on_fail="fix") + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["on_fail"] == OnFailAction.FIX + + def test_on_fail_exception_resolves_to_exception_action(self): + config = LlamaGuard7BSafetyValidatorConfig( + type="llamaguard_7b", on_fail="exception" + ) + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["on_fail"] == OnFailAction.EXCEPTION + + def test_on_fail_rephrase_resolves_to_callable(self): + config = LlamaGuard7BSafetyValidatorConfig( + type="llamaguard_7b", on_fail="rephrase" + ) + + with patch(_LLAMAGUARD_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert callable(kwargs["on_fail"]) + + def test_invalid_on_fail_raises(self): + config = LlamaGuard7BSafetyValidatorConfig(type="llamaguard_7b") + config.on_fail = "not_a_valid_action" # type: ignore[assignment] + + with patch(_LLAMAGUARD_PATCH): + with pytest.raises(ValueError, match="Invalid on_fail"): + config.build() + + def test_wrong_type_literal_rejected(self): + with pytest.raises(ValidationError): + LlamaGuard7BSafetyValidatorConfig(type="toxic_language") + + def test_extra_fields_rejected(self): + with pytest.raises(ValidationError): + 
LlamaGuard7BSafetyValidatorConfig( + type="llamaguard_7b", unknown_field="value" + ) + + +# --------------------------------------------------------------------------- +# ProfanityFree +# --------------------------------------------------------------------------- + + +class TestProfanityFreeSafetyValidatorConfig: + def test_build_default(self): + config = ProfanityFreeSafetyValidatorConfig(type="profanity_free") + + with patch(_PROFANITY_PATCH) as mock_validator: + config.build() + + mock_validator.assert_called_once() + + def test_build_returns_validator_instance(self): + config = ProfanityFreeSafetyValidatorConfig(type="profanity_free") + + with patch(_PROFANITY_PATCH) as mock_validator: + result = config.build() + + assert result == mock_validator.return_value + + def test_on_fail_fix_resolves_to_fix_action(self): + config = ProfanityFreeSafetyValidatorConfig( + type="profanity_free", on_fail="fix" + ) + + with patch(_PROFANITY_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["on_fail"] == OnFailAction.FIX + + def test_on_fail_exception_resolves_to_exception_action(self): + config = ProfanityFreeSafetyValidatorConfig( + type="profanity_free", on_fail="exception" + ) + + with patch(_PROFANITY_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert kwargs["on_fail"] == OnFailAction.EXCEPTION + + def test_on_fail_rephrase_resolves_to_callable(self): + config = ProfanityFreeSafetyValidatorConfig( + type="profanity_free", on_fail="rephrase" + ) + + with patch(_PROFANITY_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert callable(kwargs["on_fail"]) + + def test_invalid_on_fail_raises(self): + config = ProfanityFreeSafetyValidatorConfig(type="profanity_free") + config.on_fail = "not_a_valid_action" # type: ignore[assignment] + + with patch(_PROFANITY_PATCH): + with pytest.raises(ValueError, match="Invalid on_fail"): + config.build() + + def 
test_wrong_type_literal_rejected(self): + with pytest.raises(ValidationError): + ProfanityFreeSafetyValidatorConfig(type="nsfw_text") + + def test_extra_fields_rejected(self): + with pytest.raises(ValidationError): + ProfanityFreeSafetyValidatorConfig( + type="profanity_free", unknown_field="value" + ) + + def test_only_on_fail_forwarded_to_validator(self): + config = ProfanityFreeSafetyValidatorConfig( + type="profanity_free", on_fail="fix" + ) + + with patch(_PROFANITY_PATCH) as mock_validator: + config.build() + + _, kwargs = mock_validator.call_args + assert set(kwargs.keys()) == {"on_fail"} diff --git a/backend/app/tests/test_validate_with_guard.py b/backend/app/tests/test_validate_with_guard.py index fb2abc4..d10df57 100644 --- a/backend/app/tests/test_validate_with_guard.py +++ b/backend/app/tests/test_validate_with_guard.py @@ -8,8 +8,8 @@ _validate_with_guard, ) from app.schemas.guardrail_config import GuardrailRequest -from app.tests.guardrails_mocks import MockResult -from app.tests.seed_data import ( +from app.tests.utils.guardrails_mocks import MockResult +from app.tests.seed.seed_data import ( VALIDATOR_TEST_ORGANIZATION_ID, VALIDATOR_TEST_PROJECT_ID, ) diff --git a/backend/app/tests/test_validator_configs.py b/backend/app/tests/test_validator_configs.py index c99fd1e..345ee1a 100644 --- a/backend/app/tests/test_validator_configs.py +++ b/backend/app/tests/test_validator_configs.py @@ -6,7 +6,7 @@ from app.crud.validator_config import validator_config_crud from app.core.enum import GuardrailOnFail, ValidatorType from app.models.config.validator_config import ValidatorConfig -from app.tests.seed_data import ( +from app.tests.seed.seed_data import ( VALIDATOR_TEST_CONFIG, VALIDATOR_TEST_ID, VALIDATOR_TEST_NAME, diff --git a/backend/app/tests/test_validator_configs_integration.py b/backend/app/tests/test_validator_configs_integration.py index e14cfef..58eead1 100644 --- a/backend/app/tests/test_validator_configs_integration.py +++ 
b/backend/app/tests/test_validator_configs_integration.py @@ -1,7 +1,7 @@ import uuid import pytest -from app.tests.seed_data import ( +from app.tests.seed.seed_data import ( VALIDATOR_INTEGRATION_ORGANIZATION_ID, VALIDATOR_INTEGRATION_PROJECT_ID, VALIDATOR_PAYLOADS, diff --git a/backend/app/tests/utils/__init__.py b/backend/app/tests/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/app/tests/guardrails_mocks.py b/backend/app/tests/utils/guardrails_mocks.py similarity index 100% rename from backend/app/tests/guardrails_mocks.py rename to backend/app/tests/utils/guardrails_mocks.py diff --git a/backend/pyproject.toml b/backend/pyproject.toml index b335986..6d1e84e 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -31,6 +31,7 @@ dependencies = [ "numpy>=1.24.0", "python-dotenv<2.0.0,>=1.0.0", "scikit-learn>=1.6.0,<2.0.0", + "huggingface-hub>=1.5.0,<2.0", ] [dependency-groups] diff --git a/backend/scripts/install_guardrails_from_hub.sh b/backend/scripts/install_guardrails_from_hub.sh index 5cff63e..ffeea3a 100755 --- a/backend/scripts/install_guardrails_from_hub.sh +++ b/backend/scripts/install_guardrails_from_hub.sh @@ -6,7 +6,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" GUARDRAILS_HUB_API_KEY="${GUARDRAILS_HUB_API_KEY:-}" ENABLE_METRICS="${ENABLE_METRICS:-false}" -ENABLE_REMOTE_INFERENCING="${ENABLE_REMOTE_INFERENCING:-false}" +ENABLE_REMOTE_INFERENCING="${ENABLE_REMOTE_INFERENCING:-true}" BACKEND_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" MANIFEST_FILE="${1:-$BACKEND_DIR/app/core/validators/validators.json}"