From fafdddcf923baa94e48307d4910e3f3565af1b1c Mon Sep 17 00:00:00 2001
From: aprilk-ms <55356546+aprilk-ms@users.noreply.github.com>
Date: Fri, 2 May 2025 03:27:13 +0000
Subject: [PATCH 1/3] Add draft evaluate script

---
 evals/evaluate.py    | 167 +++++++++++++++++++++++++++++++++++++++++++
 evals/test-data.json |  10 +++
 2 files changed, 177 insertions(+)
 create mode 100644 evals/evaluate.py
 create mode 100644 evals/test-data.json

diff --git a/evals/evaluate.py b/evals/evaluate.py
new file mode 100644
index 00000000..8f505bef
--- /dev/null
+++ b/evals/evaluate.py
@@ -0,0 +1,167 @@
+from azure.ai.projects import AIProjectClient
+from azure.ai.projects.models import ConnectionType, MessageRole, RunStatus
+from azure.identity import DefaultAzureCredential
+from azure.ai.evaluation import AIAgentConverter, evaluate, ToolCallAccuracyEvaluator, IntentResolutionEvaluator, TaskAdherenceEvaluator
+
+import os
+import time
+import json
+from pathlib import Path
+from dotenv import load_dotenv
+
+def run_evaluation():
+    """Demonstrate how to evaluate an AI agent using the Azure AI Project SDK"""
+    current_dir = Path(__file__).parent
+    eval_input_path = current_dir / "eval-input.jsonl"
+    eval_output_path = current_dir / "eval-output.json"
+
+    env_path = current_dir / "../src/.env"
+    load_dotenv(dotenv_path=env_path)
+
+    # Get AI project parameters from environment variables
+    AZURE_AIPROJECT_CONNECTION_STRING = (
+        os.environ.get("AZURE_EXISTING_AIPROJECT_CONNECTION_STRING") or
+        os.environ.get("AZURE_AIPROJECT_CONNECTION_STRING")
+    )
+    AZURE_AI_AGENT_DEPLOYMENT_NAME = os.getenv("AZURE_AI_AGENT_DEPLOYMENT_NAME")
+    API_VERSION = os.getenv("API_VERSION") or ""
+    AGENT_ID = (
+        os.environ.get("AZURE_EXISTING_AGENT_ID") or
+        os.environ.get("AZURE_AI_AGENT_ID")
+    )
+
+    # Initialize the AIProjectClient and related entities
+    project_client = AIProjectClient.from_connection_string(
+        AZURE_AIPROJECT_CONNECTION_STRING,
+        credential=DefaultAzureCredential()
+    )
+    default_connection = project_client.connections.get_default(
+        connection_type=ConnectionType.AZURE_OPEN_AI, include_credentials=True
+    )
+    model_config = default_connection.to_evaluator_model_config(
+        deployment_name=AZURE_AI_AGENT_DEPLOYMENT_NAME,
+        api_version=API_VERSION,
+        include_credentials=True,
+    )
+    agent = project_client.agents.get_agent(AGENT_ID)
+    thread_data_converter = AIAgentConverter(project_client)
+
+    # Read data input file
+    with open(current_dir / "test-data.json", "r", encoding="utf-8") as f:
+        test_data = json.load(f)
+
+    # Execute the test data against the agent and prepare the evaluation input
+    with open(eval_input_path, "w", encoding="utf-8") as f:
+
+        for row in test_data:
+            # Create a new thread for each query to isolate conversations
+            thread = project_client.agents.create_thread()
+
+            # Send the user query
+            project_client.agents.create_message(
+                thread.id, role=MessageRole.USER, content=row.get("query")
+            )
+
+            # Run the agent and measure performance
+            start_time = time.time()
+            run = project_client.agents.create_and_process_run(
+                thread_id=thread.id, agent_id=agent.id
+            )
+            end_time = time.time()
+
+            if run.status != RunStatus.COMPLETED:
+                raise ValueError(run.last_error or "Run failed to complete")
+
+            metrics = {
+                "server-run-duration-in-seconds": (
+                    run.completed_at - run.created_at
+                ).total_seconds(),
+                "client-run-duration-in-seconds": end_time - start_time,
+                "completion-tokens": run.usage.completion_tokens,
+                "prompt-tokens": run.usage.prompt_tokens,
+                "ground-truth": row.get("ground-truth", "")
row.get("ground-truth", '') + } + + # Add thread data + operational metrics to the evaluation input + evaluation_data = thread_data_converter.prepare_evaluation_data(thread_ids=thread.id) + eval_item = evaluation_data[0] + eval_item["metrics"] = metrics + f.write(json.dumps(eval_item) + "\n") + + + # Now, run a sample set of evaluators using the evaluation input + # See https://learn.microsoft.com/en-us/azure/ai-foundry/how-to/develop/agent-evaluate-sdk + # for the full list of evaluators availalbe + tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=model_config) + intent_resolution = IntentResolutionEvaluator(model_config=model_config) + task_adherence = TaskAdherenceEvaluator(model_config=model_config) + results = evaluate( + data=eval_input_path, + evaluators={ + "tool_call_accuracy": tool_call_accuracy, + "intent_resolution": intent_resolution, + "task_adherence": task_adherence, + "operational_metrics": OperationalMetricsEvaluator(), + }, + #azure_ai_project=project_client.scope, # uncomment to upload result to AI Foundry + output_path=eval_output_path + ) + + # Print the evaluation results + print_eval_results(results, eval_input_path, eval_output_path) + + return results + + +class OperationalMetricsEvaluator: + """Propagate operational metrics to the final evaluation results""" + def __init__(self): + pass + def __call__(self, *, metrics: dict, **kwargs): + return metrics + + +def print_eval_results(results, input_path, output_path): + """Print the evaluation results in a formatted table""" + metrics = results.get("metrics", {}) + + # Get the maximum length for formatting + key_len = max(len(key) for key in metrics.keys()) + 5 + value_len = 20 + full_len = key_len + value_len + 5 + + # Format the header + print("\n" + "=" * full_len) + print("Evaluation Results".center(full_len)) + print("=" * full_len) + + # Print each metric + print(f"{'Metric':<{key_len}} | {'Value'}") + print("-" * (key_len) + "-+-" + "-" * value_len) + + for key, value in metrics.items(): + if isinstance(value, float): + formatted_value = f"{value:.2f}" + else: + formatted_value = str(value) + + print(f"{key:<{key_len}} | {formatted_value}") + + print("=" * full_len + "\n") + + # Print additional information + print(f"Evaluation input: {input_path}") + print(f"Evaluation output: {output_path}") + if "studio_url" in results: + print(f"AI Foundry URL: {results['studio_url']}") + + print("\n" + "=" * full_len + "\n") + + +if __name__ == "__main__": + try: + run_evaluation() + except Exception as e: + print(f"Error during evaluation: {e}") + + diff --git a/evals/test-data.json b/evals/test-data.json new file mode 100644 index 00000000..f3d73eab --- /dev/null +++ b/evals/test-data.json @@ -0,0 +1,10 @@ +[ + { + "query": "What is the capital of France?", + "ground-truth": "The capital of France is Paris." + }, + { + "query": "What is the capital of Japan?", + "ground-truth": "The capital of Japan is Tokyo." 
+    }
+]
\ No newline at end of file

From 96d912350078c37202d32b14e5cf482ce6f0e3bc Mon Sep 17 00:00:00 2001
From: aprilk-ms <55356546+aprilk-ms@users.noreply.github.com>
Date: Fri, 2 May 2025 18:58:21 +0000
Subject: [PATCH 2/3] Add eval GitHub workflow

---
 .github/workflows/ai-evaluation.yml | 33 +++++++++++++++++++++++++++++
 azure.yaml                          |  3 ++-
 2 files changed, 35 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/ai-evaluation.yml

diff --git a/.github/workflows/ai-evaluation.yml b/.github/workflows/ai-evaluation.yml
new file mode 100644
index 00000000..2e7501e0
--- /dev/null
+++ b/.github/workflows/ai-evaluation.yml
@@ -0,0 +1,33 @@
+name: "AI Agent Evaluation"
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  run-action:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Azure login using Federated Credentials
+        uses: azure/login@v2
+        with:
+          client-id: ${{ vars.AZURE_CLIENT_ID }}
+          tenant-id: ${{ vars.AZURE_TENANT_ID }}
+          subscription-id: ${{ vars.AZURE_SUBSCRIPTION_ID }}
+
+      - name: Run Evaluation
+        uses: microsoft/ai-agent-evals@v1-beta
+        with:
+          azure-aiproject-connection-string: ${{ vars.AZURE_EXISTING_AIPROJECT_CONNECTION_STRING || vars.AZURE_AIPROJECT_CONNECTION_STRING }}
+          deployment-name: ${{ vars.AZURE_AI_AGENT_DEPLOYMENT_NAME }}
+          agent-ids: ${{ vars.AZURE_EXISTING_AGENT_ID || vars.AZURE_AI_AGENT_ID }}
+          data-path: ${{ github.workspace }}/evals/test-data.json
\ No newline at end of file
diff --git a/azure.yaml b/azure.yaml
index 91e8ebc7..910c3a8c 100644
--- a/azure.yaml
+++ b/azure.yaml
@@ -50,4 +50,5 @@ pipeline:
     - AZURE_AI_EMBED_MODEL_VERSION
     - AZURE_AI_EMBED_DIMENSIONS
    - AZURE_AI_SEARCH_INDEX_NAME
-    - AZURE_EXISTING_AIPROJECT_CONNECTION_STRING
\ No newline at end of file
+    - AZURE_EXISTING_AIPROJECT_CONNECTION_STRING
+    - AZURE_EXISTING_AGENT_ID
\ No newline at end of file

From 96f6fe0068a03d8ab1854dcffaa5c5f8c0970c76 Mon Sep 17 00:00:00 2001
From: aprilk-ms <55356546+aprilk-ms@users.noreply.github.com>
Date: Sat, 3 May 2025 00:53:10 +0000
Subject: [PATCH 3/3] Dataset updates

---
 .github/workflows/ai-evaluation.yml |  2 +-
 evals/eval-queries.json             | 14 ++++++++++++++
 evals/evaluate.py                   | 10 +++++-----
 evals/test-data-workflow.json       | 20 ++++++++++++++++++++
 evals/test-data.json                | 10 ----------
 5 files changed, 40 insertions(+), 16 deletions(-)
 create mode 100644 evals/eval-queries.json
 create mode 100644 evals/test-data-workflow.json
 delete mode 100644 evals/test-data.json

diff --git a/.github/workflows/ai-evaluation.yml b/.github/workflows/ai-evaluation.yml
index 2e7501e0..d1193b16 100644
--- a/.github/workflows/ai-evaluation.yml
+++ b/.github/workflows/ai-evaluation.yml
@@ -30,4 +30,4 @@ jobs:
           azure-aiproject-connection-string: ${{ vars.AZURE_EXISTING_AIPROJECT_CONNECTION_STRING || vars.AZURE_AIPROJECT_CONNECTION_STRING }}
           deployment-name: ${{ vars.AZURE_AI_AGENT_DEPLOYMENT_NAME }}
           agent-ids: ${{ vars.AZURE_EXISTING_AGENT_ID || vars.AZURE_AI_AGENT_ID }}
-          data-path: ${{ github.workspace }}/evals/test-data.json
\ No newline at end of file
+          data-path: ${{ github.workspace }}/evals/test-data-workflow.json
\ No newline at end of file
diff --git a/evals/eval-queries.json b/evals/eval-queries.json
new file mode 100644
index 00000000..0e21eb0a
--- /dev/null
+++ b/evals/eval-queries.json
@@ -0,0 +1,14 @@
+[
+    {
+        "query": "What features do the SmartView Glasses have?",
+        "ground-truth": "The SmartView Glasses (product item 1) feature Augmented Reality interface, Voice-controlled AI assistant, HD video recording with 3D audio, UV protection and blue light filtering, and Wireless charging with extended battery life."
+    },
+    {
+        "query": "How long is the warranty on the SmartView Glasses?",
+        "ground-truth": "The SmartView Glasses come with a two-year limited warranty on all electronic components."
+    },
+    {
+        "query": "How do I clean the BaseCamp Folding Table?",
+        "ground-truth": "To clean the BaseCamp Folding Table, simply wipe the aluminum surface with a damp cloth and mild detergent, then air dry. Avoid using abrasive cleaners or solvents that may damage the table."
+    }
+]
\ No newline at end of file
diff --git a/evals/evaluate.py b/evals/evaluate.py
index 8f505bef..efc0f69b 100644
--- a/evals/evaluate.py
+++ b/evals/evaluate.py
@@ -12,6 +12,7 @@ def run_evaluation():
     """Demonstrate how to evaluate an AI agent using the Azure AI Project SDK"""
     current_dir = Path(__file__).parent
+    eval_queries_path = current_dir / "eval-queries.json"
     eval_input_path = current_dir / "eval-input.jsonl"
     eval_output_path = current_dir / "eval-output.json"
 
     env_path = current_dir / "../src/.env"
     load_dotenv(dotenv_path=env_path)
@@ -47,7 +48,7 @@ def run_evaluation():
     thread_data_converter = AIAgentConverter(project_client)
 
     # Read data input file
-    with open(current_dir / "test-data.json", "r", encoding="utf-8") as f:
+    with open(eval_queries_path, "r", encoding="utf-8") as f:
         test_data = json.load(f)
 
     # Execute the test data against the agent and prepare the evaluation input
@@ -103,8 +104,8 @@ def run_evaluation():
             "task_adherence": task_adherence,
             "operational_metrics": OperationalMetricsEvaluator(),
         },
-        #azure_ai_project=project_client.scope, # uncomment to upload results to AI Foundry
-        output_path=eval_output_path
+        output_path=eval_output_path, # raw evaluation results
+        azure_ai_project=project_client.scope, # needed only if you want results uploaded to AI Foundry
     )
 
     # Print the evaluation results
@@ -112,7 +113,6 @@ def run_evaluation():
 
     return results
 
-
 class OperationalMetricsEvaluator:
     """Propagate operational metrics to the final evaluation results"""
     def __init__(self):
@@ -152,7 +152,7 @@ def print_eval_results(results, input_path, output_path):
     # Print additional information
     print(f"Evaluation input: {input_path}")
     print(f"Evaluation output: {output_path}")
-    if "studio_url" in results:
+    if results.get("studio_url") is not None:
         print(f"AI Foundry URL: {results['studio_url']}")
 
     print("\n" + "=" * full_len + "\n")
diff --git a/evals/test-data-workflow.json b/evals/test-data-workflow.json
new file mode 100644
index 00000000..209cd463
--- /dev/null
+++ b/evals/test-data-workflow.json
@@ -0,0 +1,20 @@
+{
+    "name": "test-dataset",
+    "evaluators": [
+        "IntentResolutionEvaluator",
+        "TaskAdherenceEvaluator",
+        "ContentSafetyEvaluator"
+    ],
+    "data":
+    [
+        {
+            "query": "What features do the SmartView Glasses have?"
+        },
+        {
+            "query": "How long is the warranty on the SmartView Glasses?"
+        },
+        {
+            "query": "How do I clean the BaseCamp Folding Table?"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/evals/test-data.json b/evals/test-data.json
deleted file mode 100644
index f3d73eab..00000000
--- a/evals/test-data.json
+++ /dev/null
@@ -1,10 +0,0 @@
-[
-    {
-        "query": "What is the capital of France?",
-        "ground-truth": "The capital of France is Paris."
-    },
-    {
-        "query": "What is the capital of Japan?",
-        "ground-truth": "The capital of Japan is Tokyo."
-    }
-]
\ No newline at end of file
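
A note on the custom evaluator wiring in evaluate.py: evaluate() invokes each
entry in the evaluators dict once per line of eval-input.jsonl, passing that
row's columns to the evaluator as keyword arguments matched by parameter name.
That is why OperationalMetricsEvaluator declares a keyword-only "metrics"
parameter: it receives the operational-metrics column written during the agent
runs and returns it unchanged, so those numbers surface alongside the quality
scores in the final results. A minimal sketch of that calling convention,
using a hypothetical row rather than real converter output:

    class OperationalMetricsEvaluator:
        """Pass-through evaluator, as defined in evals/evaluate.py."""
        def __call__(self, *, metrics: dict, **kwargs):
            return metrics

    # Hypothetical row: real rows come from AIAgentConverter plus the
    # "metrics" column that evaluate.py adds for each thread.
    row = {"metrics": {"completion-tokens": 120, "prompt-tokens": 815}}

    print(OperationalMetricsEvaluator()(**row))
    # -> {'completion-tokens': 120, 'prompt-tokens': 815}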