From fafdddcf923baa94e48307d4910e3f3565af1b1c Mon Sep 17 00:00:00 2001
From: aprilk-ms <55356546+aprilk-ms@users.noreply.github.com>
Date: Fri, 2 May 2025 03:27:13 +0000
Subject: [PATCH 1/3] Add draft evaluate script

---
 evals/evaluate.py    | 167 +++++++++++++++++++++++++++++++++++++++++++
 evals/test-data.json |  10 +++
 2 files changed, 177 insertions(+)
 create mode 100644 evals/evaluate.py
 create mode 100644 evals/test-data.json

diff --git a/evals/evaluate.py b/evals/evaluate.py
new file mode 100644
index 00000000..8f505bef
--- /dev/null
+++ b/evals/evaluate.py
@@ -0,0 +1,167 @@
+from azure.ai.projects import AIProjectClient
+from azure.ai.projects.models import ConnectionType, MessageRole, RunStatus
+from azure.identity import DefaultAzureCredential
+from azure.ai.evaluation import AIAgentConverter, evaluate, ToolCallAccuracyEvaluator, IntentResolutionEvaluator, TaskAdherenceEvaluator
+
+import os
+import time
+import json
+from pathlib import Path
+from dotenv import load_dotenv
+
+def run_evaluation():
+    """Demonstrate how to evaluate an AI agent using the Azure AI Project SDK"""
+    current_dir = Path(__file__).parent
+    eval_input_path = current_dir / "eval-input.jsonl"
+    eval_output_path = current_dir / "eval-output.json"
+
+    env_path = current_dir / "../src/.env"
+    load_dotenv(dotenv_path=env_path)
+
+    # Get AI project parameters from environment variables
+    AZURE_AIPROJECT_CONNECTION_STRING = (
+        os.environ.get("AZURE_EXISTING_AIPROJECT_CONNECTION_STRING") or
+        os.environ.get("AZURE_AIPROJECT_CONNECTION_STRING")
+    )
+    AZURE_AI_AGENT_DEPLOYMENT_NAME = os.getenv("AZURE_AI_AGENT_DEPLOYMENT_NAME")
+    API_VERSION = os.getenv("API_VERSION") or ""
+    AGENT_ID = (
+        os.environ.get("AZURE_EXISTING_AGENT_ID") or
+        os.environ.get("AZURE_AI_AGENT_ID")
+    )
+
+    # Initialize the AIProjectClient and related entities
+    project_client = AIProjectClient.from_connection_string(
+        AZURE_AIPROJECT_CONNECTION_STRING,
+        credential=DefaultAzureCredential()
+    )
+    default_connection = project_client.connections.get_default(
+        connection_type=ConnectionType.AZURE_OPEN_AI, include_credentials=True
+    )
+    model_config = default_connection.to_evaluator_model_config(
+        deployment_name=AZURE_AI_AGENT_DEPLOYMENT_NAME,
+        api_version=API_VERSION,
+        include_credentials=True,
+    )
+    agent = project_client.agents.get_agent(AGENT_ID)
+    thread_data_converter = AIAgentConverter(project_client)
+
+    # Read data input file
+    with open(current_dir / "test-data.json", "r", encoding="utf-8") as f:
+        test_data = json.load(f)
+
+    # Execute the test data against the agent and prepare the evaluation input
+    with open(eval_input_path, "w", encoding="utf-8") as f:
+
+        for row in test_data:
+            # Create a new thread for each query to isolate conversations
+            thread = project_client.agents.create_thread()
+
+            # Send the user query
+            project_client.agents.create_message(
+                thread.id, role=MessageRole.USER, content=row.get("query")
+            )
+
+            # Run the agent and measure performance
+            start_time = time.time()
+            run = project_client.agents.create_and_process_run(
+                thread_id=thread.id, agent_id=agent.id
+            )
+            end_time = time.time()
+
+            if run.status != RunStatus.COMPLETED:
+                raise ValueError(run.last_error or "Run failed to complete")
+
+            metrics = {
+                "server-run-duration-in-seconds": (
+                    run.completed_at - run.created_at
+                ).total_seconds(),
+                "client-run-duration-in-seconds": end_time - start_time,
+                "completion-tokens": run.usage.completion_tokens,
+                "prompt-tokens": run.usage.prompt_tokens,
+                "ground-truth": row.get("ground-truth", "")
row.get("ground-truth", '') + } + + # Add thread data + operational metrics to the evaluation input + evaluation_data = thread_data_converter.prepare_evaluation_data(thread_ids=thread.id) + eval_item = evaluation_data[0] + eval_item["metrics"] = metrics + f.write(json.dumps(eval_item) + "\n") + + + # Now, run a sample set of evaluators using the evaluation input + # See https://learn.microsoft.com/en-us/azure/ai-foundry/how-to/develop/agent-evaluate-sdk + # for the full list of evaluators availalbe + tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=model_config) + intent_resolution = IntentResolutionEvaluator(model_config=model_config) + task_adherence = TaskAdherenceEvaluator(model_config=model_config) + results = evaluate( + data=eval_input_path, + evaluators={ + "tool_call_accuracy": tool_call_accuracy, + "intent_resolution": intent_resolution, + "task_adherence": task_adherence, + "operational_metrics": OperationalMetricsEvaluator(), + }, + #azure_ai_project=project_client.scope, # uncomment to upload result to AI Foundry + output_path=eval_output_path + ) + + # Print the evaluation results + print_eval_results(results, eval_input_path, eval_output_path) + + return results + + +class OperationalMetricsEvaluator: + """Propagate operational metrics to the final evaluation results""" + def __init__(self): + pass + def __call__(self, *, metrics: dict, **kwargs): + return metrics + + +def print_eval_results(results, input_path, output_path): + """Print the evaluation results in a formatted table""" + metrics = results.get("metrics", {}) + + # Get the maximum length for formatting + key_len = max(len(key) for key in metrics.keys()) + 5 + value_len = 20 + full_len = key_len + value_len + 5 + + # Format the header + print("\n" + "=" * full_len) + print("Evaluation Results".center(full_len)) + print("=" * full_len) + + # Print each metric + print(f"{'Metric':<{key_len}} | {'Value'}") + print("-" * (key_len) + "-+-" + "-" * value_len) + + for key, value in metrics.items(): + if isinstance(value, float): + formatted_value = f"{value:.2f}" + else: + formatted_value = str(value) + + print(f"{key:<{key_len}} | {formatted_value}") + + print("=" * full_len + "\n") + + # Print additional information + print(f"Evaluation input: {input_path}") + print(f"Evaluation output: {output_path}") + if "studio_url" in results: + print(f"AI Foundry URL: {results['studio_url']}") + + print("\n" + "=" * full_len + "\n") + + +if __name__ == "__main__": + try: + run_evaluation() + except Exception as e: + print(f"Error during evaluation: {e}") + + diff --git a/evals/test-data.json b/evals/test-data.json new file mode 100644 index 00000000..f3d73eab --- /dev/null +++ b/evals/test-data.json @@ -0,0 +1,10 @@ +[ + { + "query": "What is the capital of France?", + "ground-truth": "The capital of France is Paris." + }, + { + "query": "What is the capital of Japan?", + "ground-truth": "The capital of Japan is Tokyo." 
+    }
+]
\ No newline at end of file

From 96d912350078c37202d32b14e5cf482ce6f0e3bc Mon Sep 17 00:00:00 2001
From: aprilk-ms <55356546+aprilk-ms@users.noreply.github.com>
Date: Fri, 2 May 2025 18:58:21 +0000
Subject: [PATCH 2/3] Add eval GitHub workflow

---
 .github/workflows/ai-evaluation.yml | 33 +++++++++++++++++++++++++++++
 azure.yaml                          |  3 ++-
 2 files changed, 35 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/ai-evaluation.yml

diff --git a/.github/workflows/ai-evaluation.yml b/.github/workflows/ai-evaluation.yml
new file mode 100644
index 00000000..2e7501e0
--- /dev/null
+++ b/.github/workflows/ai-evaluation.yml
@@ -0,0 +1,33 @@
+name: "AI Agent Evaluation"
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  run-action:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Azure login using Federated Credentials
+        uses: azure/login@v2
+        with:
+          client-id: ${{ vars.AZURE_CLIENT_ID }}
+          tenant-id: ${{ vars.AZURE_TENANT_ID }}
+          subscription-id: ${{ vars.AZURE_SUBSCRIPTION_ID }}
+
+      - name: Run Evaluation
+        uses: microsoft/ai-agent-evals@v1-beta
+        with:
+          azure-aiproject-connection-string: ${{ vars.AZURE_EXISTING_AIPROJECT_CONNECTION_STRING || vars.AZURE_AIPROJECT_CONNECTION_STRING }}
+          deployment-name: ${{ vars.AZURE_AI_AGENT_DEPLOYMENT_NAME }}
+          agent-ids: ${{ vars.AZURE_EXISTING_AGENT_ID || vars.AZURE_AI_AGENT_ID }}
+          data-path: ${{ github.workspace }}/evals/test-data.json
\ No newline at end of file
diff --git a/azure.yaml b/azure.yaml
index 91e8ebc7..910c3a8c 100644
--- a/azure.yaml
+++ b/azure.yaml
@@ -50,4 +50,5 @@ pipeline:
     - AZURE_AI_EMBED_MODEL_VERSION
     - AZURE_AI_EMBED_DIMENSIONS
    - AZURE_AI_SEARCH_INDEX_NAME
-    - AZURE_EXISTING_AIPROJECT_CONNECTION_STRING
\ No newline at end of file
+    - AZURE_EXISTING_AIPROJECT_CONNECTION_STRING
+    - AZURE_EXISTING_AGENT_ID
\ No newline at end of file

From 96f6fe0068a03d8ab1854dcffaa5c5f8c0970c76 Mon Sep 17 00:00:00 2001
From: aprilk-ms <55356546+aprilk-ms@users.noreply.github.com>
Date: Sat, 3 May 2025 00:53:10 +0000
Subject: [PATCH 3/3] Dataset updates

---
 .github/workflows/ai-evaluation.yml |  2 +-
 evals/eval-queries.json             | 14 ++++++++++++++
 evals/evaluate.py                   | 10 +++++-----
 evals/test-data-workflow.json       | 20 ++++++++++++++++++++
 evals/test-data.json                | 10 ----------
 5 files changed, 40 insertions(+), 16 deletions(-)
 create mode 100644 evals/eval-queries.json
 create mode 100644 evals/test-data-workflow.json
 delete mode 100644 evals/test-data.json

diff --git a/.github/workflows/ai-evaluation.yml b/.github/workflows/ai-evaluation.yml
index 2e7501e0..d1193b16 100644
--- a/.github/workflows/ai-evaluation.yml
+++ b/.github/workflows/ai-evaluation.yml
@@ -30,4 +30,4 @@ jobs:
           azure-aiproject-connection-string: ${{ vars.AZURE_EXISTING_AIPROJECT_CONNECTION_STRING || vars.AZURE_AIPROJECT_CONNECTION_STRING }}
           deployment-name: ${{ vars.AZURE_AI_AGENT_DEPLOYMENT_NAME }}
           agent-ids: ${{ vars.AZURE_EXISTING_AGENT_ID || vars.AZURE_AI_AGENT_ID }}
-          data-path: ${{ github.workspace }}/evals/test-data.json
\ No newline at end of file
+          data-path: ${{ github.workspace }}/evals/test-data-workflow.json
\ No newline at end of file
diff --git a/evals/eval-queries.json b/evals/eval-queries.json
new file mode 100644
index 00000000..0e21eb0a
--- /dev/null
+++ b/evals/eval-queries.json
@@ -0,0 +1,14 @@
+[
+    {
+        "query": "What features do the SmartView Glasses have?",
+        "ground-truth": "The SmartView Glasses (product item 1) feature Augmented Reality interface, Voice-controlled AI assistant, HD video recording with 3D audio, UV protection and blue light filtering, and Wireless charging with extended battery life."
+    },
+    {
+        "query": "How long is the warranty on the SmartView Glasses?",
+        "ground-truth": "The SmartView Glasses come with a two-year limited warranty on all electronic components."
+    },
+    {
+        "query": "How do I clean the BaseCamp Folding Table?",
+        "ground-truth": "To clean the BaseCamp Folding Table, simply wipe the aluminum surface with a damp cloth and mild detergent, then air dry. Avoid using abrasive cleaners or solvents that may damage the table."
+    }
+]
\ No newline at end of file
diff --git a/evals/evaluate.py b/evals/evaluate.py
index 8f505bef..efc0f69b 100644
--- a/evals/evaluate.py
+++ b/evals/evaluate.py
@@ -12,6 +12,7 @@ def run_evaluation():
     """Demonstrate how to evaluate an AI agent using the Azure AI Project SDK"""
     current_dir = Path(__file__).parent
+    eval_queries_path = current_dir / "eval-queries.json"
     eval_input_path = current_dir / "eval-input.jsonl"
     eval_output_path = current_dir / "eval-output.json"
 
     env_path = current_dir / "../src/.env"
     load_dotenv(dotenv_path=env_path)
@@ -47,7 +48,7 @@ def run_evaluation():
     thread_data_converter = AIAgentConverter(project_client)
 
     # Read data input file
-    with open(current_dir / "test-data.json", "r", encoding="utf-8") as f:
+    with open(eval_queries_path, "r", encoding="utf-8") as f:
         test_data = json.load(f)
 
     # Execute the test data against the agent and prepare the evaluation input
@@ -103,8 +104,8 @@ def run_evaluation():
             "task_adherence": task_adherence,
             "operational_metrics": OperationalMetricsEvaluator(),
         },
-        #azure_ai_project=project_client.scope, # uncomment to upload results to AI Foundry
-        output_path=eval_output_path
+        output_path=eval_output_path, # raw evaluation results
+        azure_ai_project=project_client.scope, # needed only if you want results uploaded to AI Foundry
     )
 
     # Print the evaluation results
@@ -112,7 +113,6 @@ def run_evaluation():
 
     return results
 
-
 class OperationalMetricsEvaluator:
     """Propagate operational metrics to the final evaluation results"""
     def __init__(self):
@@ -152,7 +152,7 @@ def print_eval_results(results, input_path, output_path):
     # Print additional information
     print(f"Evaluation input: {input_path}")
     print(f"Evaluation output: {output_path}")
-    if "studio_url" in results:
+    if results.get("studio_url") is not None:
         print(f"AI Foundry URL: {results['studio_url']}")
 
     print("\n" + "=" * full_len + "\n")
diff --git a/evals/test-data-workflow.json b/evals/test-data-workflow.json
new file mode 100644
index 00000000..209cd463
--- /dev/null
+++ b/evals/test-data-workflow.json
@@ -0,0 +1,20 @@
+{
+    "name": "test-dataset",
+    "evaluators": [
+        "IntentResolutionEvaluator",
+        "TaskAdherenceEvaluator",
+        "ContentSafetyEvaluator"
+    ],
+    "data":
+    [
+        {
+            "query": "What features do the SmartView Glasses have?"
+        },
+        {
+            "query": "How long is the warranty on the SmartView Glasses?"
+        },
+        {
+            "query": "How do I clean the BaseCamp Folding Table?"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/evals/test-data.json b/evals/test-data.json
deleted file mode 100644
index f3d73eab..00000000
--- a/evals/test-data.json
+++ /dev/null
@@ -1,10 +0,0 @@
-[
-    {
-        "query": "What is the capital of France?",
-        "ground-truth": "The capital of France is Paris."
-    },
-    {
-        "query": "What is the capital of Japan?",
-        "ground-truth": "The capital of Japan is Tokyo."
-    }
-]
\ No newline at end of file
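
A note on the custom evaluator wiring in evaluate.py: evaluate() invokes each
entry in the evaluators dict once per line of eval-input.jsonl, passing that
row's columns to the evaluator as keyword arguments matched by parameter name.
That is why OperationalMetricsEvaluator declares a keyword-only "metrics"
parameter: it receives the operational-metrics column written during the agent
runs and returns it unchanged, so those numbers surface alongside the quality
scores in the final results. A minimal sketch of that calling convention,
using a hypothetical row rather than real converter output:

    class OperationalMetricsEvaluator:
        """Pass-through evaluator, as defined in evals/evaluate.py."""
        def __call__(self, *, metrics: dict, **kwargs):
            return metrics

    # Hypothetical row: real rows come from AIAgentConverter plus the
    # "metrics" column that evaluate.py adds for each thread.
    row = {"metrics": {"completion-tokens": 120, "prompt-tokens": 815}}

    print(OperationalMetricsEvaluator()(**row))
    # -> {'completion-tokens': 120, 'prompt-tokens': 815}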