35 changes: 18 additions & 17 deletions rag_experiment_accelerator/evaluation/llm_based_metrics.py
@@ -128,25 +128,26 @@ def llm_context_recall(
         double: The context recall score generated between the ground truth (expected) and context.
     """
     context = "\n".join(retrieved_contexts)
-    prompt = (
-        "\nquestion: "
-        + question
-        + "\ncontext: "
-        + context
-        + "\nanswer: "
-        + groundtruth_answer
-    )
-    result = response_generator.generate_response(
-        sys_message=llm_context_recall_instruction,
-        prompt=prompt,
+
+    result: list | None = response_generator.generate_response(
+        llm_context_recall_instruction,
+        context=context,
+        question=question,
+        answer=groundtruth_answer,
     )
-    good_response = '"Attributed": "1"'
-    bad_response = '"Attributed": "0"'
+    good_responses = 0
 
-    return (
-        result.count(good_response)
-        / (result.count(good_response) + result.count(bad_response))
-    ) * 100
+    for response in result:
+        try:
+            score = response.get("attributed", 0)
+            good_responses += int(score)
+        except ValueError:
+            logger.warning(f"Unable to parse {score} as int.")
+    if not result:
+        return -1
+    else:
+        return (good_responses / len(result)) * 100
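The rewritten metric expects generate_response to return a parsed list of per-statement verdicts rather than a raw string to be substring-counted, and it scores recall as the percentage of statements attributed to the context. Below is a minimal sketch of that scoring step in isolation, assuming the verdict shape used in the updated test; the helper name is hypothetical. Note that in the merged code the for loop runs before the "if not result" guard, so a None result would raise TypeError; the sketch checks first.

import logging

logger = logging.getLogger(__name__)


def score_context_recall(verdicts: list | None) -> float:
    """Percentage of statements attributed to the context, or -1 if no verdicts."""
    if not verdicts:  # guard placed before iteration; covers both None and []
        return -1
    good_responses = 0
    for verdict in verdicts:
        attributed = verdict.get("attributed", 0)
        try:
            # "attributed" arrives as the string "0" or "1" in the test fixtures
            good_responses += int(attributed)
        except ValueError:
            logger.warning(f"Unable to parse {attributed} as int.")
    return (good_responses / len(verdicts)) * 100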


@@ -46,9 +46,28 @@ def test_llm_context_precision(mock_generate_response):

 @patch("rag_experiment_accelerator.evaluation.eval.ResponseGenerator")
 def test_llm_context_recall(mock_generate_response):
-    mock_generate_response.generate_response.return_value = (
-        '"Attributed": "1" "Attributed": "1" "Attributed": "1" "Attributed": "0"'
-    )
+    mock_generate_response.generate_response.return_value = [
+        {
+            "statement_1": "Test statement 1",
+            "reason": "The statement is in the context",
+            "attributed": "1",
+        },
+        {
+            "statement_2": "Test statement 2",
+            "reason": "The statement is in the context",
+            "attributed": "1",
+        },
+        {
+            "statement_3": "Test statement 3",
+            "reason": "The statement is in the context",
+            "attributed": "0",
+        },
+        {
+            "statement_4": "Test statement 4",
+            "reason": "The statement is in the context",
+            "attributed": "1",
+        },
+    ]
     question = "What is the name of the largest bone in the human body?"
     context = 'According to the Cleveland Clinic, "The femur is the largest and strongest bone in the human body. It can support as much as 30 times the weight of your body. The average adult male femur is 48 cm (18.9 in) in length and 2.34 cm (0.92 in) in diameter. The average weight among adult males in the United States is 196 lbs (872 N). Therefore, the adult male femur can support roughly 6,000 lbs of compressive force."'
     answer = "The largest bone in the human body is the femur, also known as the thigh bone. It is about 19.4 inches (49.5 cm) long on average and can support up to 30 times the weight of a person’s body."
9 changes: 4 additions & 5 deletions rag_experiment_accelerator/llm/prompt/ragas_prompts.py
@@ -16,11 +16,10 @@ def validate_context_recall(text: str) -> bool:

     def is_valid_entry(entry):
         statement_key_pattern = re.compile(r"^statement_\d+$")
-        for key in entry.keys():
-            if key not in ["reason", "attributed"] or not statement_key_pattern.match(
-                key
-            ):
-                return False
+        return all(
+            key in ["reason", "attributed"] or statement_key_pattern.match(key)
+            for key in entry.keys()
+        )
 
     return isinstance(json_text, list) and all(
         is_valid_entry(entry) for entry in json_text
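The refactor fixes the predicate's logic, not just its shape: the old loop returned False whenever a key failed either test, so "reason" and "attributed" failed the statement pattern, statement keys failed the membership test, and every non-empty entry was rejected. The new all(...) accepts a key that passes either test. A standalone sketch, with illustrative sanity checks against the key shapes used in the updated test:

import re


def is_valid_entry(entry: dict) -> bool:
    # A key is valid if it is "reason", "attributed", or "statement_<n>".
    statement_key_pattern = re.compile(r"^statement_\d+$")
    return all(
        key in ["reason", "attributed"] or statement_key_pattern.match(key)
        for key in entry.keys()
    )


assert is_valid_entry({"statement_1": "s", "reason": "r", "attributed": "1"})
assert not is_valid_entry({"statement": "no index", "attributed": "1"})
assert not is_valid_entry({"verdict": "1"})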