diff --git a/rag_experiment_accelerator/evaluation/llm_based_metrics.py b/rag_experiment_accelerator/evaluation/llm_based_metrics.py
index 4d722f2e..809a728e 100644
--- a/rag_experiment_accelerator/evaluation/llm_based_metrics.py
+++ b/rag_experiment_accelerator/evaluation/llm_based_metrics.py
@@ -128,25 +128,26 @@ def llm_context_recall(
         double: The context recall score generated between the ground truth (expected) and context.
     """
     context = "\n".join(retrieved_contexts)
-    prompt = (
-        "\nquestion: "
-        + question
-        + "\ncontext: "
-        + context
-        + "\nanswer: "
-        + groundtruth_answer
-    )
-    result = response_generator.generate_response(
-        sys_message=llm_context_recall_instruction,
-        prompt=prompt,
+
+    result: list | None = response_generator.generate_response(
+        llm_context_recall_instruction,
+        context=context,
+        question=question,
+        answer=groundtruth_answer,
     )
-    good_response = '"Attributed": "1"'
-    bad_response = '"Attributed": "0"'
-    return (
-        result.count(good_response)
-        / (result.count(good_response) + result.count(bad_response))
-    ) * 100
+    if not result:
+        return -1
+
+    good_responses = 0
+    for response in result:
+        try:
+            score = response.get("attributed", 0)
+            good_responses += int(score)
+        except ValueError:
+            logger.warning(f"Unable to parse {score} as int.")
+
+    return (good_responses / len(result)) * 100
 
 
 def compute_llm_based_score(
diff --git a/rag_experiment_accelerator/evaluation/tests/test_llm_based_metrics.py b/rag_experiment_accelerator/evaluation/tests/test_llm_based_metrics.py
index cd7eb3b9..f7240fc4 100644
--- a/rag_experiment_accelerator/evaluation/tests/test_llm_based_metrics.py
+++ b/rag_experiment_accelerator/evaluation/tests/test_llm_based_metrics.py
@@ -46,9 +46,28 @@ def test_llm_context_precision(mock_generate_response):
 
 @patch("rag_experiment_accelerator.evaluation.eval.ResponseGenerator")
 def test_llm_context_recall(mock_generate_response):
-    mock_generate_response.generate_response.return_value = (
-        '"Attributed": "1" "Attributed": "1" "Attributed": "1" "Attributed": "0"'
-    )
+    mock_generate_response.generate_response.return_value = [
+        {
+            "statement_1": "Test statement 1",
+            "reason": "The statement is in the context",
+            "attributed": "1",
+        },
+        {
+            "statement_2": "Test statement 2",
+            "reason": "The statement is in the context",
+            "attributed": "1",
+        },
+        {
+            "statement_3": "Test statement 3",
+            "reason": "The statement is in the context",
+            "attributed": "0",
+        },
+        {
+            "statement_4": "Test statement 4",
+            "reason": "The statement is in the context",
+            "attributed": "1",
+        },
+    ]
     question = "What is the name of the largest bone in the human body?"
     context = 'According to the Cleveland Clinic, "The femur is the largest and strongest bone in the human body. It can support as much as 30 times the weight of your body. The average adult male femur is 48 cm (18.9 in) in length and 2.34 cm (0.92 in) in diameter. The average weight among adult males in the United States is 196 lbs (872 N). Therefore, the adult male femur can support roughly 6,000 lbs of compressive force."'
     answer = "The largest bone in the human body is the femur, also known as the thigh bone. It is about 19.4 inches (49.5 cm) long on average and can support up to 30 times the weight of a person’s body."
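Reviewer note: a minimal standalone sketch of the reworked scoring path, runnable outside the repo. The helper name score_context_recall is hypothetical, used only for illustration; the fixture mirrors the updated test. With three of four statements attributed, the score is 3/4 * 100 = 75.0, and an empty or None generation yields the -1 sentinel.

# Hypothetical sketch, not part of this PR: behaviour of the new
# scoring loop in llm_context_recall, applied to the test fixture.
import logging

logger = logging.getLogger(__name__)

def score_context_recall(result: list | None) -> float:
    # Sentinel for a failed or empty generation, checked before iterating
    # so a None result cannot raise a TypeError.
    if not result:
        return -1
    good_responses = 0
    for response in result:
        try:
            # "attributed" arrives from the LLM as the string "0" or "1".
            good_responses += int(response.get("attributed", 0))
        except ValueError:
            logger.warning("Unable to parse attributed value as int.")
    return (good_responses / len(result)) * 100

responses = [
    {"statement_1": "Test statement 1", "attributed": "1"},
    {"statement_2": "Test statement 2", "attributed": "1"},
    {"statement_3": "Test statement 3", "attributed": "0"},
    {"statement_4": "Test statement 4", "attributed": "1"},
]
assert score_context_recall(responses) == 75.0  # 3 of 4 statements attributed
assert score_context_recall(None) == -1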
diff --git a/rag_experiment_accelerator/llm/prompt/ragas_prompts.py b/rag_experiment_accelerator/llm/prompt/ragas_prompts.py
index 0bf64a18..8e6a8438 100644
--- a/rag_experiment_accelerator/llm/prompt/ragas_prompts.py
+++ b/rag_experiment_accelerator/llm/prompt/ragas_prompts.py
@@ -16,11 +16,10 @@ def validate_context_recall(text: str) -> bool:
 
     def is_valid_entry(entry):
         statement_key_pattern = re.compile(r"^statement_\d+$")
-        for key in entry.keys():
-            if key not in ["reason", "attributed"] or not statement_key_pattern.match(
-                key
-            ):
-                return False
+        return all(
+            key in ["reason", "attributed"] or statement_key_pattern.match(key)
+            for key in entry.keys()
+        )
 
     return isinstance(json_text, list) and all(
         is_valid_entry(entry) for entry in json_text
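Reviewer note: this hunk fixes a logic bug, not just style. The old loop rejected any key that was not simultaneously in ["reason", "attributed"] and a match for statement_\d+, a condition no key can satisfy, and it fell off the end of the loop without returning True. A standalone sketch of the new predicate, runnable on its own:

# Illustrative sketch of the new is_valid_entry predicate: an entry may
# mix "reason", "attributed", and "statement_<n>" keys; anything else
# makes the entry invalid.
import re

statement_key_pattern = re.compile(r"^statement_\d+$")

def is_valid_entry(entry: dict) -> bool:
    return all(
        key in ["reason", "attributed"] or statement_key_pattern.match(key)
        for key in entry.keys()
    )

assert is_valid_entry({"statement_1": "...", "reason": "...", "attributed": "1"})
assert not is_valid_entry({"statement_one": "..."})  # bad key: no digit suffix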