35 changes: 18 additions & 17 deletions rag_experiment_accelerator/evaluation/llm_based_metrics.py
@@ -128,25 +128,26 @@ def llm_context_recall(
         double: The context recall score generated between the ground truth (expected) and context.
     """
     context = "\n".join(retrieved_contexts)
-    prompt = (
-        "\nquestion: "
-        + question
-        + "\ncontext: "
-        + context
-        + "\nanswer: "
-        + groundtruth_answer
-    )
-    result = response_generator.generate_response(
-        sys_message=llm_context_recall_instruction,
-        prompt=prompt,
+
+    result: list | None = response_generator.generate_response(
+        llm_context_recall_instruction,
+        context=context,
+        question=question,
+        answer=groundtruth_answer,
     )
-    good_response = '"Attributed": "1"'
-    bad_response = '"Attributed": "0"'
+    good_responses = 0
 
-    return (
-        result.count(good_response)
-        / (result.count(good_response) + result.count(bad_response))
-    ) * 100
+    for response in result:
+        try:
+            score = response.get("attributed", 0)
+            good_responses += int(score)
+        except ValueError:
+            logger.warning(f"Unable to parse {score} as int.")
+    if not result:
+        return -1
+    else:
+        return (good_responses / len(result)) * 100
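The rewritten metric expects generate_response to return a parsed list of per-statement verdicts rather than a raw string to be substring-counted, and it scores recall as the percentage of statements attributed to the context. Below is a minimal sketch of that scoring step in isolation, assuming the verdict shape used in the updated test; the helper name is hypothetical. Note that in the merged code the for loop runs before the "if not result" guard, so a None result would raise TypeError; the sketch checks first.

import logging

logger = logging.getLogger(__name__)


def score_context_recall(verdicts: list | None) -> float:
    """Percentage of statements attributed to the context, or -1 if no verdicts."""
    if not verdicts:  # guard placed before iteration; covers both None and []
        return -1
    good_responses = 0
    for verdict in verdicts:
        attributed = verdict.get("attributed", 0)
        try:
            # "attributed" arrives as the string "0" or "1" in the test fixtures
            good_responses += int(attributed)
        except ValueError:
            logger.warning(f"Unable to parse {attributed} as int.")
    return (good_responses / len(verdicts)) * 100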


@@ -46,9 +46,28 @@ def test_llm_context_precision(mock_generate_response):

 @patch("rag_experiment_accelerator.evaluation.eval.ResponseGenerator")
 def test_llm_context_recall(mock_generate_response):
-    mock_generate_response.generate_response.return_value = (
-        '"Attributed": "1" "Attributed": "1" "Attributed": "1" "Attributed": "0"'
-    )
+    mock_generate_response.generate_response.return_value = [
+        {
+            "statement_1": "Test statement 1",
+            "reason": "The statement is in the context",
+            "attributed": "1",
+        },
+        {
+            "statement_2": "Test statement 2",
+            "reason": "The statement is in the context",
+            "attributed": "1",
+        },
+        {
+            "statement_3": "Test statement 3",
+            "reason": "The statement is in the context",
+            "attributed": "0",
+        },
+        {
+            "statement_4": "Test statement 4",
+            "reason": "The statement is in the context",
+            "attributed": "1",
+        },
+    ]
     question = "What is the name of the largest bone in the human body?"
     context = 'According to the Cleveland Clinic, "The femur is the largest and strongest bone in the human body. It can support as much as 30 times the weight of your body. The average adult male femur is 48 cm (18.9 in) in length and 2.34 cm (0.92 in) in diameter. The average weight among adult males in the United States is 196 lbs (872 N). Therefore, the adult male femur can support roughly 6,000 lbs of compressive force."'
     answer = "The largest bone in the human body is the femur, also known as the thigh bone. It is about 19.4 inches (49.5 cm) long on average and can support up to 30 times the weight of a person’s body."
9 changes: 4 additions & 5 deletions rag_experiment_accelerator/llm/prompt/ragas_prompts.py
@@ -16,11 +16,10 @@ def validate_context_recall(text: str) -> bool:

     def is_valid_entry(entry):
         statement_key_pattern = re.compile(r"^statement_\d+$")
-        for key in entry.keys():
-            if key not in ["reason", "attributed"] or not statement_key_pattern.match(
-                key
-            ):
-                return False
+        return all(
+            key in ["reason", "attributed"] or statement_key_pattern.match(key)
+            for key in entry.keys()
+        )
 
     return isinstance(json_text, list) and all(
         is_valid_entry(entry) for entry in json_text
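The refactor fixes the predicate's logic, not just its shape: the old loop returned False whenever a key failed either test, so "reason" and "attributed" failed the statement pattern, statement keys failed the membership test, and every non-empty entry was rejected. The new all(...) accepts a key that passes either test. A standalone sketch, with illustrative sanity checks against the key shapes used in the updated test:

import re


def is_valid_entry(entry: dict) -> bool:
    # A key is valid if it is "reason", "attributed", or "statement_<n>".
    statement_key_pattern = re.compile(r"^statement_\d+$")
    return all(
        key in ["reason", "attributed"] or statement_key_pattern.match(key)
        for key in entry.keys()
    )


assert is_valid_entry({"statement_1": "s", "reason": "r", "attributed": "1"})
assert not is_valid_entry({"statement": "no index", "attributed": "1"})
assert not is_valid_entry({"verdict": "1"})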