From 9cac83c32ac7c69eac260af623b310ba49ecc06b Mon Sep 17 00:00:00 2001
From: Abhinav kumar <126642111+Abhinavcode13@users.noreply.github.com>
Date: Tue, 30 Jul 2024 01:15:22 +0530
Subject: [PATCH] feat: add MT-Bench operator

---
 tests/test_operators.py               | 27 ++++
 uptrain/operators/__init__.pyi        |  1 +
 uptrain/operators/language/mtbench.py | 92 +++++++++++++++++++++++++++
 3 files changed, 120 insertions(+)
 create mode 100644 uptrain/operators/language/mtbench.py

diff --git a/tests/test_operators.py b/tests/test_operators.py
index 61a5e7330..f947de4dc 100644
--- a/tests/test_operators.py
+++ b/tests/test_operators.py
@@ -391,3 +391,30 @@ def test_text_comparison_operator():
 
     # Print the comparison results
     print(comparison)
+
+def test_mtbench_operator():
+    import polars as pl
+    from uptrain.operators import MTBenchScore
+
+    # Create a DataFrame
+    df = pl.DataFrame(
+        {
+            "dialogue_generated": [
+                ["Hi", "Hello"],
+                ["How are you?", "I'm fine, thanks."]
+            ],
+            "dialogue_source": [
+                ["Hello", "Hi"],
+                ["How do you do?", "I'm good, thank you."]
+            ],
+        }
+    )
+
+    # Create an instance of the MTBenchScore class
+    mtbench_op = MTBenchScore()
+
+    # Calculate the MT-Bench scores
+    scores = mtbench_op.run(df)["output"]
+
+    # Print the MT-Bench scores
+    print(scores)
diff --git a/uptrain/operators/__init__.pyi b/uptrain/operators/__init__.pyi
index ea84f3d06..07dc09705 100644
--- a/uptrain/operators/__init__.pyi
+++ b/uptrain/operators/__init__.pyi
@@ -134,6 +134,7 @@ from .chart import (
 )
 
 from . import language
+from .language.mtbench import MTBenchScore
 from .language.grammar import GrammarScore
 from .language.openai_evals import OpenaiEval, PromptEval
 from .language.rouge import RougeScore
diff --git a/uptrain/operators/language/mtbench.py b/uptrain/operators/language/mtbench.py
new file mode 100644
index 000000000..5ff572ac2
--- /dev/null
+++ b/uptrain/operators/language/mtbench.py
@@ -0,0 +1,92 @@
+"""
+Implement checks to evaluate multi-turn dialogues using MT-Bench.
+
+This module provides the `MTBenchScore` class, which compares generated multi-turn dialogues with source dialogues to evaluate their alignment using the MT-Bench score metric.
+
+"""
+
+from __future__ import annotations
+import typing as t
+
+import polars as pl
+from uptrain.framework import Settings
+
+if t.TYPE_CHECKING:
+    from uptrain.framework import Settings
+from uptrain.operators.base import *
+from uptrain.utilities import lazy_load_dep
+
+# Hypothetical function to compute the MT-Bench score
+def compute_mtbench_score(generated_dialogue: list[str], source_dialogue: list[str]) -> float:
+    # Example logic: fraction of turns that exactly match the source turn
+    # Replace this with the actual MT-Bench scoring logic
+    score = sum(1 for gen, src in zip(generated_dialogue, source_dialogue) if gen == src) / len(source_dialogue)
+    return score
+
+@register_op
+class MTBenchScore(ColumnOp):
+    """
+    Operator to compare generated multi-turn dialogues with source dialogues using the MT-Bench score metric.
+
+    Attributes:
+        col_in_generated (str): The name of the input column containing the generated dialogues.
+        col_in_source (str): The name of the input column containing the source dialogues.
+        col_out (str): The name of the output column containing the MT-Bench scores.
+
+    Returns:
+        dict: A dictionary whose "output" key holds the input DataFrame with the MT-Bench score column added.
+
+    Example:
+        ```
+        import polars as pl
+        from uptrain.operators import MTBenchScore
+
+        # Create a DataFrame
+        df = pl.DataFrame({
+            "dialogue_generated": [["Hi", "Hello"], ["How are you?", "I'm fine, thanks."]],
+            "dialogue_source": [["Hello", "Hi"], ["How do you do?", "I'm good, thank you."]]
+        })
+
+        # Create an instance of the MTBenchScore class
+        mtbench_op = MTBenchScore()
+
+        # Calculate the MT-Bench scores
+        scores = mtbench_op.run(df)["output"]
+
+        # Print the MT-Bench scores
+        print(scores["mtbench_score"])
+        ```
+
+    Output:
+        ```
+        shape: (2,)
+        Series: 'mtbench_score' [f64]
+        [
+            0.0
+            0.0
+        ]
+        ```
+
+    """
+
+    col_in_generated: str = "dialogue_generated"
+    col_in_source: str = "dialogue_source"
+    col_out: str = "mtbench_score"
+
+    def setup(self, settings: Settings):
+        return self
+
+    def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
+        dialogues_generated = data.get_column(self.col_in_generated).to_list()  # candidate/preds
+        dialogues_source = data.get_column(self.col_in_source).to_list()  # reference/target
+
+        scores = []
+        for generated, source in zip(dialogues_generated, dialogues_source):
+            if not generated or not source:  # missing or empty dialogue
+                scores.append(0.0)
+            else:
+                score = compute_mtbench_score(generated, source)
+                scores.append(score)
+
+        results = pl.Series(scores, name=self.col_out)
+        return {"output": data.with_columns([results])}
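
Note: `compute_mtbench_score` in this patch is explicitly a placeholder (exact-match ratio over turns), not the LLM-as-judge protocol that MT-Bench actually uses. As one possible stand-in until real judging is wired in, a slightly less brittle turn-level comparison could be built on Python's standard-library `difflib`. The sketch below is an illustration under that assumption and is not part of the patch.

```
# Sketch only: a fuzzier stand-in for the placeholder compute_mtbench_score.
# This is NOT the real MT-Bench (which scores dialogues with an LLM judge);
# it uses only the standard library, so the operator stays dependency-free.
from __future__ import annotations

from difflib import SequenceMatcher


def compute_mtbench_score(generated_dialogue: list[str], source_dialogue: list[str]) -> float:
    """Average per-turn string similarity between generated and source turns."""
    if not generated_dialogue or not source_dialogue:
        return 0.0
    ratios = [
        SequenceMatcher(None, gen, src).ratio()
        for gen, src in zip(generated_dialogue, source_dialogue)
    ]
    # Divide by the source length so missing turns lower the score.
    return sum(ratios) / len(source_dialogue)


# e.g. compute_mtbench_score(["Hi", "Hello"], ["Hello", "Hi"]) ≈ 0.29
```

Swapping in a function like this would not change the operator's interface: `MTBenchScore.run` calls `compute_mtbench_score` per row either way, so only the scoring logic differs.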