From 73bc6aeb3027709e26f538a1aa6d8d5c7638d7ad Mon Sep 17 00:00:00 2001 From: Mauricio DIAZ Date: Mon, 27 Jan 2025 08:59:21 +0100 Subject: [PATCH 1/5] Add custom task (bac-fr) for evaluation of models in french --- community_tasks/french_evals.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/community_tasks/french_evals.py b/community_tasks/french_evals.py index 7fda8dc7a..134fa63c2 100644 --- a/community_tasks/french_evals.py +++ b/community_tasks/french_evals.py @@ -80,6 +80,18 @@ def prompt_gpqa_fr(line, task_name: str = None): ) +# BAC-fr prompt function +def prompt_bac_fr(line, task_name: str = None): + return Doc( + task_name=task_name, + query=line["prompt"], + choices=[""], + gold_index=0, + instruction="", + specific={"instruction": line["instruction"], "enonce": line["enonce"]}, + ) + + # IFEVal-fr task @@ -117,5 +129,23 @@ def prompt_gpqa_fr(line, task_name: str = None): version=0, ) +# BAC-fr task +bac_fr_task = LightevalTaskConfig( + name="bac-fr", + suite=["community"], + prompt_function=prompt_bac_fr, + hf_repo="fr-gouv-coordination-ia/bac-fr", + hf_subset="default", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=[], # To be defined + stop_sequence=["\n"], + trust_dataset=True, + version=0, +) + # STORE YOUR EVALS -TASKS_TABLE = [ifeval_fr_task, gpqa_fr_task] +TASKS_TABLE = [ifeval_fr_task, gpqa_fr_task, bac_fr_task] From c592de7cd387af0cd34f22186ab1a23da8152840 Mon Sep 17 00:00:00 2001 From: mdiazmel Date: Mon, 27 Jan 2025 13:52:53 +0100 Subject: [PATCH 2/5] Update prompt function for the bac-fr task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- community_tasks/french_evals.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/community_tasks/french_evals.py b/community_tasks/french_evals.py index 134fa63c2..3e8dd9777 100644 --- a/community_tasks/french_evals.py +++ b/community_tasks/french_evals.py @@ -82,14 +82,24 @@ def prompt_gpqa_fr(line, task_name: str = None): # BAC-fr prompt function def prompt_bac_fr(line, task_name: str = None): - return Doc( - task_name=task_name, - query=line["prompt"], - choices=[""], - gold_index=0, - instruction="", - specific={"instruction": line["instruction"], "enonce": line["enonce"]}, - ) + prompt = f"Enoncé: {line['enonce"]}\n{line['instruction'}\n" + if line["Choix"] is not None: # Multichoice evaluation + prompt += "\n".join([f"{LETTER_INDICES[ix]}.{choix}" for ix, choix in enumerate(line["choix"])]) + return Doc( + task_name = task_name, + query = prompt, + choices = aslist(line["Choix"]) + gold_index = line["Choix"].index(line["Choix correct"]) + instruction = "" + ) + else: + return Doc( + task_name = task_name, + query = prompt, + choices = [line["reponse"]] + gold_index = 0 + instruction = "" + ) # IFEVal-fr task From 6b4de76473849e1780c463212979d534b4e4375c Mon Sep 17 00:00:00 2001 From: Mauricio DIAZ Date: Tue, 28 Jan 2025 21:27:21 +0100 Subject: [PATCH 3/5] Add metrics for the evaluation of bac-fr --- community_tasks/french_evals.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/community_tasks/french_evals.py b/community_tasks/french_evals.py index 3e8dd9777..f9758e589 100644 --- a/community_tasks/french_evals.py +++ b/community_tasks/french_evals.py @@ -82,24 +82,18 @@ def prompt_gpqa_fr(line, task_name: str = None): # BAC-fr prompt function def prompt_bac_fr(line, task_name: str = None): - prompt = f"Enoncé: {line['enonce"]}\n{line['instruction'}\n" - if line["Choix"] is not None: # Multichoice evaluation + prompt = f"Enoncé: {line['enonce']}\n{line['instruction']}\n" + if line["Choix"] is not None: # Multichoice evaluation prompt += "\n".join([f"{LETTER_INDICES[ix]}.{choix}" for ix, choix in enumerate(line["choix"])]) return Doc( - task_name = task_name, - query = prompt, - choices = aslist(line["Choix"]) - gold_index = line["Choix"].index(line["Choix correct"]) - instruction = "" - ) - else: - return Doc( - task_name = task_name, - query = prompt, - choices = [line["reponse"]] - gold_index = 0 - instruction = "" + task_name=task_name, + query=prompt, + choices=list(line["Choix"]), + gold_index=line["Choix"].index(line["Choix correct"]), + instruction="", ) + else: + return Doc(task_name=task_name, query=prompt, choices=[line["reponse"]], gold_index=0, instruction="") # IFEVal-fr task @@ -151,7 +145,7 @@ def prompt_bac_fr(line, task_name: str = None): few_shots_split=None, few_shots_select="random_sampling", generation_size=1, - metric=[], # To be defined + metric=[Metrics.quasi_exact_match_math, Metrics.exact_match], stop_sequence=["\n"], trust_dataset=True, version=0, From c791903be02bcd0275ae2e2fc65900b085457edf Mon Sep 17 00:00:00 2001 From: Mauricio DIAZ Date: Wed, 29 Jan 2025 09:58:17 +0100 Subject: [PATCH 4/5] Fix function name (as_list) --- community_tasks/french_evals.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/community_tasks/french_evals.py b/community_tasks/french_evals.py index f9758e589..52bfc24e9 100644 --- a/community_tasks/french_evals.py +++ b/community_tasks/french_evals.py @@ -46,6 +46,7 @@ from lighteval.tasks.extended.ifeval.main import ifeval_metrics from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc +from lighteval.utils.utils import as_list # Ifeval-fr prompt function @@ -88,7 +89,7 @@ def prompt_bac_fr(line, task_name: str = None): return Doc( task_name=task_name, query=prompt, - choices=list(line["Choix"]), + choices=as_list(line["Choix"]), gold_index=line["Choix"].index(line["Choix correct"]), instruction="", ) From fa123e890ded64c97b46b595afc06f4dc1f5f204 Mon Sep 17 00:00:00 2001 From: Mauricio DIAZ Date: Wed, 29 Jan 2025 17:39:10 +0100 Subject: [PATCH 5/5] Fix prompt for multichoice --- community_tasks/french_evals.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/community_tasks/french_evals.py b/community_tasks/french_evals.py index 52bfc24e9..80699f07e 100644 --- a/community_tasks/french_evals.py +++ b/community_tasks/french_evals.py @@ -84,13 +84,13 @@ def prompt_gpqa_fr(line, task_name: str = None): # BAC-fr prompt function def prompt_bac_fr(line, task_name: str = None): prompt = f"Enoncé: {line['enonce']}\n{line['instruction']}\n" - if line["Choix"] is not None: # Multichoice evaluation - prompt += "\n".join([f"{LETTER_INDICES[ix]}.{choix}" for ix, choix in enumerate(line["choix"])]) + if line["choix"] is not None: # Multichoice evaluation + # prompt += "\n".join([f"{LETTER_INDICES[ix]}.{choix}" for ix, choix in enumerate(line["choix"])]) return Doc( task_name=task_name, query=prompt, - choices=as_list(line["Choix"]), - gold_index=line["Choix"].index(line["Choix correct"]), + choices=as_list(line["choix"]), + gold_index=line["choix"].index(line["choix correct"]), instruction="", ) else: