From b93925fd038a068a64739a46517d706f85b43bbf Mon Sep 17 00:00:00 2001
From: Maria Rosario Mestre
Date: Mon, 22 Jul 2024 14:38:38 +0100
Subject: [PATCH] notebook to get ground-truth relevant documents

---
 notebooks/get-ground-truth-data.ipynb | 1229 +++++++++++++++++++++++++
 1 file changed, 1229 insertions(+)
 create mode 100644 notebooks/get-ground-truth-data.ipynb

diff --git a/notebooks/get-ground-truth-data.ipynb b/notebooks/get-ground-truth-data.ipynb
new file mode 100644
index 0000000..c70f63d
--- /dev/null
+++ b/notebooks/get-ground-truth-data.ipynb
@@ -0,0 +1,1229 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "52691fa1-445c-4929-8357-b25059409799",
+   "metadata": {},
+   "source": [
+    "In this notebook, we will generate a dataset of ground-truth documents to evaluate retrievers, starting from pairs of ground-truth questions and answers. Specifically, we will compare the following 3 retrieval techniques:\n",
+    "- a retriever based on embedding vector similarity, using a sentence transformer model from HuggingFace: the [msmarco model](https://huggingface.co/sentence-transformers/msmarco-distilroberta-base-v2),\n",
+    "- the same approach but with a different model: the [mnet model](https://huggingface.co/sentence-transformers/all-mpnet-base-v2),\n",
+    "- a BM25 keyword-based [retriever](https://docs.haystack.deepset.ai/docs/inmemorybm25retriever).\n",
+    "\n",
+    "To generate the dataset of ground-truth documents, we follow these steps:\n",
+    "- we first load the dataset (we use the ARAGOG dataset from this [repository](https://github.com/deepset-ai/haystack-evaluation)),\n",
+    "- we then index the files from the dataset using the 2 embedding models above,\n",
+    "- we define 3 Haystack retriever pipelines for the 3 retrieval techniques above,\n",
+    "- we run the pipelines using _both_ the query and the ground-truth answer to generate a list of candidate documents,\n",
+    "- we then use an LLM to evaluate each candidate document with respect to the question and answer and assign it a label of full match, partial match or no match,\n",
+    "- we do some processing of the data and save it.",
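+    "\n",
+    "For reference, each record in the final dataset pairs a question and its ground-truth answer with the candidate documents and the label assigned by the LLM. The exact fields can be seen in the outputs further down; roughly, a record looks like this (values shortened here for readability):\n",
+    "\n",
+    "```python\n",
+    "example_record = {\n",
+    "    'question': 'What model sizes are reported for BERT, and what are their specifications?',\n",
+    "    'answer': 'BERTBASE (L=12, H=768, A=12, Total Parameters=110M) and BERTLARGE (L=24, H=1024, A=16, Total Parameters=340M).',\n",
+    "    'filename': 'bert.pdf',\n",
+    "    'docs': [\n",
+    "        {\n",
+    "            'content': 'We primarily report results on two model sizes: ...',\n",
+    "            'score': 194.82,\n",
+    "            'method': 'embedding msmarco',\n",
+    "            'LLM_judgment': 'full',\n",
+    "        },\n",
+    "    ],\n",
+    "}\n",
+    "```"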
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a54822f9-bcd8-4182-a813-8919fe6dacaa", + "metadata": {}, + "outputs": [], + "source": [ + "from getpass import getpass\n", + "from IPython.display import display\n", + "import json\n", + "import os\n", + "import pandas as pd\n", + "from pathlib import Path\n", + "import sys\n", + "from typing import Any, Dict, List, Optional\n", + "\n", + "from haystack import component, Pipeline, Document\n", + "from haystack.components.builders import PromptBuilder, AnswerBuilder\n", + "from haystack.components.converters import PyPDFToDocument\n", + "from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder\n", + "from haystack.components.evaluators import SASEvaluator\n", + "from haystack.components.evaluators.llm_evaluator import LLMEvaluator\n", + "from haystack.components.generators import OpenAIGenerator\n", + "from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter\n", + "from haystack.components.retrievers import InMemoryEmbeddingRetriever, InMemoryBM25Retriever\n", + "from haystack.components.writers import DocumentWriter\n", + "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", + "from haystack.document_stores.types import DuplicatePolicy\n", + "from haystack.utils import Secret" + ] + }, + { + "cell_type": "markdown", + "id": "2640c061-0244-406d-96c1-b30b1fd1c8b3", + "metadata": {}, + "source": [ + "# Configs" + ] + }, + { + "cell_type": "markdown", + "id": "f7d0a514-2903-4008-9db6-13a4f92595aa", + "metadata": {}, + "source": [ + "We set up some configuration parameters here, including the path to the evaluation repository (https://github.com/deepset-ai/haystack-evaluation) with the dataset that we will use." 
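+    ,
+    "\n",
+    "The ground-truth file loaded in the next section, `eval_questions_relevant_doc.json`, is assumed to contain three parallel lists: the questions, the ground-truth answers, and the source files. For illustration only (the entries below are taken from the ARAGOG example shown later in this notebook; the real file contains many more):\n",
+    "\n",
+    "```python\n",
+    "ground_truth_file_example = {\n",
+    "    'questions': ['What model sizes are reported for BERT, and what are their specifications?'],\n",
+    "    'ground_truths': ['BERTBASE (L=12, H=768, A=12, Total Parameters=110M) and BERTLARGE (L=24, H=1024, A=16, Total Parameters=340M).'],\n",
+    "    'filepaths': ['bert.pdf'],\n",
+    "}\n",
+    "```"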
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "13fc63cb-25fa-44a4-a993-8702da0bcfa3", + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.max_colwidth', 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "fc867378-5858-49fa-9ce9-677cd8a71653", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "OPENAI_API_KEY: ········\n" + ] + } + ], + "source": [ + "os.environ[\"OPENAI_API_KEY\"] = getpass(\"OPENAI_API_KEY: \")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7c82d0cc-6427-4e3b-9a7b-70c381ece80c", + "metadata": {}, + "outputs": [], + "source": [ + "EVALUATION_REPO = Path(f\"{os.environ['HOME']}/Developer/haystack-evaluation/\")\n", + "FILEPATHS = EVALUATION_REPO / \"datasets/ARAGOG/\"\n", + "QA_PATH = FILEPATHS / \"eval_questions_relevant_doc.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "080ef76d-d215-47e8-ae1a-e4f77cb07023", + "metadata": {}, + "outputs": [], + "source": [ + "# We will keep the number of documents retrieved fixed in this notebook.\n", + "TOP_K = 3" + ] + }, + { + "cell_type": "markdown", + "id": "d835cb39-3e42-499f-83c5-939d48d7f5d5", + "metadata": {}, + "source": [ + "# Load the data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a6d57354-4815-4cbf-83a7-6d481625b2de", + "metadata": {}, + "outputs": [], + "source": [ + "question_answers = json.load(open(QA_PATH, \"r\"))\n", + "QUESTIONS = question_answers['questions']\n", + "ANSWERS = question_answers['ground_truths']\n", + "GROUND_TRUTH_FILENAMES = question_answers['filepaths']" + ] + }, + { + "cell_type": "markdown", + "id": "fd8d6f55-6ba7-4681-b45c-3b77afd428b9", + "metadata": {}, + "source": [ + "# Retrieve documents using different retrieval techniques" + ] + }, + { + "cell_type": "markdown", + "id": "231c932d-abc7-4892-acf1-8a31f9ea536b", + "metadata": {}, + "source": [ + "## Define the indexing pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "cea32847-8bb3-4652-8694-34eac89a4a24", + "metadata": {}, + "source": [ + "The function below is based off a version from [here](https://github.com/deepset-ai/haystack-evaluation/blob/6db15f828628a1f31cd54d8657345aaa870cf40e/evaluations/evaluation_aragog.py#L26).\n", + "\n", + "This function creates an indexing pipeline which reads the input PDF files, splits them into documents of size 3 sentences with an overlap of 1 sentence, and embeds them with a [sentence transformer](https://docs.haystack.deepset.ai/docs/sentencetransformersdocumentembedder)." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "5520a958-4f80-400a-98d9-7333d4980f3d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def embedding_indexing(embedding_model: str):\n",
+    "    files_path = Path(FILEPATHS) / \"papers_for_questions\"\n",
+    "    document_store = InMemoryDocumentStore()\n",
+    "    pipeline = Pipeline()\n",
+    "    pipeline.add_component(\"converter\", PyPDFToDocument())\n",
+    "    pipeline.add_component(\"cleaner\", DocumentCleaner())\n",
+    "    pipeline.add_component(\"splitter\", DocumentSplitter(split_by=\"sentence\", split_length=3, split_overlap=1))  # split into chunks of 3 sentences with 1 sentence of overlap\n",
+    "    pipeline.add_component(\"writer\", DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP))\n",
+    "    pipeline.add_component(\"embedder\", SentenceTransformersDocumentEmbedder(embedding_model))\n",
+    "    pipeline.connect(\"converter\", \"cleaner\")\n",
+    "    pipeline.connect(\"cleaner\", \"splitter\")\n",
+    "    pipeline.connect(\"splitter\", \"embedder\")\n",
+    "    pipeline.connect(\"embedder\", \"writer\")\n",
+    "    pdf_files = [files_path / f_name for f_name in os.listdir(files_path)]\n",
+    "    pipeline.run({\"converter\": {\"sources\": pdf_files}})\n",
+    "\n",
+    "    return document_store"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e2001840-5707-48f5-a92a-d74034c91533",
+   "metadata": {},
+   "source": [
+    "## Define retriever pipelines"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8cd90a11-85af-492b-b2cc-45b2b72fea16",
+   "metadata": {},
+   "source": [
+    "The function below takes an embedding model and a document store and defines a retriever pipeline, which takes a user query, embeds it, and uses the query embedding to retrieve relevant documents from the document store."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "167df891-763c-4a39-8f86-d4efd34654e6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_retriever_embedding_pipeline(embedding_model, doc_store):\n",
+    "    retriever_embedding_pipeline = Pipeline()\n",
+    "    retriever_embedding_pipeline.add_component(\"query_embedder\", SentenceTransformersTextEmbedder(\n",
+    "        model=embedding_model, progress_bar=False\n",
+    "    ))\n",
+    "    retriever_embedding_pipeline.add_component(\"retriever\", InMemoryEmbeddingRetriever(doc_store, top_k=TOP_K))\n",
+    "    retriever_embedding_pipeline.connect(\"query_embedder\", \"retriever.query_embedding\")\n",
+    "    return retriever_embedding_pipeline"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "279c93cb-1a0e-410c-a914-68a46846e545",
+   "metadata": {},
+   "source": [
+    "We now index the documents using two different sentence transformer models from HuggingFace:\n",
+    "- the [msmarco model](https://huggingface.co/sentence-transformers/msmarco-distilroberta-base-v2),\n",
+    "- the [mnet model](https://huggingface.co/sentence-transformers/all-mpnet-base-v2).\n",
+    "\n",
+    "We also define the retriever pipelines for the document stores created with the two models above.\n",
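+    "\n",
+    "Below is a minimal sketch of how these pieces can be wired together (the variable names here are illustrative, not the ones used in the following cells). Since BM25 works on the raw documents and ignores embeddings, the BM25 retriever can simply reuse one of the document stores indexed above:\n",
+    "\n",
+    "```python\n",
+    "msmarco_model = \"sentence-transformers/msmarco-distilroberta-base-v2\"\n",
+    "mpnet_model = \"sentence-transformers/all-mpnet-base-v2\"\n",
+    "\n",
+    "# Index the PDF files twice, once per embedding model.\n",
+    "doc_store_msmarco = embedding_indexing(msmarco_model)\n",
+    "doc_store_mpnet = embedding_indexing(mpnet_model)\n",
+    "\n",
+    "# One embedding retriever pipeline per document store.\n",
+    "retriever_msmarco = get_retriever_embedding_pipeline(msmarco_model, doc_store_msmarco)\n",
+    "retriever_mpnet = get_retriever_embedding_pipeline(mpnet_model, doc_store_mpnet)\n",
+    "\n",
+    "# BM25 keyword retrieval over one of the stores indexed above.\n",
+    "bm25_pipeline = Pipeline()\n",
+    "bm25_pipeline.add_component(\"retriever\", InMemoryBM25Retriever(doc_store_msmarco, top_k=TOP_K))\n",
+    "\n",
+    "# Example: retrieve TOP_K documents for the first ground-truth question.\n",
+    "emb_result = retriever_msmarco.run({\"query_embedder\": {\"text\": QUESTIONS[0]}})\n",
+    "bm25_result = bm25_pipeline.run({\"retriever\": {\"query\": QUESTIONS[0]}})\n",
+    "emb_docs = emb_result[\"retriever\"][\"documents\"]\n",
+    "bm25_docs = bm25_result[\"retriever\"][\"documents\"]\n",
+    "```\n",
+    "\n",
+    "The cells below run this indexing for both models; the progress bars in the output come from the document embedder."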
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "f9131044-c28b-4291-a232-62ea09ca1b0a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ea88023eb6a9406784588c2d08ee11a0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches: 0%| | 0/360 [00:00 Dict[str, Any]:\n",
+    "        \"\"\"\n",
+    "        Run the LLM evaluator on the contexts.\n",
+    "        \"\"\"\n",
+    "        all_results = []\n",
+    "        for question, answer, retrieved_contexts in zip(questions, answers, contexts):\n",
+    "            question_results = []\n",
+    "            for retrieved_context in retrieved_contexts:\n",
+    "                result = super(SingleContextRelevanceEvaluator, self).run(questions=[question],\n",
+    "                                                                          contexts=[retrieved_context],\n",
+    "                                                                          answers=[answer])\n",
+    "                # Keep the LLM judgment in its own variable so the ground-truth `answer`\n",
+    "                # is not overwritten for the next retrieved context.\n",
+    "                try:\n",
+    "                    judgment = result['results'][0][\"response\"]\n",
+    "                except (KeyError, IndexError, TypeError):\n",
+    "                    judgment = \"error\"\n",
+    "                question_results.append(judgment)\n",
+    "            all_results.append(question_results)\n",
+    "\n",
+    "        return {\"results\": all_results}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c2839e5d-cfdc-44af-a354-0a5e00787c27",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "evaluator = SingleContextRelevanceEvaluator(raise_on_failure=False)\n",
+    "\n",
+    "evaluator.generator = OpenAIGenerator(\n",
+    "    model=\"gpt-4-turbo\",\n",
+    "    generation_kwargs={\"response_format\": {\"type\": \"json_object\"}, \"seed\": 42},\n",
+    ")\n",
+    "\n",
+    "results = evaluator.run(questions=questions,\n",
+    "                        contexts=contexts_contents,\n",
+    "                        answers=answers)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1a69d601-0e7e-40e2-be28-6230e27b2dac",
+   "metadata": {},
+   "source": [
+    "Process the list of candidate documents by adding the LLM labels."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "d38d5bb2-6372-4e89-9450-4108e9525447",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "candidate_docs_with_LLM_scores = []\n",
+    "\n",
+    "for dataset_row, row_context_results in zip(candidate_docs, results['results']):\n",
+    "    new_row = dataset_row.copy()\n",
+    "    contexts_with_scores = []\n",
+    "    for context, context_score in zip(dataset_row['docs'], row_context_results):\n",
+    "        context['LLM_judgment'] = context_score\n",
+    "        contexts_with_scores.append(context)\n",
+    "    new_row['docs'] = contexts_with_scores\n",
+    "    candidate_docs_with_LLM_scores.append(new_row)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "1997d497-e7eb-4ffd-8adf-195730f2fb6c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'question': 'What model sizes are reported for BERT, and what are their specifications?',\n",
+       " 'answer': 'BERTBASE (L=12, H=768, A=12, Total Parameters=110M) and BERTLARGE (L=24, H=1024, A=16, Total Parameters=340M).',\n",
+       " 'filename': 'bert.pdf',\n",
+       " 'docs': [{'content': '3\\nWe primarily report results on two model sizes:\\nBERT BASE (L=12, H=768, A=12, Total Param-\\neters=110M) and BERT LARGE (L=24, H=1024,\\nA=16, Total Parameters=340M).\\nBERT BASE was chosen to have the same model\\nsize as OpenAI GPT for comparison purposes.\\nCritically, however, the BERT Transformer uses\\nbidirectional self-attention, while the GPT Trans-\\nformer uses constrained self-attention where every\\ntoken can only attend to context to its left.',\n",
+       " 'score': 194.82163561646738,\n",
+       " 'method': 'embedding msmarco',\n",
+       " 'LLM_judgment': 'full'},\n",
+       " {'content': ', 2018). 
By contrast, BERT BASE\\ncontains 110M parameters and BERT LARGE con-\\ntains 340M parameters.\\nIt has long been known that increasing the\\nmodel size will lead to continual improvements\\non large-scale tasks such as machine translation\\nand language modeling, which is demonstrated\\nby the LM perplexity of held-out training data\\nshown in Table 6.',\n", + " 'score': 181.8022687235342,\n", + " 'method': 'embedding msmarco',\n", + " 'LLM_judgment': 'full'},\n", + " {'content': 'e., Transformer blocks) as L, the hidden size as\\nH, and the number of self-attention heads as A.3\\nWe primarily report results on two model sizes:\\nBERT BASE (L=12, H=768, A=12, Total Param-\\neters=110M) and BERT LARGE (L=24, H=1024,\\nA=16, Total Parameters=340M).',\n", + " 'score': 181.50001496180622,\n", + " 'method': 'embedding msmarco',\n", + " 'LLM_judgment': 'full'},\n", + " {'content': '8 93.7\\nTable 6: Ablation over BERT model size. #L = the\\nnumber of layers; #H = hidden size; #A = number of at-\\ntention heads.',\n", + " 'score': 0.6541984736828388,\n", + " 'method': 'embedding mnet',\n", + " 'LLM_judgment': 'full'},\n", + " {'content': '\\nTraining of BERT BASE was performed on 4\\nCloud TPUs in Pod configuration (16 TPU chips\\ntotal).13Training of BERT LARGE was performed\\non 16 Cloud TPUs (64 TPU chips total). Each pre-\\ntraining took 4 days to complete.',\n", + " 'score': 0.5342425096311341,\n", + " 'method': 'embedding mnet',\n", + " 'LLM_judgment': 'full'},\n", + " {'content': '9 93.3\\n24 1024 16 3.23 86.',\n", + " 'score': 73.56936802252953,\n", + " 'method': 'BM25',\n", + " 'LLM_judgment': 'no'},\n", + " {'content': ' F1 scores are reported for QQP and MRPC, Spearman correlations are reported for STS-B, and\\naccuracy scores are reported for the other tasks. We exclude entries that use BERT as one of their components.\\nWe use a batch size of 32 and fine-tune for 3\\nepochs over the data for all GLUE tasks.',\n", + " 'score': 29.533500736317514,\n", + " 'method': 'BM25',\n", + " 'LLM_judgment': 'no'},\n", + " {'content': ' The “Average” column is slightly different\\nthan the official GLUE score, since we exclude the problematic WNLI set.8BERT and OpenAI GPT are single-\\nmodel, single task. F1 scores are reported for QQP and MRPC, Spearman correlations are reported for STS-B, and\\naccuracy scores are reported for the other tasks.',\n", + " 'score': 27.849829888401317,\n", + " 'method': 'BM25',\n", + " 'LLM_judgment': 'no'},\n", + " {'content': ' Note that\\nBERT BASE and OpenAI GPT are nearly identical\\nin terms of model architecture apart from the at-\\ntention masking. For the largest and most widely\\nreported GLUE task, MNLI, BERT obtains a 4.6%\\nabsolute accuracy improvement.',\n", + " 'score': 27.043483442059646,\n", + " 'method': 'BM25',\n", + " 'LLM_judgment': 'no'}]}" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "candidate_docs_with_LLM_scores[1]" + ] + }, + { + "cell_type": "markdown", + "id": "e00c1de3-e328-42aa-8146-891f099a1419", + "metadata": {}, + "source": [ + "## Clean & save the dataset" + ] + }, + { + "cell_type": "markdown", + "id": "fbe26e89-094a-4298-a5e5-840f6dd2ed32", + "metadata": {}, + "source": [ + "In this section, we remove any QA pair that has no document labelled as relevant by the LLM. We also sort the documents, so that documents that have been scored as \"full\" match rank higher than the ones that have been given a \"partial\" match. 
This is relevant for some of the retrieval metrics later used." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "2fa5d4dd-327a-41d2-997c-bad577cdcc2c", + "metadata": {}, + "outputs": [], + "source": [ + "def prioritise_scores(score):\n", + " if score == \"full\":\n", + " return 0\n", + " if score == \"partial\":\n", + " return 1\n", + " return 2" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "6fdc1702-1792-4295-b430-78e40996cda0", + "metadata": {}, + "outputs": [], + "source": [ + "final_dataset = []\n", + "for row in candidate_docs_with_LLM_scores:\n", + " new_row = row.copy()\n", + " docs = row['docs']\n", + " filtered_docs = [doc for doc in docs if doc['LLM_judgment'] in [\"full\", \"partial\"]]\n", + " sorted_docs = sorted(filtered_docs, key=lambda x: prioritise_scores(x['LLM_judgment']))\n", + " if len(sorted_docs):\n", + " new_row['docs'] = sorted_docs\n", + " final_dataset.append(new_row)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "39f8c11c-994b-4ba0-8ec6-1e1325b4bc9e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'question': 'What model sizes are reported for BERT, and what are their specifications?',\n", + " 'answer': 'BERTBASE (L=12, H=768, A=12, Total Parameters=110M) and BERTLARGE (L=24, H=1024, A=16, Total Parameters=340M).',\n", + " 'filename': 'bert.pdf',\n", + " 'docs': [{'content': '3\\nWe primarily report results on two model sizes:\\nBERT BASE (L=12, H=768, A=12, Total Param-\\neters=110M) and BERT LARGE (L=24, H=1024,\\nA=16, Total Parameters=340M).\\nBERT BASE was chosen to have the same model\\nsize as OpenAI GPT for comparison purposes.\\nCritically, however, the BERT Transformer uses\\nbidirectional self-attention, while the GPT Trans-\\nformer uses constrained self-attention where every\\ntoken can only attend to context to its left.',\n", + " 'score': 194.82163561646738,\n", + " 'method': 'embedding msmarco',\n", + " 'LLM_judgment': 'full'},\n", + " {'content': ', 2018). By contrast, BERT BASE\\ncontains 110M parameters and BERT LARGE con-\\ntains 340M parameters.\\nIt has long been known that increasing the\\nmodel size will lead to continual improvements\\non large-scale tasks such as machine translation\\nand language modeling, which is demonstrated\\nby the LM perplexity of held-out training data\\nshown in Table 6.',\n", + " 'score': 181.8022687235342,\n", + " 'method': 'embedding msmarco',\n", + " 'LLM_judgment': 'full'},\n", + " {'content': 'e., Transformer blocks) as L, the hidden size as\\nH, and the number of self-attention heads as A.3\\nWe primarily report results on two model sizes:\\nBERT BASE (L=12, H=768, A=12, Total Param-\\neters=110M) and BERT LARGE (L=24, H=1024,\\nA=16, Total Parameters=340M).',\n", + " 'score': 181.50001496180622,\n", + " 'method': 'embedding msmarco',\n", + " 'LLM_judgment': 'full'},\n", + " {'content': '8 93.7\\nTable 6: Ablation over BERT model size. #L = the\\nnumber of layers; #H = hidden size; #A = number of at-\\ntention heads.',\n", + " 'score': 0.6541984736828388,\n", + " 'method': 'embedding mnet',\n", + " 'LLM_judgment': 'full'},\n", + " {'content': '\\nTraining of BERT BASE was performed on 4\\nCloud TPUs in Pod configuration (16 TPU chips\\ntotal).13Training of BERT LARGE was performed\\non 16 Cloud TPUs (64 TPU chips total). 
Each pre-\\ntraining took 4 days to complete.',\n", + " 'score': 0.5342425096311341,\n", + " 'method': 'embedding mnet',\n", + " 'LLM_judgment': 'full'}]}" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_dataset[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "9ea67a2a-eaa3-4363-8a8d-473ba0627e65", + "metadata": {}, + "outputs": [], + "source": [ + "# json.dump(final_dataset, open(\"final_dataset.json\", \"w\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}