From b93925fd038a068a64739a46517d706f85b43bbf Mon Sep 17 00:00:00 2001
From: Maria Rosario Mestre
Date: Mon, 22 Jul 2024 14:38:38 +0100
Subject: [PATCH] notebook to get ground-truth relevant documents

---
 notebooks/get-ground-truth-data.ipynb | 1229 +++++++++++++++++++++++++
 1 file changed, 1229 insertions(+)
 create mode 100644 notebooks/get-ground-truth-data.ipynb

diff --git a/notebooks/get-ground-truth-data.ipynb b/notebooks/get-ground-truth-data.ipynb
new file mode 100644
index 0000000..c70f63d
--- /dev/null
+++ b/notebooks/get-ground-truth-data.ipynb
@@ -0,0 +1,1229 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "52691fa1-445c-4929-8357-b25059409799",
+   "metadata": {},
+   "source": [
+    "In this notebook, we will generate a dataset of ground-truth documents to evaluate retrievers, starting from pairs of ground-truth questions and answers. Specifically, we will compare the following 3 retrieval techniques:\n",
+    "- a retriever based on embedding vector similarity, using a sentence transformer model from HuggingFace: the [msmarco model](https://huggingface.co/sentence-transformers/msmarco-distilroberta-base-v2),\n",
+    "- the same approach but with a different model: the [mnet model](https://huggingface.co/sentence-transformers/all-mpnet-base-v2),\n",
+    "- a BM25 keyword-based [retriever](https://docs.haystack.deepset.ai/docs/inmemorybm25retriever).\n",
+    "\n",
+    "To generate the dataset of ground-truth documents, we follow these steps:\n",
+    "- we first load the dataset (we use the ARAGOG dataset from this [repository](https://github.com/deepset-ai/haystack-evaluation)),\n",
+    "- we then index the files from the dataset using the 2 embedding models above,\n",
+    "- we define 3 Haystack retriever pipelines for the 3 retrieval techniques above,\n",
+    "- we run the pipelines using _both_ the query and the ground-truth answer to generate a list of candidate documents,\n",
+    "- we then use an LLM to evaluate each candidate document with respect to the question and answer and assign it a label of full match, partial match or no match,\n",
+    "- we do some processing of the data and save it.",
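+    "\n",
+    "For reference, each record in the final dataset pairs a question and its ground-truth answer with the candidate documents and the label assigned by the LLM. The exact fields can be seen in the outputs further down; roughly, a record looks like this (values shortened here for readability):\n",
+    "\n",
+    "```python\n",
+    "example_record = {\n",
+    "    'question': 'What model sizes are reported for BERT, and what are their specifications?',\n",
+    "    'answer': 'BERTBASE (L=12, H=768, A=12, Total Parameters=110M) and BERTLARGE (L=24, H=1024, A=16, Total Parameters=340M).',\n",
+    "    'filename': 'bert.pdf',\n",
+    "    'docs': [\n",
+    "        {\n",
+    "            'content': 'We primarily report results on two model sizes: ...',\n",
+    "            'score': 194.82,\n",
+    "            'method': 'embedding msmarco',\n",
+    "            'LLM_judgment': 'full',\n",
+    "        },\n",
+    "    ],\n",
+    "}\n",
+    "```"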
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a54822f9-bcd8-4182-a813-8919fe6dacaa", + "metadata": {}, + "outputs": [], + "source": [ + "from getpass import getpass\n", + "from IPython.display import display\n", + "import json\n", + "import os\n", + "import pandas as pd\n", + "from pathlib import Path\n", + "import sys\n", + "from typing import Any, Dict, List, Optional\n", + "\n", + "from haystack import component, Pipeline, Document\n", + "from haystack.components.builders import PromptBuilder, AnswerBuilder\n", + "from haystack.components.converters import PyPDFToDocument\n", + "from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder\n", + "from haystack.components.evaluators import SASEvaluator\n", + "from haystack.components.evaluators.llm_evaluator import LLMEvaluator\n", + "from haystack.components.generators import OpenAIGenerator\n", + "from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter\n", + "from haystack.components.retrievers import InMemoryEmbeddingRetriever, InMemoryBM25Retriever\n", + "from haystack.components.writers import DocumentWriter\n", + "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", + "from haystack.document_stores.types import DuplicatePolicy\n", + "from haystack.utils import Secret" + ] + }, + { + "cell_type": "markdown", + "id": "2640c061-0244-406d-96c1-b30b1fd1c8b3", + "metadata": {}, + "source": [ + "# Configs" + ] + }, + { + "cell_type": "markdown", + "id": "f7d0a514-2903-4008-9db6-13a4f92595aa", + "metadata": {}, + "source": [ + "We set up some configuration parameters here, including the path to the evaluation repository (https://github.com/deepset-ai/haystack-evaluation) with the dataset that we will use." 
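+    ,
+    "\n",
+    "The ground-truth file loaded in the next section, `eval_questions_relevant_doc.json`, is assumed to contain three parallel lists: the questions, the ground-truth answers, and the source files. For illustration only (the entries below are taken from the ARAGOG example shown later in this notebook; the real file contains many more):\n",
+    "\n",
+    "```python\n",
+    "ground_truth_file_example = {\n",
+    "    'questions': ['What model sizes are reported for BERT, and what are their specifications?'],\n",
+    "    'ground_truths': ['BERTBASE (L=12, H=768, A=12, Total Parameters=110M) and BERTLARGE (L=24, H=1024, A=16, Total Parameters=340M).'],\n",
+    "    'filepaths': ['bert.pdf'],\n",
+    "}\n",
+    "```"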
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "13fc63cb-25fa-44a4-a993-8702da0bcfa3", + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.max_colwidth', 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "fc867378-5858-49fa-9ce9-677cd8a71653", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "OPENAI_API_KEY: ········\n" + ] + } + ], + "source": [ + "os.environ[\"OPENAI_API_KEY\"] = getpass(\"OPENAI_API_KEY: \")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7c82d0cc-6427-4e3b-9a7b-70c381ece80c", + "metadata": {}, + "outputs": [], + "source": [ + "EVALUATION_REPO = Path(f\"{os.environ['HOME']}/Developer/haystack-evaluation/\")\n", + "FILEPATHS = EVALUATION_REPO / \"datasets/ARAGOG/\"\n", + "QA_PATH = FILEPATHS / \"eval_questions_relevant_doc.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "080ef76d-d215-47e8-ae1a-e4f77cb07023", + "metadata": {}, + "outputs": [], + "source": [ + "# We will keep the number of documents retrieved fixed in this notebook.\n", + "TOP_K = 3" + ] + }, + { + "cell_type": "markdown", + "id": "d835cb39-3e42-499f-83c5-939d48d7f5d5", + "metadata": {}, + "source": [ + "# Load the data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a6d57354-4815-4cbf-83a7-6d481625b2de", + "metadata": {}, + "outputs": [], + "source": [ + "question_answers = json.load(open(QA_PATH, \"r\"))\n", + "QUESTIONS = question_answers['questions']\n", + "ANSWERS = question_answers['ground_truths']\n", + "GROUND_TRUTH_FILENAMES = question_answers['filepaths']" + ] + }, + { + "cell_type": "markdown", + "id": "fd8d6f55-6ba7-4681-b45c-3b77afd428b9", + "metadata": {}, + "source": [ + "# Retrieve documents using different retrieval techniques" + ] + }, + { + "cell_type": "markdown", + "id": "231c932d-abc7-4892-acf1-8a31f9ea536b", + "metadata": {}, + "source": [ + "## Define the indexing pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "cea32847-8bb3-4652-8694-34eac89a4a24", + "metadata": {}, + "source": [ + "The function below is based off a version from [here](https://github.com/deepset-ai/haystack-evaluation/blob/6db15f828628a1f31cd54d8657345aaa870cf40e/evaluations/evaluation_aragog.py#L26).\n", + "\n", + "This function creates an indexing pipeline which reads the input PDF files, splits them into documents of size 3 sentences with an overlap of 1 sentence, and embeds them with a [sentence transformer](https://docs.haystack.deepset.ai/docs/sentencetransformersdocumentembedder)." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "5520a958-4f80-400a-98d9-7333d4980f3d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def embedding_indexing(embedding_model: str):\n",
+    "    files_path = Path(FILEPATHS) / \"papers_for_questions\"\n",
+    "    document_store = InMemoryDocumentStore()\n",
+    "    pipeline = Pipeline()\n",
+    "    pipeline.add_component(\"converter\", PyPDFToDocument())\n",
+    "    pipeline.add_component(\"cleaner\", DocumentCleaner())\n",
+    "    pipeline.add_component(\"splitter\", DocumentSplitter(split_by=\"sentence\", split_length=3, split_overlap=1))  # split into chunks of 3 sentences with 1 sentence of overlap\n",
+    "    pipeline.add_component(\"writer\", DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP))\n",
+    "    pipeline.add_component(\"embedder\", SentenceTransformersDocumentEmbedder(embedding_model))\n",
+    "    pipeline.connect(\"converter\", \"cleaner\")\n",
+    "    pipeline.connect(\"cleaner\", \"splitter\")\n",
+    "    pipeline.connect(\"splitter\", \"embedder\")\n",
+    "    pipeline.connect(\"embedder\", \"writer\")\n",
+    "    pdf_files = [files_path / f_name for f_name in os.listdir(files_path)]\n",
+    "    pipeline.run({\"converter\": {\"sources\": pdf_files}})\n",
+    "\n",
+    "    return document_store"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e2001840-5707-48f5-a92a-d74034c91533",
+   "metadata": {},
+   "source": [
+    "## Define retriever pipelines"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8cd90a11-85af-492b-b2cc-45b2b72fea16",
+   "metadata": {},
+   "source": [
+    "The function below takes an embedding model and a document store and defines a retriever pipeline, which takes a user query, embeds it, and uses the query embedding to retrieve relevant documents from the document store."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "167df891-763c-4a39-8f86-d4efd34654e6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_retriever_embedding_pipeline(embedding_model, doc_store):\n",
+    "    retriever_embedding_pipeline = Pipeline()\n",
+    "    retriever_embedding_pipeline.add_component(\"query_embedder\", SentenceTransformersTextEmbedder(\n",
+    "        model=embedding_model, progress_bar=False\n",
+    "    ))\n",
+    "    retriever_embedding_pipeline.add_component(\"retriever\", InMemoryEmbeddingRetriever(doc_store, top_k=TOP_K))\n",
+    "    retriever_embedding_pipeline.connect(\"query_embedder\", \"retriever.query_embedding\")\n",
+    "    return retriever_embedding_pipeline"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "279c93cb-1a0e-410c-a914-68a46846e545",
+   "metadata": {},
+   "source": [
+    "We now index the documents using two different sentence transformer models from HuggingFace:\n",
+    "- the [msmarco model](https://huggingface.co/sentence-transformers/msmarco-distilroberta-base-v2),\n",
+    "- the [mnet model](https://huggingface.co/sentence-transformers/all-mpnet-base-v2).\n",
+    "\n",
+    "We also define the retriever pipelines for the document stores created with the two models above.\n",
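+    "\n",
+    "Below is a minimal sketch of how these pieces can be wired together (the variable names here are illustrative, not the ones used in the following cells). Since BM25 works on the raw documents and ignores embeddings, the BM25 retriever can simply reuse one of the document stores indexed above:\n",
+    "\n",
+    "```python\n",
+    "msmarco_model = \"sentence-transformers/msmarco-distilroberta-base-v2\"\n",
+    "mpnet_model = \"sentence-transformers/all-mpnet-base-v2\"\n",
+    "\n",
+    "# Index the PDF files twice, once per embedding model.\n",
+    "doc_store_msmarco = embedding_indexing(msmarco_model)\n",
+    "doc_store_mpnet = embedding_indexing(mpnet_model)\n",
+    "\n",
+    "# One embedding retriever pipeline per document store.\n",
+    "retriever_msmarco = get_retriever_embedding_pipeline(msmarco_model, doc_store_msmarco)\n",
+    "retriever_mpnet = get_retriever_embedding_pipeline(mpnet_model, doc_store_mpnet)\n",
+    "\n",
+    "# BM25 keyword retrieval over one of the stores indexed above.\n",
+    "bm25_pipeline = Pipeline()\n",
+    "bm25_pipeline.add_component(\"retriever\", InMemoryBM25Retriever(doc_store_msmarco, top_k=TOP_K))\n",
+    "\n",
+    "# Example: retrieve TOP_K documents for the first ground-truth question.\n",
+    "emb_result = retriever_msmarco.run({\"query_embedder\": {\"text\": QUESTIONS[0]}})\n",
+    "bm25_result = bm25_pipeline.run({\"retriever\": {\"query\": QUESTIONS[0]}})\n",
+    "emb_docs = emb_result[\"retriever\"][\"documents\"]\n",
+    "bm25_docs = bm25_result[\"retriever\"][\"documents\"]\n",
+    "```\n",
+    "\n",
+    "The cells below run this indexing for both models; the progress bars in the output come from the document embedder."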
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "f9131044-c28b-4291-a232-62ea09ca1b0a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ea88023eb6a9406784588c2d08ee11a0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches: 0%| | 0/360 [00:00 Dict[str, Any]:\n",
+    "        \"\"\"\n",
+    "        Run the LLM evaluator on the contexts.\n",
+    "        \"\"\"\n",
+    "        all_results = []\n",
+    "        for question, answer, retrieved_contexts in zip(questions, answers, contexts):\n",
+    "            question_results = []\n",
+    "            for retrieved_context in retrieved_contexts:\n",
+    "                result = super(SingleContextRelevanceEvaluator, self).run(questions=[question],\n",
+    "                                                                          contexts=[retrieved_context],\n",
+    "                                                                          answers=[answer])\n",
+    "                # Keep the LLM judgment in its own variable so the ground-truth `answer`\n",
+    "                # is not overwritten for the next retrieved context.\n",
+    "                try:\n",
+    "                    judgment = result['results'][0][\"response\"]\n",
+    "                except (KeyError, IndexError, TypeError):\n",
+    "                    judgment = \"error\"\n",
+    "                question_results.append(judgment)\n",
+    "            all_results.append(question_results)\n",
+    "\n",
+    "        return {\"results\": all_results}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c2839e5d-cfdc-44af-a354-0a5e00787c27",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "evaluator = SingleContextRelevanceEvaluator(raise_on_failure=False)\n",
+    "\n",
+    "evaluator.generator = OpenAIGenerator(\n",
+    "    model=\"gpt-4-turbo\",\n",
+    "    generation_kwargs={\"response_format\": {\"type\": \"json_object\"}, \"seed\": 42},\n",
+    ")\n",
+    "\n",
+    "results = evaluator.run(questions=questions,\n",
+    "                        contexts=contexts_contents,\n",
+    "                        answers=answers)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1a69d601-0e7e-40e2-be28-6230e27b2dac",
+   "metadata": {},
+   "source": [
+    "Process the list of candidate documents by adding the LLM labels."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "d38d5bb2-6372-4e89-9450-4108e9525447",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "candidate_docs_with_LLM_scores = []\n",
+    "\n",
+    "for dataset_row, row_context_results in zip(candidate_docs, results['results']):\n",
+    "    new_row = dataset_row.copy()\n",
+    "    contexts_with_scores = []\n",
+    "    for context, context_score in zip(dataset_row['docs'], row_context_results):\n",
+    "        context['LLM_judgment'] = context_score\n",
+    "        contexts_with_scores.append(context)\n",
+    "    new_row['docs'] = contexts_with_scores\n",
+    "    candidate_docs_with_LLM_scores.append(new_row)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "1997d497-e7eb-4ffd-8adf-195730f2fb6c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'question': 'What model sizes are reported for BERT, and what are their specifications?',\n",
+       " 'answer': 'BERTBASE (L=12, H=768, A=12, Total Parameters=110M) and BERTLARGE (L=24, H=1024, A=16, Total Parameters=340M).',\n",
+       " 'filename': 'bert.pdf',\n",
+       " 'docs': [{'content': '3\\nWe primarily report results on two model sizes:\\nBERT BASE (L=12, H=768, A=12, Total Param-\\neters=110M) and BERT LARGE (L=24, H=1024,\\nA=16, Total Parameters=340M).\\nBERT BASE was chosen to have the same model\\nsize as OpenAI GPT for comparison purposes.\\nCritically, however, the BERT Transformer uses\\nbidirectional self-attention, while the GPT Trans-\\nformer uses constrained self-attention where every\\ntoken can only attend to context to its left.',\n",
+       " 'score': 194.82163561646738,\n",
+       " 'method': 'embedding msmarco',\n",
+       " 'LLM_judgment': 'full'},\n",
+       " {'content': ', 2018). 
By contrast, BERT BASE\\ncontains 110M parameters and BERT LARGE con-\\ntains 340M parameters.\\nIt has long been known that increasing the\\nmodel size will lead to continual improvements\\non large-scale tasks such as machine translation\\nand language modeling, which is demonstrated\\nby the LM perplexity of held-out training data\\nshown in Table 6.',\n", + " 'score': 181.8022687235342,\n", + " 'method': 'embedding msmarco',\n", + " 'LLM_judgment': 'full'},\n", + " {'content': 'e., Transformer blocks) as L, the hidden size as\\nH, and the number of self-attention heads as A.3\\nWe primarily report results on two model sizes:\\nBERT BASE (L=12, H=768, A=12, Total Param-\\neters=110M) and BERT LARGE (L=24, H=1024,\\nA=16, Total Parameters=340M).',\n", + " 'score': 181.50001496180622,\n", + " 'method': 'embedding msmarco',\n", + " 'LLM_judgment': 'full'},\n", + " {'content': '8 93.7\\nTable 6: Ablation over BERT model size. #L = the\\nnumber of layers; #H = hidden size; #A = number of at-\\ntention heads.',\n", + " 'score': 0.6541984736828388,\n", + " 'method': 'embedding mnet',\n", + " 'LLM_judgment': 'full'},\n", + " {'content': '\\nTraining of BERT BASE was performed on 4\\nCloud TPUs in Pod configuration (16 TPU chips\\ntotal).13Training of BERT LARGE was performed\\non 16 Cloud TPUs (64 TPU chips total). Each pre-\\ntraining took 4 days to complete.',\n", + " 'score': 0.5342425096311341,\n", + " 'method': 'embedding mnet',\n", + " 'LLM_judgment': 'full'},\n", + " {'content': '9 93.3\\n24 1024 16 3.23 86.',\n", + " 'score': 73.56936802252953,\n", + " 'method': 'BM25',\n", + " 'LLM_judgment': 'no'},\n", + " {'content': ' F1 scores are reported for QQP and MRPC, Spearman correlations are reported for STS-B, and\\naccuracy scores are reported for the other tasks. We exclude entries that use BERT as one of their components.\\nWe use a batch size of 32 and fine-tune for 3\\nepochs over the data for all GLUE tasks.',\n", + " 'score': 29.533500736317514,\n", + " 'method': 'BM25',\n", + " 'LLM_judgment': 'no'},\n", + " {'content': ' The “Average” column is slightly different\\nthan the official GLUE score, since we exclude the problematic WNLI set.8BERT and OpenAI GPT are single-\\nmodel, single task. F1 scores are reported for QQP and MRPC, Spearman correlations are reported for STS-B, and\\naccuracy scores are reported for the other tasks.',\n", + " 'score': 27.849829888401317,\n", + " 'method': 'BM25',\n", + " 'LLM_judgment': 'no'},\n", + " {'content': ' Note that\\nBERT BASE and OpenAI GPT are nearly identical\\nin terms of model architecture apart from the at-\\ntention masking. For the largest and most widely\\nreported GLUE task, MNLI, BERT obtains a 4.6%\\nabsolute accuracy improvement.',\n", + " 'score': 27.043483442059646,\n", + " 'method': 'BM25',\n", + " 'LLM_judgment': 'no'}]}" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "candidate_docs_with_LLM_scores[1]" + ] + }, + { + "cell_type": "markdown", + "id": "e00c1de3-e328-42aa-8146-891f099a1419", + "metadata": {}, + "source": [ + "## Clean & save the dataset" + ] + }, + { + "cell_type": "markdown", + "id": "fbe26e89-094a-4298-a5e5-840f6dd2ed32", + "metadata": {}, + "source": [ + "In this section, we remove any QA pair that has no document labelled as relevant by the LLM. We also sort the documents, so that documents that have been scored as \"full\" match rank higher than the ones that have been given a \"partial\" match. 
This is relevant for some of the retrieval metrics later used." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "2fa5d4dd-327a-41d2-997c-bad577cdcc2c", + "metadata": {}, + "outputs": [], + "source": [ + "def prioritise_scores(score):\n", + " if score == \"full\":\n", + " return 0\n", + " if score == \"partial\":\n", + " return 1\n", + " return 2" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "6fdc1702-1792-4295-b430-78e40996cda0", + "metadata": {}, + "outputs": [], + "source": [ + "final_dataset = []\n", + "for row in candidate_docs_with_LLM_scores:\n", + " new_row = row.copy()\n", + " docs = row['docs']\n", + " filtered_docs = [doc for doc in docs if doc['LLM_judgment'] in [\"full\", \"partial\"]]\n", + " sorted_docs = sorted(filtered_docs, key=lambda x: prioritise_scores(x['LLM_judgment']))\n", + " if len(sorted_docs):\n", + " new_row['docs'] = sorted_docs\n", + " final_dataset.append(new_row)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "39f8c11c-994b-4ba0-8ec6-1e1325b4bc9e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'question': 'What model sizes are reported for BERT, and what are their specifications?',\n", + " 'answer': 'BERTBASE (L=12, H=768, A=12, Total Parameters=110M) and BERTLARGE (L=24, H=1024, A=16, Total Parameters=340M).',\n", + " 'filename': 'bert.pdf',\n", + " 'docs': [{'content': '3\\nWe primarily report results on two model sizes:\\nBERT BASE (L=12, H=768, A=12, Total Param-\\neters=110M) and BERT LARGE (L=24, H=1024,\\nA=16, Total Parameters=340M).\\nBERT BASE was chosen to have the same model\\nsize as OpenAI GPT for comparison purposes.\\nCritically, however, the BERT Transformer uses\\nbidirectional self-attention, while the GPT Trans-\\nformer uses constrained self-attention where every\\ntoken can only attend to context to its left.',\n", + " 'score': 194.82163561646738,\n", + " 'method': 'embedding msmarco',\n", + " 'LLM_judgment': 'full'},\n", + " {'content': ', 2018). By contrast, BERT BASE\\ncontains 110M parameters and BERT LARGE con-\\ntains 340M parameters.\\nIt has long been known that increasing the\\nmodel size will lead to continual improvements\\non large-scale tasks such as machine translation\\nand language modeling, which is demonstrated\\nby the LM perplexity of held-out training data\\nshown in Table 6.',\n", + " 'score': 181.8022687235342,\n", + " 'method': 'embedding msmarco',\n", + " 'LLM_judgment': 'full'},\n", + " {'content': 'e., Transformer blocks) as L, the hidden size as\\nH, and the number of self-attention heads as A.3\\nWe primarily report results on two model sizes:\\nBERT BASE (L=12, H=768, A=12, Total Param-\\neters=110M) and BERT LARGE (L=24, H=1024,\\nA=16, Total Parameters=340M).',\n", + " 'score': 181.50001496180622,\n", + " 'method': 'embedding msmarco',\n", + " 'LLM_judgment': 'full'},\n", + " {'content': '8 93.7\\nTable 6: Ablation over BERT model size. #L = the\\nnumber of layers; #H = hidden size; #A = number of at-\\ntention heads.',\n", + " 'score': 0.6541984736828388,\n", + " 'method': 'embedding mnet',\n", + " 'LLM_judgment': 'full'},\n", + " {'content': '\\nTraining of BERT BASE was performed on 4\\nCloud TPUs in Pod configuration (16 TPU chips\\ntotal).13Training of BERT LARGE was performed\\non 16 Cloud TPUs (64 TPU chips total). 
Each pre-\\ntraining took 4 days to complete.',\n", + " 'score': 0.5342425096311341,\n", + " 'method': 'embedding mnet',\n", + " 'LLM_judgment': 'full'}]}" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_dataset[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "9ea67a2a-eaa3-4363-8a8d-473ba0627e65", + "metadata": {}, + "outputs": [], + "source": [ + "# json.dump(final_dataset, open(\"final_dataset.json\", \"w\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}