diff --git a/ch07/02_dataset-utilities/create-passive-voice-entries.ipynb b/ch07/02_dataset-utilities/create-passive-voice-entries.ipynb
index 45141cca..04d5b4cc 100644
--- a/ch07/02_dataset-utilities/create-passive-voice-entries.ipynb
+++ b/ch07/02_dataset-utilities/create-passive-voice-entries.ipynb
@@ -54,7 +54,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# pip install -r requirements-exra.txt"
+    "# pip install -r requirements-extra.txt"
    ]
   },
   {
@@ -107,31 +107,28 @@
    "id": "89343a84-0ddc-42fc-bf50-298a342b93c0",
    "metadata": {},
    "source": [
-    "- First, we need to provide our OpenAI API key, which can be found at https://platform.openai.com/api-keys\n",
-    "- Make sure not to share this key with anyone (make sure to delete it from this notebook in case you intend to share it; I recommend deleting the entire notebook cell that contains the key)\n",
-    "- Alternatively, delete the used API key from your account after you are finished to make sure it can't be abused later"
+    "- First, we need to provide our OpenAI API secret key, which can be found at https://platform.openai.com/api-keys\n",
+    "- Make sure not to share this key with anyone\n",
+    "- Add this secret key (`\"sk-...\"`) to the `config.json` file in this folder"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "8ba8760c-1635-43cf-b039-9d1557b664c4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "OPENAI_API_KEY = \"your OpenAI API key\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "id": "26900564-aba7-48ba-8ee8-6cc9a505a25c",
    "metadata": {},
    "outputs": [],
    "source": [
+    "import json\n",
     "from openai import OpenAI\n",
     "\n",
-    "client = OpenAI(api_key=OPENAI_API_KEY)"
+    "# Load API key from a JSON file. \n",
+    "# Make sure to replace \"sk-...\" with your actual API key from https://platform.openai.com/api-keys\n",
+    "with open(\"config.json\", \"r\") as config_file:\n",
+    "    config = json.load(config_file)\n",
+    "    api_key = config[\"OPENAI_API_KEY\"]\n",
+    "\n",
+    "client = OpenAI(api_key=api_key)"
    ]
   },
   {
@@ -144,7 +141,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "id": "08e9ef2e-e816-4283-840e-43625791ad33",
    "metadata": {},
    "outputs": [
@@ -154,7 +151,7 @@
        "'Breakfast was eaten by me.'"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -193,7 +190,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "id": "8b2d393a-aa92-4190-9d44-44326a6f699b",
    "metadata": {},
    "outputs": [
@@ -226,7 +223,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "id": "735cc089-d127-480a-b39d-0782581f0c41",
    "metadata": {},
    "outputs": [
@@ -255,7 +252,7 @@
       ">> The three primary colors are red, blue, and yellow.\n",
       "\n",
       "Output:\n",
-      ">> Red, blue, and yellow are the three primary colors.\n",
+      ">> Red, blue, and yellow are considered the three primary colors.\n",
       "\n",
       "-------------------------\n",
       "\n",
@@ -299,7 +296,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
    "id": "4f700d4b-19e5-4404-afa7-b0f093024232",
    "metadata": {},
    "outputs": [
@@ -307,7 +304,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|█████████████████████████████████████████████| 5/5 [00:05<00:00, 1.12s/it]\n"
+      "100%|██████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00, 1.23it/s]\n"
      ]
     }
    ],
@@ -331,7 +328,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
    "id": "5b6eaa87-a86d-42a1-a20a-b764b0d559d4",
    "metadata": {},
    "outputs": [
@@ -344,7 +341,7 @@
        " 'output_2': 'The sentence is \"sleeps.\"'}"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -363,7 +360,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
    "id": "eef99407-8ffd-4a63-b7ab-ffe30c0f0677",
    "metadata": {},
    "outputs": [
@@ -371,7 +368,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|█████████████████████████████████████████| 200/200 [02:38<00:00, 1.26it/s]\n"
+      "100%|██████████████████████████████████████████████████████████████████| 200/200 [03:43<00:00, 1.12s/it]\n"
      ]
     }
    ],
@@ -392,7 +389,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
    "id": "330cc30a-b08e-4bf0-bee2-bec0da4208de",
    "metadata": {},
    "outputs": [],
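Both notebooks now read the key from a `config.json` file that sits next to them, using `OPENAI_API_KEY` as the JSON field name. The commit itself does not include that file, so here is a minimal sketch of how it could be created; `"sk-..."` is a placeholder, not a real key, and the file name and key name are taken from the loading code above:

```python
# Sketch only -- not part of this commit. Writes the config.json file
# that both updated notebooks read their OpenAI API key from.
import json

with open("config.json", "w") as config_file:
    # Replace the "sk-..." placeholder with your actual secret key.
    json.dump({"OPENAI_API_KEY": "sk-..."}, config_file, indent=4)
```
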
diff --git a/ch07/03_model-evaluation/llm-instruction-eval-openai.ipynb b/ch07/03_model-evaluation/llm-instruction-eval-openai.ipynb
index 8ae44da4..a7ef0438 100644
--- a/ch07/03_model-evaluation/llm-instruction-eval-openai.ipynb
+++ b/ch07/03_model-evaluation/llm-instruction-eval-openai.ipynb
@@ -57,7 +57,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# pip install -r requirements-exra.txt"
+    "# pip install -r requirements-extra.txt"
    ]
   },
   {
@@ -110,31 +110,28 @@
    "id": "89343a84-0ddc-42fc-bf50-298a342b93c0",
    "metadata": {},
    "source": [
-    "- First, we need to provide our OpenAI API key, which can be found at https://platform.openai.com/api-keys\n",
-    "- Make sure not to share this key with anyone (make sure to delete it from this notebook in case you intend to share it; I recommend deleting the entire notebook cell that contains the key)\n",
-    "- Alternatively, delete the used API key from your account after you are finished to make sure it can't be abused later"
+    "- First, we need to provide our OpenAI API secret key, which can be found at https://platform.openai.com/api-keys\n",
+    "- Make sure not to share this key with anyone\n",
+    "- Add this secret key (`\"sk-...\"`) to the `config.json` file in this folder"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "65b0ba76-1fb1-4306-a7c2-8f3bb637ccdb",
    "metadata": {},
    "outputs": [],
    "source": [
-    "OPENAI_API_KEY = \"Your Open AI API Key\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "26900564-aba7-48ba-8ee8-6cc9a505a25c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "import json\n",
     "from openai import OpenAI\n",
     "\n",
-    "client = OpenAI(api_key=OPENAI_API_KEY)"
+    "# Load API key from a JSON file. \n",
+    "# Make sure to replace \"sk-...\" with your actual API key from https://platform.openai.com/api-keys\n",
+    "with open(\"config.json\", \"r\") as config_file:\n",
+    "    config = json.load(config_file)\n",
+    "    api_key = config[\"OPENAI_API_KEY\"]\n",
+    "\n",
+    "client = OpenAI(api_key=api_key)"
    ]
   },
   {
@@ -147,7 +144,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "id": "08e9ef2e-e816-4283-840e-43625791ad33",
    "metadata": {},
    "outputs": [
@@ -157,7 +154,7 @@
        "'hello world'"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -195,7 +192,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "id": "8b2d393a-aa92-4190-9d44-44326a6f699b",
    "metadata": {},
    "outputs": [
@@ -228,7 +225,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "id": "7222fdc0-5684-4f2b-b741-3e341851359e",
    "metadata": {},
    "outputs": [
@@ -242,7 +239,7 @@
        " 'model 2 response': '\\nThe hypotenuse of the triangle is 12 cm.'}"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -261,7 +258,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
    "id": "43263cd3-e5fb-4ab5-871e-3ad6e7d21a8c",
    "metadata": {},
    "outputs": [],
@@ -289,7 +286,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
    "id": "735cc089-d127-480a-b39d-0782581f0c41",
    "metadata": {},
    "outputs": [
@@ -308,11 +305,15 @@
       "Score:\n",
       ">> The model response \"The hypotenuse of the triangle is 3 cm.\" is incorrect. The correct calculation of the hypotenuse for a right triangle with legs of 6 cm and 8 cm should be done using the Pythagorean theorem, which states that the square of the hypotenuse (c) is equal to the sum of the squares of the other two sides (a and b). Thus, \\( c = \\sqrt{6^2 + 8^2} = \\sqrt{36 + 64} = \\sqrt{100} = 10 \\) cm.\n",
       "\n",
-      "The model response provided a hypotenuse of 3 cm, which is not only incorrect but also mathematically impossible given the lengths of the legs (since 3 cm is less than either leg of the triangle, it cannot be the hypotenuse in a right triangle with these dimensions).\n",
+      "The model response provides a hypotenuse of 3 cm, which is not only numerically incorrect but also logically inconsistent because the hypotenuse is the longest side of a right triangle and cannot be shorter than either of the other two sides (6 cm and 8 cm in this case).\n",
+      "\n",
+      "Given the scale from 0 to 100, where 100 is the best score:\n",
+      "- Accuracy: The response is completely inaccurate.\n",
+      "- Relevance: The response addresses the task of calculating the hypotenuse but fails to do so correctly.\n",
       "\n",
-      "Given the incorrectness and the impossibility of the response, the score would be very low. However, since the response format is correct (stating the hypotenuse is a certain measurement in cm), it does not score absolutely zero.\n",
+      "Score: 0\n",
       "\n",
-      "Score: 10/100. The points are given for maintaining the correct format and units in the response, but the mathematical error is significant and fundamental, leading to a low score.\n",
+      "The score is 0 because the response is factually incorrect and provides misleading information that does not fulfill the task as required.\n",
       "\n",
       "-------------------------\n",
       "\n",
@@ -331,28 +332,22 @@
       "6. Cobra\n",
       "\n",
       "Score:\n",
-      ">> To evaluate the model response against the given instruction, we need to consider the accuracy, relevance, and adherence to the instruction's requirements. The instruction specifically asks for the names of three different animals that are active during the day.\n",
-      "\n",
-      "### Analysis of Model Response:\n",
-      "1. **Relevance and Accuracy**: \n",
-      "   - **Squirrel**: Correct, squirrels are diurnal (active during the day).\n",
-      "   - **Tiger**: Correct, though tigers can be crepuscular (active during dawn and dusk), they are often active during the day as well.\n",
-      "   - **Eagle**: Correct, eagles are generally diurnal.\n",
-      "   - **Cobra**: Incorrect, cobras are generally not active during the day; they are more active during the early and late hours of the day, making them crepuscular.\n",
+      ">> The model response lists six animals, three of which are repeated, and includes animals not specifically known for being diurnal (active during the day). The instruction specifically asked for three different animals that are active during the day. Here's the breakdown:\n",
       "\n",
-      "2. **Adherence to Instruction**:\n",
-      "   - The instruction asked for three different animals. The model response listed six items, which is double the requested amount.\n",
-      "   - The response includes repetitions (Tiger and Cobra are each mentioned twice), which does not align with the instruction to name different animals.\n",
+      "1. **Squirrel** - Correct, squirrels are diurnal.\n",
+      "2. **Tiger** - Generally, tigers are crepuscular (active during dawn and dusk) rather than strictly diurnal, but they can be active during the day, especially in cooler weather.\n",
+      "3. **Eagle** - Correct, eagles are diurnal.\n",
+      "4. **Cobra** - Incorrect, cobras are generally not diurnal; they are more active during the evening and early morning.\n",
+      "5. **Tiger** - Repeated, and as noted, not strictly diurnal.\n",
+      "6. **Cobra** - Repeated and incorrect.\n",
       "\n",
       "### Scoring:\n",
-      "- **Accuracy**: 3/4 entries are accurate in terms of being day-active animals.\n",
-      "- **Relevance**: The response includes more animals than requested and repeats some animals.\n",
-      "- **Adherence to Instruction**: The instruction was to list three different animals, but the response included six entries with repetitions.\n",
-      "\n",
-      "Given these points, the model response partially meets the accuracy requirement but fails significantly in adherence to the instruction's format and specificity. The inclusion of incorrect information (Cobra) and unnecessary repetitions also detracts from the quality of the response.\n",
+      "- **Relevance to the task**: The task was to name three different animals active during the day. The response included two correct diurnal animals (squirrel, eagle) but also included incorrect and repeated entries.\n",
+      "- **Accuracy**: Including animals not known for being diurnal (cobra) and repeating animals reduces the accuracy.\n",
+      "- **Adherence to instruction**: The instruction asked for three animals, but six were provided, with repetitions.\n",
       "\n",
       "### Score: 40/100\n",
-      "This score reflects that while some of the response was accurate, the failure to adhere to the specific number of animals requested, the inclusion of an incorrect animal, and the repetition of animals significantly lower the quality of the response according to the given instruction.\n",
+      "**Reasoning**: The response partially meets the criteria by including some correct animals but fails in terms of accuracy (inclusion of non-diurnal animals), repetition of animals, and not adhering to the instruction of listing only three animals.\n",
       "\n",
       "-------------------------\n",
       "\n",
@@ -364,15 +359,11 @@
       "What is incorrect?\n",
       "\n",
       "Score:\n",
-      ">> The model response \"What is incorrect?\" would score relatively low on the scale for the given task. Here's the breakdown:\n",
-      "\n",
-      "1. **Understanding of Instruction**: The instruction specifically asks for a more formal rewrite of the sentence \"I need to find out what's wrong.\" The model response does not fully capture the original sentence's intent of needing to discover or ascertain the issue. Instead, it poses a direct question about what is incorrect, which changes the nature of the statement from a declaration to an inquiry. This indicates a partial misunderstanding or incomplete execution of the task.\n",
+      ">> The model response \"What is incorrect?\" would score relatively low on the scale for the given task. The original instruction was to rewrite the sentence \"I need to find out what's wrong\" in a more formal way. The correct output provided, \"I must ascertain what is incorrect,\" effectively increases the formality of the original sentence by using more formal vocabulary (\"must\" instead of \"need\" and \"ascertain\" instead of \"find out\") and adjusting the phrasing (\"what is incorrect\" instead of \"what's wrong\").\n",
       "\n",
-      "2. **Formality**: The response does use slightly more formal language by using \"incorrect\" instead of \"wrong.\" However, it lacks the formal structure expected in rewriting the original sentence. The original sentence's intent and structure as a statement of need (\"I need to find out...\") are not preserved.\n",
+      "The model response, however, only addresses part of the sentence and does not maintain the original meaning or structure. It changes the sentence into a question and omits the aspect of needing to discover or investigate the issue, which is a critical component of the original sentence. Additionally, it does not enhance the formality significantly.\n",
       "\n",
-      "3. **Completeness**: The response does not include the aspect of needing to \"find out,\" which is crucial to the original sentence. It merely asks what is incorrect, without indicating the necessity or process of discovery.\n",
-      "\n",
-      "Given these points, the response would score around **30 out of 100**. It recognizes the need for more formal language but fails to accurately and completely transform the original sentence while maintaining its intent and structure.\n",
+      "Given these considerations, I would score the model response around 20 out of 100. It recognizes the need to adjust the formality slightly but fails to maintain the original sentence's intent and structure, and does not fully meet the requirement of rewriting the sentence in a more formal way.\n",
       "\n",
       "-------------------------\n",
       "\n",
@@ -384,11 +375,7 @@
       "The interjection in the sentence is 'Wow'.\n",
       "\n",
       "Score:\n",
-      ">> The model response `The interjection in the sentence is 'Wow'.` accurately identifies the interjection in the given sentence. The response is clear, directly addresses the instruction, and correctly identifies \"Wow\" as the interjection, which is used to express surprise or admiration, fitting the context of the sentence provided.\n",
-      "\n",
-      "Score: 100/100\n",
-      "\n",
-      "The response fully meets the requirements of the task and correctly answers the question without any errors or omissions.\n",
+      ">> The model response `The interjection in the sentence is 'Wow'.` accurately identifies the interjection in the provided sentence. The response is clear, directly addresses the instruction, and correctly identifies \"Wow\" as the interjection. Therefore, the response should be scored 100 out of 100.\n",
       "\n",
       "-------------------------\n",
       "\n",
@@ -402,11 +389,11 @@
       "Score:\n",
       ">> The model response \"The type of sentence is exclamatory.\" is incorrect. The correct type of the sentence \"Did you finish the report?\" is interrogative, as it is a question. An exclamatory sentence would express strong emotion and typically ends with an exclamation mark.\n",
       "\n",
-      "Given the incorrect identification of the sentence type, the score for the model response should be low. However, the response does correctly identify a type of sentence, just not the correct one for the given input. Therefore, it shows some understanding of sentence types but fails in accurate application.\n",
+      "Given the incorrect identification of the sentence type, the score for the model response should be low. However, the response does correctly format the answer by stating \"The type of sentence is...\" which shows an understanding of the task's requirements but fails in accuracy.\n",
       "\n",
-      "Score: 20/100\n",
+      "Score: 10/100\n",
       "\n",
-      "This score reflects that the response is on topic (discussing sentence types) but incorrect in its specific application to the provided sentence.\n",
+      "The score reflects that the response is well-structured but fundamentally incorrect in identifying the sentence type.\n",
       "\n",
       "-------------------------\n"
      ]
@@ -438,29 +425,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
    "id": "3552bdfb-7511-42ac-a9ec-da672e2a5468",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[0, 50, 20, 100, 0]\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "def generate_model_scores(json_data, json_key):\n",
+    "from tqdm import tqdm\n",
     "\n",
+    "def generate_model_scores(json_data, json_key, client):\n",
     "    scores = []\n",
-    "    for entry in json_data:\n",
-    "        \n",
-    "        prompt = (f\"Given the input `{format_input(entry)}` \"\n",
-    "                  f\"and correct output `{entry['output']}`, \"\n",
-    "                  f\"score the model response `{entry[json_key]}`\"\n",
-    "                  f\" on a scale from 0 to 100, where 100 is the best score. \"\n",
-    "                  f\"Respond with the number only.\"\n",
+    "    for entry in tqdm(json_data, desc=\"Scoring entries\"):\n",
+    "        prompt = (\n",
+    "            f\"Given the input `{format_input(entry)}` \"\n",
+    "            f\"and correct output `{entry['output']}`, \"\n",
+    "            f\"score the model response `{entry[json_key]}`\"\n",
+    "            f\" on a scale from 0 to 100, where 100 is the best score. \"\n",
+    "            f\"Respond with the number only.\"\n",
     "        )\n",
    "        score = run_chatgpt(prompt, client)\n",
    "        try:\n",
@@ -468,9 +448,7 @@
    "        except:\n",
    "            continue\n",
    "\n",
-    "    return scores\n",
-    "\n",
-    "print(generate_model_scores(json_data[:5], \"model 1 response\"))"
+    "    return scores"
    ]
   },
   {
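A side note on the reworked `generate_model_scores` above: `int(score)` raises on any reply that is not a bare integer, and the `except` branch then drops that entry silently. The prompt does end with "Respond with the number only," but if occasional verbose replies such as "Score: 85" should be salvaged rather than skipped, a looser parse is possible. A minimal sketch, where `extract_score` is a hypothetical helper and not part of this commit:

```python
import re

def extract_score(reply):
    # Pull the first integer out of a reply such as "85" or "Score: 85/100".
    # Returns None when the reply contains no digits at all.
    match = re.search(r"\d+", reply)
    return int(match.group(0)) if match else None
```
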
@@ -491,10 +469,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
    "id": "4f700d4b-19e5-4404-afa7-b0f093024232",
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Scoring entries: 100%|█████████████████████████████████████████████████| 100/100 [01:09<00:00, 1.44it/s]\n"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "\n",
       "model 1 response\n",
       "Number of scores: 100 of 100\n",
-      "Average score: 73.54\n",
+      "Average score: 74.04\n",
+      "\n"
      ]
     },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Scoring entries: 100%|█████████████████████████████████████████████████| 100/100 [01:08<00:00, 1.46it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
       "model 2 response\n",
       "Number of scores: 100 of 100\n",
-      "Average score: 56.52\n"
+      "Average score: 56.72\n",
+      "\n"
      ]
     },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
    ],
    "source": [
     "for model in (\"model 1 response\", \"model 2 response\"):\n",
     "\n",
-    "    scores = generate_model_scores(json_data, model)\n",
+    "    scores = generate_model_scores(json_data, model, client)\n",
     "    print(f\"\\n{model}\")\n",
     "    print(f\"Number of scores: {len(scores)} of {len(json_data)}\")\n",
-    "    print(f\"Average score: {sum(scores)/len(scores):.2f}\")"
+    "    print(f\"Average score: {sum(scores)/len(scores):.2f}\\n\")"
    ]
   },
   {