From a6eff7f94b237c87822e05d90b2f8d45a3463254 Mon Sep 17 00:00:00 2001 From: Ryan Lempka Date: Mon, 10 Nov 2025 23:04:34 +0000 Subject: [PATCH] feat: enhance evaluation notebook fine tuning --- .../embedding-finetuning/3_evaluation.ipynb | 244 +++--------------- 1 file changed, 37 insertions(+), 207 deletions(-) diff --git a/nemo/data-flywheel/embedding-finetuning/3_evaluation.ipynb b/nemo/data-flywheel/embedding-finetuning/3_evaluation.ipynb index 7640283a0..0322946a6 100644 --- a/nemo/data-flywheel/embedding-finetuning/3_evaluation.ipynb +++ b/nemo/data-flywheel/embedding-finetuning/3_evaluation.ipynb @@ -15,13 +15,14 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "1fb04eb2", "metadata": {}, "outputs": [], "source": [ "from time import sleep, time\n", - "from nemo_microservices import NeMoMicroservices" + "from nemo_microservices import NeMoMicroservices\n", + "from config import *" ] }, { @@ -55,18 +56,16 @@ "id": "341d85f8", "metadata": {}, "source": [ - "The following code imports necessary configurations and prints the endpoints for the NeMo Data Store, Entity Store, Customizer, Evaluator, and NIM, as well as the namespace and base model." + "The following code imports necessary configurations for the NeMo Data Store, Entity Store, Customizer, Evaluator, and NIM, as well as the namespace and base model." ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "de0f4fd6", "metadata": {}, "outputs": [], "source": [ - "from config import *\n", - "\n", "# Initialize NeMo Microservices SDK client\n", "nemo_client = NeMoMicroservices(\n", " base_url=NEMO_URL,\n", @@ -84,14 +83,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "63f9b5f8", "metadata": {}, "outputs": [], "source": [ "EMBEDDING_MODEL_NAME = f\"{NMS_NAMESPACE}/{OUTPUT_MODEL_NAME_EMBEDDING}\" # update this if you used a different name\n", "\n", - "# Check if the embedding model is hosted by NVIDIA NIM\n", + "# Check if the embedding model is running locally as an NVIDIA NIM (pod in your cluster)\n", "models = nemo_client.inference.models.list()\n", "model_names = [model.id for model in models.data]\n", "\n", @@ -110,7 +109,7 @@ "\n", "For the purposes of showcasing zero-shot generalization, we will run the `SciDocs` benchmark from the [Benchmarking Information Retrieval (BEIR)](https://github.com/beir-cellar/beir) benchmark suite.\n", "\n", - "We choose the `SciDocs` benchmark because its core purpose is to assess a model's ability to find and retrieve a scientific paper that should be cited by another given paper. While this benchmark data has differences from the `SPECTER` dataset that we trained (such as the length of the passages), it is roughly in-domain of scientific data." + "We choose the `SciDocs` benchmark because its core purpose is to assess a model's ability to find and retrieve a scientific paper that should be cited by another given paper. While this benchmark data has differences from the `SPECTER` dataset we used for training (such as the length of the passages), it remains within the scientific domain and serves as a good test of the model's generalization capabilities." 
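For readers who want to sanity-check the SciDocs numbers outside of NeMo Evaluator, the same benchmark can also be run locally with the open-source `beir` package pointed at the deployed embedding NIM. The sketch below is illustrative only and is not part of the notebook: the `NIMEmbedder` wrapper is a hypothetical helper, the `input_type` and `truncate` request fields are assumptions about the NIM's OpenAI-style `/v1/embeddings` payload, the dataset URL is the public BEIR mirror, and `EMBEDDING_URL` / `EMBEDDING_MODEL_NAME` are the variables defined in this notebook.

```python
# Illustrative sketch (not part of the notebook): run BEIR SciDocs locally against the
# embedding NIM. Assumes `pip install beir requests numpy` and that EMBEDDING_URL /
# EMBEDDING_MODEL_NAME match the values configured in this notebook.
import numpy as np
import requests
from beir import util
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES


class NIMEmbedder:
    """Minimal BEIR-compatible model that calls the NIM /v1/embeddings endpoint."""

    def __init__(self, url: str, model: str, batch_size: int = 32):
        self.url, self.model, self.batch_size = url, model, batch_size

    def _embed(self, texts, input_type):
        vectors = []
        for i in range(0, len(texts), self.batch_size):
            resp = requests.post(
                self.url,
                json={
                    "model": self.model,
                    "input": texts[i : i + self.batch_size],
                    # Assumed NIM-specific fields: retrieval models distinguish query vs.
                    # passage inputs, and long passages are truncated server-side.
                    "input_type": input_type,
                    "truncate": "END",
                },
                timeout=120,
            )
            resp.raise_for_status()
            vectors.extend(item["embedding"] for item in resp.json()["data"])
        return np.asarray(vectors)

    # The two methods BEIR's dense retriever expects.
    def encode_queries(self, queries, batch_size=None, **kwargs):
        return self._embed(queries, "query")

    def encode_corpus(self, corpus, batch_size=None, **kwargs):
        texts = [(doc.get("title", "") + " " + doc["text"]).strip() for doc in corpus]
        return self._embed(texts, "passage")


# Download SciDocs and score it at the same cutoffs used by the evaluation job below.
data_path = util.download_and_unzip(
    "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/scidocs.zip",
    "datasets",
)
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

retriever = EvaluateRetrieval(
    DRES(NIMEmbedder(EMBEDDING_URL, EMBEDDING_MODEL_NAME)),
    score_function="cos_sim",
    k_values=[5, 10],
)
results = retriever.retrieve(corpus, queries)
ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
print(ndcg, recall)
```

BEIR reports its metrics keyed as `NDCG@5`, `Recall@10`, and so on, which correspond to the `ndcg_cut_*` and `recall_*` metrics returned by NeMo Evaluator in the cells that follow.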
] }, { @@ -125,19 +124,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "2d476a88", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Embedding URL: http://nim.test/v1/embeddings\n", - "Embedding Model Name: embed-sft-ns/fullweight_sft_embedding\n" - ] - } - ], + "outputs": [], "source": [ "EMBEDDING_URL = f\"{NIM_URL}/v1/embeddings\"\n", "\n", @@ -194,7 +184,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "b394267c", "metadata": {}, "outputs": [], @@ -243,18 +233,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "ca284d00", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Evaluation job created: eval-UyoRebr4tEEZnK61MgGs1W\n" - ] - } - ], + "outputs": [], "source": [ "# Create evaluation job for the base model\n", "eval_job = nemo_client.evaluation.jobs.create(\n", @@ -279,13 +261,11 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "046082a8", "metadata": {}, "outputs": [], "source": [ - "from time import sleep, time\n", - "\n", "def wait_eval_job(nemo_client, job_id: str, polling_interval: int = 10, timeout: int = 6000):\n", " \"\"\"Helper for waiting an eval job.\"\"\"\n", " start_time = time()\n", @@ -318,119 +298,10 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "3ac11757", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Job status: running after 5.04 seconds. Progress: 0%\n", - "Job status: running after 10.05 seconds. Progress: 0%\n", - "Job status: running after 15.07 seconds. Progress: 0%\n", - "Job status: running after 20.08 seconds. Progress: 0%\n", - "Job status: running after 25.10 seconds. Progress: 0%\n", - "Job status: running after 30.11 seconds. Progress: 0%\n", - "Job status: running after 35.13 seconds. Progress: 0%\n", - "Job status: running after 40.14 seconds. Progress: 0%\n", - "Job status: running after 45.15 seconds. Progress: 0%\n", - "Job status: running after 50.17 seconds. Progress: 0%\n", - "Job status: running after 55.18 seconds. Progress: 0%\n", - "Job status: running after 60.19 seconds. Progress: 0%\n", - "Job status: running after 65.21 seconds. Progress: 0%\n", - "Job status: running after 70.22 seconds. Progress: 0%\n", - "Job status: running after 75.24 seconds. Progress: 0%\n", - "Job status: running after 80.25 seconds. Progress: 0%\n", - "Job status: running after 85.27 seconds. Progress: 0%\n", - "Job status: running after 90.28 seconds. Progress: 0%\n", - "Job status: running after 95.29 seconds. Progress: 0%\n", - "Job status: running after 100.31 seconds. Progress: 0%\n", - "Job status: running after 105.32 seconds. Progress: 0%\n", - "Job status: running after 110.34 seconds. Progress: 0%\n", - "Job status: running after 115.35 seconds. Progress: 0%\n", - "Job status: running after 120.36 seconds. Progress: 0%\n", - "Job status: running after 125.38 seconds. Progress: 0%\n", - "Job status: running after 130.39 seconds. Progress: 0%\n", - "Job status: running after 135.41 seconds. Progress: 0%\n", - "Job status: running after 140.42 seconds. Progress: 0%\n", - "Job status: running after 145.43 seconds. Progress: 0%\n", - "Job status: running after 150.45 seconds. Progress: 0%\n", - "Job status: running after 155.46 seconds. Progress: 0%\n", - "Job status: running after 160.48 seconds. 
Progress: 0%\n", - "Job status: running after 165.49 seconds. Progress: 0%\n", - "Job status: running after 170.51 seconds. Progress: 0%\n", - "Job status: running after 175.52 seconds. Progress: 0%\n", - "Job status: running after 180.54 seconds. Progress: 0%\n", - "Job status: running after 185.55 seconds. Progress: 0%\n", - "Job status: running after 190.56 seconds. Progress: 0%\n", - "Job status: running after 195.58 seconds. Progress: 0%\n", - "Job status: running after 200.59 seconds. Progress: 0%\n", - "Job status: running after 205.61 seconds. Progress: 0%\n", - "Job status: running after 210.62 seconds. Progress: 0%\n", - "Job status: running after 215.63 seconds. Progress: 0%\n", - "Job status: running after 220.65 seconds. Progress: 0%\n", - "Job status: running after 225.67 seconds. Progress: 0%\n", - "Job status: running after 230.68 seconds. Progress: 0%\n", - "Job status: running after 235.70 seconds. Progress: 0%\n", - "Job status: running after 240.71 seconds. Progress: 0%\n", - "Job status: running after 245.73 seconds. Progress: 0%\n", - "Job status: running after 250.74 seconds. Progress: 0%\n", - "Job status: running after 255.75 seconds. Progress: 0%\n", - "Job status: running after 260.77 seconds. Progress: 0%\n", - "Job status: running after 265.78 seconds. Progress: 0%\n", - "Job status: running after 270.79 seconds. Progress: 0%\n", - "Job status: running after 275.81 seconds. Progress: 0%\n", - "Job status: running after 280.82 seconds. Progress: 0%\n", - "Job status: running after 285.84 seconds. Progress: 0%\n", - "Job status: running after 290.85 seconds. Progress: 0%\n", - "Job status: running after 295.87 seconds. Progress: 0%\n", - "Job status: running after 300.89 seconds. Progress: 0%\n", - "Job status: running after 305.90 seconds. Progress: 0%\n", - "Job status: running after 310.92 seconds. Progress: 0%\n", - "Job status: running after 315.93 seconds. Progress: 0%\n", - "Job status: running after 320.95 seconds. Progress: 0%\n", - "Job status: running after 325.96 seconds. Progress: 0%\n", - "Job status: running after 330.97 seconds. Progress: 0%\n", - "Job status: running after 335.98 seconds. Progress: 0%\n", - "Job status: running after 341.00 seconds. Progress: 0%\n", - "Job status: running after 346.01 seconds. Progress: 0%\n", - "Job status: running after 351.03 seconds. Progress: 0%\n", - "Job status: running after 356.04 seconds. Progress: 0%\n", - "Job status: running after 361.05 seconds. Progress: 0%\n", - "Job status: running after 366.07 seconds. Progress: 0%\n", - "Job status: running after 371.08 seconds. Progress: 0%\n", - "Job status: running after 376.10 seconds. Progress: 0%\n", - "Job status: running after 381.11 seconds. Progress: 0%\n", - "Job status: running after 386.13 seconds. Progress: 0%\n", - "Job status: running after 391.14 seconds. Progress: 0%\n", - "Job status: running after 396.15 seconds. Progress: 0%\n", - "Job status: running after 401.17 seconds. Progress: 0%\n", - "Job status: running after 406.18 seconds. Progress: 0%\n", - "Job status: running after 411.20 seconds. Progress: 0%\n", - "Job status: running after 416.21 seconds. Progress: 0%\n", - "Job status: running after 421.23 seconds. Progress: 0%\n", - "Job status: running after 426.24 seconds. Progress: 0%\n", - "Job status: running after 431.26 seconds. Progress: 0%\n", - "Job status: running after 436.28 seconds. Progress: 0%\n", - "Job status: running after 441.29 seconds. Progress: 0%\n", - "Job status: running after 446.31 seconds. 
Progress: 0%\n", - "Job status: running after 451.32 seconds. Progress: 0%\n", - "Job status: running after 456.34 seconds. Progress: 0%\n", - "Job status: running after 461.35 seconds. Progress: 0%\n", - "Job status: running after 466.36 seconds. Progress: 0%\n", - "Job status: running after 471.38 seconds. Progress: 0%\n", - "Job status: running after 476.39 seconds. Progress: 0%\n", - "Job status: running after 481.41 seconds. Progress: 0%\n", - "Job status: running after 486.42 seconds. Progress: 0%\n", - "Job status: running after 491.43 seconds. Progress: 0%\n", - "Job status: running after 496.45 seconds. Progress: 0%\n", - "Job status: running after 501.46 seconds. Progress: 0%\n", - "Job status: running after 506.47 seconds. Progress: 0%\n", - "Job status: completed after 511.49 seconds. Progress: 100%\n" - ] - } - ], + "outputs": [], "source": [ "job = wait_eval_job(nemo_client, eval_job.id, polling_interval=5, timeout=5000)" ] @@ -445,64 +316,10 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "b0ad1a0e", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"job\": \"eval-UyoRebr4tEEZnK61MgGs1W\",\n", - " \"id\": \"evaluation_result-AjnxBhHBxyXEJzUFDjJfV6\",\n", - " \"created_at\": \"2025-07-29T17:59:19.601878\",\n", - " \"custom_fields\": {},\n", - " \"files_url\": \"hf://datasets/evaluation-results/eval-UyoRebr4tEEZnK61MgGs1W\",\n", - " \"namespace\": \"default\",\n", - " \"tasks\": {\n", - " \"my-beir-task\": {\n", - " \"metrics\": {\n", - " \"retriever.ndcg_cut_10\": {\n", - " \"scores\": {\n", - " \"ndcg_cut_10\": {\n", - " \"value\": 0.2392602974177994,\n", - " \"stats\": {}\n", - " }\n", - " }\n", - " },\n", - " \"retriever.ndcg_cut_5\": {\n", - " \"scores\": {\n", - " \"ndcg_cut_5\": {\n", - " \"value\": 0.1973427260255094,\n", - " \"stats\": {}\n", - " }\n", - " }\n", - " },\n", - " \"retriever.recall_10\": {\n", - " \"scores\": {\n", - " \"recall_10\": {\n", - " \"value\": 0.25328333333333275,\n", - " \"stats\": {}\n", - " }\n", - " }\n", - " },\n", - " \"retriever.recall_5\": {\n", - " \"scores\": {\n", - " \"recall_5\": {\n", - " \"value\": 0.17646666666666633,\n", - " \"stats\": {}\n", - " }\n", - " }\n", - " }\n", - " }\n", - " }\n", - " },\n", - " \"updated_at\": \"2025-07-29T17:59:19.601881\"\n", - "}\n" - ] - } - ], + "outputs": [], "source": [ "results = nemo_client.evaluation.jobs.results(job_id=eval_job.id)\n", "print(results.model_dump_json(indent=2, exclude_unset=True))" @@ -510,20 +327,33 @@ }, { "cell_type": "markdown", - "id": "cf73bb97", + "id": "3cc76a48", "metadata": {}, "source": [ - "After a single short finetuning run, we see a `recall@5` score of around `0.176`. Note that the `SciDocs` task is considered a challenging zero-shot benchmark, with the SOTA for recall@5 close to the `0.2` mark. \n", + "---\n", + "\n", + "## Next Steps\n", + "\n", + "✅ **Completed in this notebook:**\n", + "- Configured evaluation target for the fine-tuned embedding model\n", + "- Created evaluation configuration for the BEIR SciDocs benchmark\n", + "- Ran evaluation job measuring recall@5, recall@10, NDCG@5, and NDCG@10 metrics\n", + "- Analyzed results: recall@5 improved from approximately 0.159 (baseline `nvidia/llama-3_2-nv-embedqa-1b-v2`) to 0.176\n", + "\n", + "**What you've achieved:**\n", "\n", - "For comparison, the baseline model (`nvidia/llama-3_2-nv-embedqa-1b-v2`) we used had a recall@5 score of around `0.159` on this task. 
If interested in scoring the base model used for finetuning, you can either deploy the NIM yourself, or point to managed endpoints at [build.nvidia.com](https://build.nvidia.com/nvidia/llama-3_2-nv-embedqa-1b-v2) in your target configuration.\n", + "Through this three-part tutorial series, you've completed the full embedding fine-tuning workflow: prepared domain-specific training data, fine-tuned `nvidia/llama-3.2-nv-embedqa-1b-v2` for improved scientific retrieval, deployed your custom model as a NIM, and evaluated performance on the challenging SciDocs zero-shot benchmark.\n", "\n", - "With a quick finetuning run, we were able to further boost the score over `nvidia/llama-3_2-nv-embedqa-1b-v2` what is already an excellent starting point. " + "**Next:**\n", + "- Explore other [NeMo Microservices tutorials](../../../README.md) for LLM customization, RAG evaluation, and guardrails\n", + "- Visit the [NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/latest/about/index.html) to learn more about advanced features\n", + "- Apply these techniques to your own domain-specific datasets for even better retrieval quality\n" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "nemo_env", "language": "python", "name": "python3" }, @@ -537,7 +367,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.12" + "version": "3.13.7" } }, "nbformat": 4,
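The recall@5 comparison cited in the new Next Steps cell (roughly 0.159 for the baseline versus 0.176 after fine-tuning) can also be computed programmatically from the Evaluator response rather than read off the printed JSON. The following sketch assumes the result schema and the `my-beir-task` task name shown by the results cell in this notebook; the baseline value is simply the figure quoted in the notebook text, not something returned by the job.

```python
# Illustrative sketch: pull recall@5 and NDCG@10 out of the evaluation results object and
# compare against the baseline figure quoted in the notebook. Assumes the metric layout
# printed by the results cell (tasks -> my-beir-task -> metrics -> retriever.* -> scores).
results = nemo_client.evaluation.jobs.results(job_id=eval_job.id)
payload = results.model_dump()  # plain dict mirroring the JSON printed in the notebook

metrics = payload["tasks"]["my-beir-task"]["metrics"]
recall_5 = metrics["retriever.recall_5"]["scores"]["recall_5"]["value"]
ndcg_10 = metrics["retriever.ndcg_cut_10"]["scores"]["ndcg_cut_10"]["value"]

BASELINE_RECALL_5 = 0.159  # nvidia/llama-3_2-nv-embedqa-1b-v2 score quoted in the notebook
print(
    f"recall@5: {recall_5:.3f} (baseline {BASELINE_RECALL_5:.3f}, "
    f"delta {recall_5 - BASELINE_RECALL_5:+.3f}); ndcg@10: {ndcg_10:.3f}"
)
```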