2 changes: 1 addition & 1 deletion packages/nemo-evaluator/pyproject.toml
@@ -65,7 +65,7 @@ repository = "https://github.com/NVIDIA-NeMo/Evaluator/packages/nemo-evaluator"
# END(if-changed)

[dependency-groups]
test = ["pytest", "pytest-cov", "pytest-subtests", "pytest-httpserver"]
test = ["pytest", "pytest-cov", "pytest-subtests", "pytest-httpserver", "nvidia-simple-evals"]

docs = [
"sphinx",
@@ -14,6 +14,10 @@
set -xeuo pipefail # Exit immediately if a command exits with a non-zero status

export CUDA_VISIBLE_DEVICES=""
export HF_HOME="/home/TestData/HF_HOME"
export HF_DATASETS_OFFLINE="1"
export TRANSFORMERS_OFFLINE="1"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"

SCRIPT_DIR=$(dirname "$0")
PROJECT_DIR=$SCRIPT_DIR/../../
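For context on the exports above (a sketch, not part of the diff): `HF_DATASETS_OFFLINE=1` and `TRANSFORMERS_OFFLINE=1` stop the Hugging Face libraries from reaching the network, so anything the tests touch must already be cached under `HF_HOME`. Roughly, assuming the `datasets` package is installed and using a dataset name chosen only for illustration:

```python
# Illustration only: what the offline flags exported above mean in practice.
import os

# Normally exported by the launcher; set here before importing `datasets`,
# which reads the flag at import time.
os.environ["HF_DATASETS_OFFLINE"] = "1"

from datasets import load_dataset  # noqa: E402

# With offline mode on, load_dataset() serves the data from the local cache
# under ${HF_HOME}/datasets and raises instead of attempting a download.
dataset = load_dataset("gsm8k", "main", split="test")
print(len(dataset))
```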
@@ -23,6 +23,7 @@
import pytest
import requests
from flask import Flask, jsonify, request

from nemo_evaluator.logging.utils import logger


@@ -15,6 +15,7 @@
"""End-to-end tests for eval-factory using fake endpoint."""

import pytest

from nemo_evaluator.logging.utils import logger


@@ -27,6 +27,7 @@
from pathlib import Path

import pytest

from nemo_evaluator.logging.utils import logger


@@ -132,7 +133,7 @@ def test_core_interceptor_chain(self, fake_endpoint, fake_url):
]

logger.info(f"Testing core interceptors runtime behavior: {' '.join(cmd)}")
subprocess.run(cmd, capture_output=True, text=True, env=env, timeout=60)
subprocess.run(cmd, capture_output=False, text=True, env=env, timeout=60)

# Check ONLY runtime behavior - what the interceptors actually DO
log_dir = Path(self.test_log_dir)
@@ -242,7 +243,7 @@ def test_comprehensive_interceptor_chain(self, fake_endpoint, fake_url):

logger.info(f"Testing all interceptors runtime behavior: {' '.join(cmd)}")
result = subprocess.run(
cmd, capture_output=True, text=True, env=env, timeout=60
cmd, capture_output=False, text=True, env=env, timeout=60
)
logger.info("Finished the subprocess", result=result)

@@ -438,7 +439,7 @@ def test_comprehensive_interceptor_chain(self, fake_endpoint, fake_url):
)

result2 = subprocess.run(
cmd2, capture_output=True, text=True, env=env, timeout=60
cmd2, capture_output=False, text=True, env=env, timeout=60
)
logger.info("Finished the second subprocess run", result2=result2)
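For context (a sketch, not part of the diff): the `capture_output=True -> False` flips above change where the eval-factory subprocess output goes. With `True` the child's stdout/stderr are buffered on the returned object and stay invisible unless the test logs them; with `False` the child inherits the test runner's streams, so the output shows up live in the pytest/CI log.

```python
# Illustration only: the practical difference behind the capture_output change.
import subprocess

# capture_output=True buffers the child's output; nothing reaches the CI log
# unless the caller prints result.stdout / result.stderr afterwards.
buffered = subprocess.run(["echo", "hello"], capture_output=True, text=True)
print(buffered.stdout)  # -> "hello"

# capture_output=False (the new value) inherits the parent's streams, so the
# child's output appears directly in the test log; nothing is captured.
streamed = subprocess.run(["echo", "hello"], capture_output=False, text=True)
assert streamed.stdout is None and streamed.stderr is None
```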

@@ -32,32 +32,10 @@ def n():


@pytest.fixture(autouse=True)
def installed_modules(n: int):
def installed_modules(n: int, monkeypatch):
if not n:
# Temporarily uninstall nvidia-simple-evals to make it unavailable
import subprocess

try:
subprocess.run(
[sys.executable, "-m", "pip", "uninstall", "-y", "nvidia-simple-evals"],
capture_output=True,
check=False,
)
except Exception:
pass # Ignore any errors during uninstall

monkeypatch.setitem(sys.modules, "core_evals", None)
yield

# Reinstall the package
try:
subprocess.run(
[sys.executable, "-m", "pip", "install", "nvidia-simple-evals"],
capture_output=True,
check=False,
)
except Exception:
pass # Ignore any errors during reinstall

return

pkg = "core_evals"
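For context (a sketch, not part of the diff; the fixture and test names here are hypothetical): the rewritten fixture relies on `monkeypatch.setitem(sys.modules, "core_evals", None)` instead of pip-uninstalling `nvidia-simple-evals`. Putting `None` into `sys.modules` makes any later `import core_evals` raise `ImportError`, and pytest's `monkeypatch` restores the original entry on teardown, so no reinstall step is needed.

```python
# Illustration only: simulating an uninstalled package via sys.modules.
import sys

import pytest


@pytest.fixture
def hide_core_evals(monkeypatch):
    # A None entry in sys.modules makes the import system treat the package
    # as unimportable: `import core_evals` raises ImportError.
    monkeypatch.setitem(sys.modules, "core_evals", None)
    yield
    # monkeypatch automatically restores the previous sys.modules entry here.


def test_core_evals_unavailable(hide_core_evals):
    with pytest.raises(ImportError):
        import core_evals  # noqa: F401
```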
@@ -14,6 +14,10 @@
set -xeuo pipefail # Exit immediately if a command exits with a non-zero status

export CUDA_VISIBLE_DEVICES=""
export HF_HOME="/home/TestData/HF_HOME"
export HF_DATASETS_OFFLINE="1"
export TRANSFORMERS_OFFLINE="1"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"

SCRIPT_DIR=$(dirname "$0")
PROJECT_DIR=$SCRIPT_DIR/../../
1,425 changes: 1,421 additions & 4 deletions packages/nemo-evaluator/uv.lock

Large diffs are not rendered by default.

38 changes: 38 additions & 0 deletions tests/integration_tests/Launch_Integration_Tests.sh
@@ -0,0 +1,38 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -xeuo pipefail # Exit immediately if a command exits with a non-zero status

export CUDA_VISIBLE_DEVICES="0"
export HF_HOME="/home/TestData/HF_HOME"
export HF_DATASETS_OFFLINE="1"
export TRANSFORMERS_OFFLINE="1"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"

mkdir -p /checkpoints && \
ln -s /home/TestData/nemo2_ckpt/llama-3_2-1b-instruct_v2.0 /checkpoints/llama-3_2-1b-instruct_v2.0


SCRIPT_DIR=$(dirname "$0")
PROJECT_DIR=$SCRIPT_DIR/../../
cd $PROJECT_DIR

coverage run \
--data-file=.coverage.integration_tests \
--source=src/ \
-m pytest \
-o log_cli=true \
-o log_cli_level=INFO \
-m "not pleasefixme" \
tests/integration_tests
coverage combine -q
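For context on the `-m "not pleasefixme"` expression above (a sketch, not part of the diff; the marker registration is assumed to live in the repo's pytest configuration): marking a known-broken test with `pleasefixme` keeps it in the tree while the launcher's marker filter deselects it.

```python
# Illustration only: how the launcher's `-m "not pleasefixme"` filter works.
import pytest


@pytest.mark.pleasefixme  # deselected by `pytest -m "not pleasefixme"`
def test_known_broken():
    raise AssertionError("tracked breakage; skipped in CI via the marker filter")


def test_still_collected_and_run():
    assert True
```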
135 changes: 135 additions & 0 deletions tests/integration_tests/nemo_fw/test_hf_ray.py
@@ -0,0 +1,135 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import signal
import subprocess

import pytest
from nemo_evaluator.api import check_endpoint, evaluate
from nemo_evaluator.api.api_dataclasses import (
ApiEndpoint,
ConfigParams,
EvaluationConfig,
EvaluationResult,
EvaluationTarget,
)

# FIXME(martas): EF packages pre 25.09 use old imports from nvidia_eval_commons
from nvidia_eval_commons.api.api_dataclasses import (
EvaluationResult as LegacyEvaluationResult,
)

logger = logging.getLogger(__name__)


@pytest.fixture(scope="module", autouse=True)
def deployment_process():
"""Fixture to deploy the HF model on Ray server."""
hf_model_id = "meta-llama/Llama-3.2-1B-Instruct"
model_name = "hf_model"
port = 8886
# Run deployment
deploy_proc = subprocess.Popen(
[
"python",
"/opt/Export-Deploy/scripts/deploy/nlp/deploy_ray_hf.py",
"--model_path",
hf_model_id,
"--model_id",
model_name,
"--port",
str(port),
]
)

completions_ready = check_endpoint(
endpoint_url=f"http://0.0.0.0:{port}/v1/completions/",
endpoint_type="completions",
model_name=model_name,
max_retries=600,
)
assert completions_ready, (
"Completions endpoint is not ready. Please look at the deploy process log for the error"
)

chat_ready = check_endpoint(
endpoint_url=f"http://0.0.0.0:{port}/v1/chat/completions",
endpoint_type="chat",
model_name=model_name,
max_retries=600,
)
assert chat_ready, (
"Chat endpoint is not ready. Please look at the deploy process log for the error"
)

yield deploy_proc # We only need the process reference for cleanup

deploy_proc.send_signal(signal.SIGINT)
try:
deploy_proc.wait(timeout=30)
except subprocess.TimeoutExpired:
try:
os.killpg(deploy_proc.pid, signal.SIGKILL)
except ProcessLookupError:
pass
finally:
subprocess.run(["pkill", f"-{signal.SIGTERM}", "tritonserver"], check=False)


@pytest.mark.run_only_on("GPU")
@pytest.mark.parametrize(
"eval_type,endpoint_type,eval_params",
[
("gsm8k", "completions", {"limit_samples": 1, "request_timeout": 360}),
# TODO(martas): enable once logprobs are supported
# (
# "arc_challenge",
# "completions",
# {
# "limit_samples": 1,
# "request_timeout": 360,
# "extra": {
# "tokenizer_backend": "huggingface",
# "tokenizer": "meta-llama/Llama-3.2-1B-Instruct",
# },
# },
# ),
("ifeval", "chat", {"limit_samples": 1, "request_timeout": 360}),
],
)
def test_evaluation(eval_type, endpoint_type, eval_params, tmp_path):
"""
Test evaluation of an HF model deployed on a Ray server.
"""
port = 8886
if endpoint_type == "completions":
url = f"http://0.0.0.0:{port}/v1/completions/"
elif endpoint_type == "chat":
url = f"http://0.0.0.0:{port}/v1/chat/completions"
else:
raise ValueError(f"Invalid endpoint type: {endpoint_type}")

api_endpoint = ApiEndpoint(url=url, type=endpoint_type, model_id="hf_model")
eval_target = EvaluationTarget(api_endpoint=api_endpoint)
eval_config = EvaluationConfig(
type=eval_type, params=ConfigParams(**eval_params), output_dir=str(tmp_path)
)
results = evaluate(target_cfg=eval_target, eval_cfg=eval_config)
# FIXME(martas): EF packages pre 25.09 use old imports from nvidia_eval_commons
assert isinstance(results, EvaluationResult) or isinstance(
results, LegacyEvaluationResult
)
logger.info("Evaluation completed.")
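For readers who want to poke at this outside pytest, a minimal standalone sketch of the same `check_endpoint`/`evaluate` pattern the test uses. Assumptions: an OpenAI-compatible chat endpoint is already serving the `hf_model` name on port 8886 (as the fixture above sets up), a writable output directory, and a small `max_retries` just to fail fast.

```python
# Standalone sketch (not part of the PR): the check_endpoint/evaluate pattern
# from the test above, run against an already-deployed endpoint.
from nemo_evaluator.api import check_endpoint, evaluate
from nemo_evaluator.api.api_dataclasses import (
    ApiEndpoint,
    ConfigParams,
    EvaluationConfig,
    EvaluationTarget,
)

url = "http://0.0.0.0:8886/v1/chat/completions"
if not check_endpoint(
    endpoint_url=url, endpoint_type="chat", model_name="hf_model", max_retries=10
):
    raise RuntimeError("Chat endpoint is not ready")

target = EvaluationTarget(
    api_endpoint=ApiEndpoint(url=url, type="chat", model_id="hf_model")
)
config = EvaluationConfig(
    type="ifeval",
    params=ConfigParams(limit_samples=1, request_timeout=360),
    output_dir="/tmp/ifeval_results",
)
results = evaluate(target_cfg=target, eval_cfg=config)
print(results)
```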