Merged

26 commits
86424a2
♻️ use fp8 model for testing SB + CB
prashantgupta24 Aug 4, 2025
1b600fe
🎨 typo
prashantgupta24 Aug 4, 2025
b3a507f
🔥 rip out gptq stuff
prashantgupta24 Aug 4, 2025
0939fdf
🔥 rip out gptq stuff
prashantgupta24 Aug 4, 2025
8ac83d5
🚧 temporarily install fms main
prashantgupta24 Aug 4, 2025
4f69dfc
⏪ bring back quantized marker
prashantgupta24 Aug 4, 2025
4aa46f1
🎨 typo?
prashantgupta24 Aug 4, 2025
df6e3f0
🐛 typo
prashantgupta24 Aug 4, 2025
a8bd0ed
🐛 linear_type is needed
prashantgupta24 Aug 4, 2025
c284fad
⬆️ ibm-fms 1.2.0
prashantgupta24 Aug 5, 2025
7bd7a3a
⬆️ bump tolerance for CPU tests
prashantgupta24 Aug 5, 2025
891a655
🐛 fix up some params
prashantgupta24 Aug 5, 2025
baa2b80
🚧 don't run TP with FP
prashantgupta24 Aug 5, 2025
bdb09bd
➕ fms-model-optimizer[fp8]
prashantgupta24 Aug 5, 2025
7230e0e
🚧 omit cb ones for now too
prashantgupta24 Aug 5, 2025
620f98e
:recycle: Add default fp8 test model
joerunde Aug 6, 2025
d8fe05b
:bug: xfail fp8 tests on spyre
joerunde Aug 6, 2025
bfe4015
:arrow_up: bump fms packages
joerunde Aug 7, 2025
00336bc
:bug: separate out CB/SB concerns for fp8
joerunde Aug 7, 2025
545ef84
:goal_net: disable fp8 on CB
joerunde Aug 7, 2025
46ede6a
:goal_net: skip fp8 prompt logprobs test
joerunde Aug 7, 2025
5893df1
:alembic: try only basic tests for fp8
joerunde Aug 8, 2025
b1186d3
:alembic: select one test and check duration
joerunde Aug 8, 2025
1f4b8bf
:bug: update to math_fp8 attention
joerunde Aug 8, 2025
5df9f2c
:zap: limit testing to only tp2 test
joerunde Aug 8, 2025
51a3913
:art: lint
joerunde Aug 8, 2025
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
@@ -41,7 +41,7 @@ jobs:
- name: "static batching"
markers: "cpu and decoder and not cb"
flags: "--timeout=300"
- name: "quantized"
- name: "fp8"
markers: "cpu and quantized"
flags: "--timeout=300"
- name: "embedding"
@@ -178,6 +178,6 @@ jobs:
# `uv run`, to avoid having `uv run` re-sync any dependencies or
# re-install the vllm_spyre package from source
source .venv/bin/activate

Review comment (Collaborator Author): I guess a better way to do this would be to select the quantized model when using `-m quantized` 🤷

python3 -m pytest ${{ matrix.test_suite.flags }} \
tests -v -m "${{ matrix.test_suite.markers }}"
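A minimal sketch of the alternative floated in the review comment above: instead of wiring a dedicated suite name into the workflow matrix, the test model could be switched to the fp8 checkpoint whenever the run selects `-m quantized`. `pytest_configure` and `config.option.markexpr` are real pytest hooks/attributes; defaulting the model through `VLLM_SPYRE_TEST_MODEL_LIST` this way is an assumption, not something this PR implements.

import os

# Hypothetical conftest.py hook (not in this PR): when the marker expression
# selects `quantized`, default the test model list to the fp8 checkpoint so
# `pytest -m quantized` picks it up without a separate CI suite entry.
def pytest_configure(config):
    markexpr = getattr(config.option, "markexpr", "") or ""
    if "quantized" in markexpr and not os.environ.get("VLLM_SPYRE_TEST_MODEL_LIST"):
        os.environ["VLLM_SPYRE_TEST_MODEL_LIST"] = (
            "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8")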
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -11,8 +11,8 @@ description = "vLLM plugin for Spyre hardware support"
readme = "README.md"
license = {text = "Apache 2"}
dependencies = [
"fms-model-optimizer>=0.2.0",
"ibm-fms==1.1.0",
"fms-model-optimizer[fp8]>=0.6.0",
"ibm-fms>=1.2.1",
"vllm>=0.9.2",
]
requires-python = ">=3.9"
81 changes: 68 additions & 13 deletions tests/conftest.py
@@ -16,12 +16,67 @@


def pytest_collection_modifyitems(config, items):
""" Mark all tests in e2e directory"""
""" Modify tests at collection time """
_mark_all_e2e(items)

_skip_quantized_by_default(config, items)

_xfail_fp8_on_spyre(items)

_skip_all_cb_and_fp8_tests(items)


def _mark_all_e2e(items):
"""Mark all tests within the e2e package with the e2e marker"""
for item in items:
if "e2e" in str(item.nodeid):
item.add_marker(pytest.mark.e2e)


def _skip_quantized_by_default(config, items):
"""Skip tests marked with `quantized` unless the `-m` flag includes it
Ref: https://stackoverflow.com/questions/56374588/how-can-i-ensure-tests-with-a-marker-are-only-run-if-explicitly-asked-in-pytest

This will skip the quantized tests at runtime, but they will still show up
as collected when running pytest --collect-only.
"""
markexpr = config.option.markexpr
if "quantized" in markexpr:
return # let pytest handle the collection logic

skip_mymarker = pytest.mark.skip(reason='quantized not selected')
for item in items:
if "quantized" in item.keywords:
item.add_marker(skip_mymarker)


def _xfail_fp8_on_spyre(items):
"""Set an xfail marker on all tests that run quantized models on Spyre
hardware.

TODO: Relax this to only "spyre and cb" once static batching is supported
on spyre.
"""

xfail_marker = pytest.mark.xfail(
reason="fp8 is not yet supported on Spyre")
for item in items:
if "quantized" in item.keywords and "spyre" in item.keywords:
item.add_marker(xfail_marker)


def _skip_all_cb_and_fp8_tests(items):
"""Skip all tests that run fp8 with continuous batching.
This can be relaxed once the TODOs to implement fp8 paged attention are
resolved.
"""
skip_marker = pytest.mark.skip(
reason="FP8 is not supported with continuous batching yet")
for item in items:
if "quantized" in item.keywords and "cb" in item.keywords:
item.add_marker(skip_marker)


@pytest.fixture(autouse=True)
def init_test_http_connection():
# pytest_asyncio may use a different event loop per test
@@ -79,7 +134,18 @@ def remote_openai_server(request):
raise pytest.UsageError(
"Error setting up remote_openai_server params") from e

if 'cb' in params:
# Default to None if not present
quantization = params.get("quantization", None)

# Add extra server args if present in test
server_args = ["--quantization", quantization] if quantization else []

if 'tp_size' in params:
tp_size = params['tp_size']
skip_unsupported_tp_size(int(tp_size), backend)
server_args.extend(["--tensor-parallel-size", str(tp_size)])

if "cb" in params and params["cb"] == 1:
max_model_len = params["max_model_len"]
max_num_seqs = params["max_num_seqs"]
env_dict = {
@@ -108,17 +174,6 @@ def remote_openai_server(request):
backend,
}

# Default to None if not present
quantization = params.get('quantization', None)

# Add extra server args if present in test
server_args = ["--quantization", quantization] if quantization else []

if 'tp_size' in params:
tp_size = params['tp_size']
skip_unsupported_tp_size(int(tp_size), backend)
server_args.extend(["--tensor-parallel-size", str(tp_size)])

try:
with RemoteOpenAIServer(model, server_args,
env_dict=env_dict) as server:
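For context, a hedged sketch of how a test could exercise the extra server-arg handling added to the fixture above: the fixture reads the test's parametrize values (`quantization`, `tp_size`, `cb`, ...) and turns them into CLI flags for `vllm serve`. This particular test does not exist in the PR; the imports and the choice to pass `--quantization fp8` explicitly are assumptions for illustration only.

import pytest
from spyre_util import get_spyre_backend_list, get_spyre_model_list

# Illustrative only: requests a TP=2 server started with `--quantization fp8`.
# The remote_openai_server fixture picks `quantization` and `tp_size` out of
# the test's parameters and appends the corresponding server args.
@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize("backend", get_spyre_backend_list())
@pytest.mark.parametrize("warmup_shape", [[(64, 20, 1)]])
@pytest.mark.parametrize("quantization", ["fp8"])
@pytest.mark.parametrize("tp_size", [2], ids=lambda val: f"TP({val})")
def test_fp8_tp2_serving(remote_openai_server, model, backend, warmup_shape,
                         quantization, tp_size):
    client = remote_openai_server.get_client()
    completion = client.completions.create(model=model,
                                           prompt="Hello World!",
                                           max_tokens=5,
                                           temperature=0.0)
    assert len(completion.choices) == 1
    assert len(completion.choices[0].text) > 0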
127 changes: 35 additions & 92 deletions tests/e2e/test_spyre_online.py
Review comment (Collaborator Author): I did a major refactoring on this file just because we were using the same logic for the tests but different parameters. Do double check and let me know if I missed something!

@@ -4,19 +4,39 @@


@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize("tp_size", [
pytest.param(1, marks=pytest.mark.basic),
pytest.param(2, marks=pytest.mark.multi),
pytest.param(4, marks=pytest.mark.multi),
pytest.param(8, marks=pytest.mark.multi),
],
ids=lambda val: f"TP({val})")
@pytest.mark.parametrize(
"tp_size",
[
pytest.param(1, marks=pytest.mark.basic),
pytest.param(2, marks=pytest.mark.multi),
pytest.param(4, marks=pytest.mark.multi),
pytest.param(8, marks=pytest.mark.multi),
],
ids=lambda val: f"TP({val})",
)
@pytest.mark.parametrize("backend", get_spyre_backend_list())
@pytest.mark.parametrize("warmup_shape", [[
(64, 20, 1),
]])
def test_openai_serving(remote_openai_server, model, warmup_shape, backend,
tp_size):
@pytest.mark.parametrize(
"warmup_shape",
[[
(64, 20, 1),
]],
)
@pytest.mark.parametrize("cb",
[pytest.param(1, marks=pytest.mark.cb, id="cb"), 0])
@pytest.mark.parametrize("max_num_seqs", [2],
ids=lambda val: f"max_num_seqs({val})")
@pytest.mark.parametrize("max_model_len", [256],
ids=lambda val: f"max_model_len({val})")
def test_openai_serving(
remote_openai_server,
model,
warmup_shape,
backend,
tp_size,
cb,
max_num_seqs,
max_model_len,
):
"""Test online serving using the `vllm serve` CLI"""

client = remote_openai_server.get_client()
@@ -35,6 +55,9 @@ def test_openai_serving(remote_openai_server, model, warmup_shape, backend,
assert len(completion.choices) == 2
assert len(completion.choices[0].text) > 0

# rest are SB tests
Review comment (Collaborator Author): I think?

if cb:
return
# Check some basic error handling as well. This is all done in one test
# now to avoid server boot-up overhead to test each case.
# To change this we'll need:
@@ -55,83 +78,3 @@ def test_openai_serving(
max_tokens=25)
except openai.BadRequestError as e:
assert "warmup" in str(e)


@pytest.mark.skip(reason="Test disabled until a model is available")
@pytest.mark.parametrize("model", get_spyre_model_list(quantized="gptq"))
@pytest.mark.parametrize("backend", ["sendnn"])
@pytest.mark.parametrize("quantization", ["gptq"])
@pytest.mark.parametrize("warmup_shape", [[(64, 20, 1)]])
def test_openai_serving_gptq(remote_openai_server, model, backend,
warmup_shape, quantization):
"""Test online serving a GPTQ model with the sendnn backend only"""

client = remote_openai_server.get_client()
completion = client.completions.create(model=model,
prompt="Hello World!",
max_tokens=5,
temperature=0.0)
assert len(completion.choices) == 1
assert len(completion.choices[0].text) > 0

completion = client.completions.create(model=model,
prompt="Hello World!",
max_tokens=5,
temperature=1.0,
n=2)
assert len(completion.choices) == 2
assert len(completion.choices[0].text) > 0


@pytest.mark.parametrize("model", get_spyre_model_list(quantized="fp8"))
@pytest.mark.parametrize("backend", get_spyre_backend_list())
@pytest.mark.parametrize("warmup_shape", [[(64, 20, 1)]])
def test_openai_serving_fp8(remote_openai_server, model, backend,
warmup_shape):
"""Test online serving an FP8 model"""

client = remote_openai_server.get_client()
completion = client.completions.create(model=model,
prompt="Hello World!",
max_tokens=5,
temperature=0.0)
assert len(completion.choices) == 1
assert len(completion.choices[0].text) > 0

completion = client.completions.create(model=model,
prompt="Hello World!",
max_tokens=5,
temperature=1.0,
n=2)
assert len(completion.choices) == 2
assert len(completion.choices[0].text) > 0


@pytest.mark.basic
@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize("cb",
[pytest.param(1, marks=pytest.mark.cb, id="cb")])
@pytest.mark.parametrize("max_num_seqs", [2],
ids=lambda val: f"max_num_seqs({val})")
@pytest.mark.parametrize("max_model_len", [256],
ids=lambda val: f"max_model_len({val})")
@pytest.mark.parametrize("backend", get_spyre_backend_list())
def test_openai_serving_cb(remote_openai_server, model, backend, cb,
max_num_seqs, max_model_len):
"""Test online serving with CB using the `vllm serve` CLI"""

client = remote_openai_server.get_client()
completion = client.completions.create(model=model,
prompt="Hello World!",
max_tokens=5,
temperature=0.0)
assert len(completion.choices) == 1
assert len(completion.choices[0].text) > 0

completion = client.completions.create(model=model,
prompt="Hello World!",
max_tokens=5,
temperature=1.0,
n=2)
assert len(completion.choices) == 2
assert len(completion.choices[0].text) > 0
2 changes: 2 additions & 0 deletions tests/e2e/test_spyre_prompt_logprobs.py
@@ -36,6 +36,8 @@ def test_prompt_logprobs(
implementation using huggingface.
'''
skip_unsupported_tp_size(tp_size, backend)
if "FP8" in model:
pytest.skip(reason="Prompt logprobs does not support FP8")
num_prompt_logprobs = 5

prompts = get_chicken_soup_prompts(4)
60 changes: 30 additions & 30 deletions tests/spyre_util.py
@@ -24,7 +24,8 @@

DISABLE_ASSERTS = False # used for debugging

ISCLOSE_REL_TOL_CPU = 0.2
# TODO: Needs to be separate for quantized models
ISCLOSE_REL_TOL_CPU = 0.35
ISCLOSE_REL_TOL_SPYRE = 0.35
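A small sketch of where the TODO above could land once tolerances are split by model precision: keep the previous, tighter CPU tolerance for bf16 runs and only relax it for fp8-quantized checkpoints. The helper name and the exact split are assumptions, not part of this change.

# Hypothetical follow-up to the TODO above (names and values are assumptions):
# bf16 CPU runs keep the older 0.2 tolerance, fp8 runs get the looser 0.35.
ISCLOSE_REL_TOL_CPU_BF16 = 0.2
ISCLOSE_REL_TOL_CPU_FP8 = 0.35


def isclose_rel_tol_cpu(model_name: str) -> float:
    """Pick a CPU comparison tolerance based on the model under test."""
    return ISCLOSE_REL_TOL_CPU_FP8 if "FP8" in model_name else ISCLOSE_REL_TOL_CPU_BF16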


@@ -513,36 +514,14 @@ def get_spyre_backend_list():
# get model names from env, if not set then use default models for each type.
# Multiple models can be specified with a comma separated list in
# VLLM_SPYRE_TEST_MODEL_LIST
def get_spyre_model_list(isEmbeddings=False, quantized=None):
spyre_model_dir_path = get_spyre_model_dir_path()

def _get_or_default(env: str, default: str) -> str:
"""Handle empty strings in env var"""
val = os.environ.get(env, default)
if not val:
val = default
return val
def get_spyre_model_list(isEmbeddings=False):
user_test_model_list = os.environ.get("VLLM_SPYRE_TEST_MODEL_LIST")
if not user_test_model_list:
return _default_test_models(isEmbeddings)

if isEmbeddings:
user_test_model_list = _get_or_default(
"VLLM_SPYRE_TEST_MODEL_LIST",
"sentence-transformers/all-roberta-large-v1")
marks = [pytest.mark.embedding]
elif quantized == "gptq":
# TODO: need a HF hub reference here as a default
user_test_model_list = _get_or_default("VLLM_SPYRE_TEST_MODEL_LIST",
"granite-3.0-8b-instruct-gptq")
marks = [pytest.mark.decoder, pytest.mark.quantized, pytest.mark.spyre]
elif quantized == "fp8":
user_test_model_list = _get_or_default(
"VLLM_SPYRE_TEST_MODEL_LIST",
"ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8")
marks = [pytest.mark.decoder, pytest.mark.quantized]
else:
user_test_model_list = _get_or_default(
"VLLM_SPYRE_TEST_MODEL_LIST",
"ibm-ai-platform/micro-g3.3-8b-instruct-1b")
marks = [pytest.mark.decoder]
# User overridden model list
spyre_model_dir_path = get_spyre_model_dir_path()
marks = [pytest.mark.embedding] if isEmbeddings else [pytest.mark.decoder]

test_model_list = []
for model in user_test_model_list.split(","):
@@ -552,6 +531,27 @@ def _get_or_default(env: str, default: str) -> str:
return test_model_list


def _default_test_models(isEmbeddings=False):
"""Return the default set of test models as pytest parameterizations"""
if isEmbeddings:
model = "sentence-transformers/all-roberta-large-v1"
return [pytest.param(model, marks=[pytest.mark.embedding], id=model)]

# Decoders
# We run tests for both the full-precision bf16 and fp8-quantized models,
# but by default the `pytest.mark.quantized` marker is de-selected unless
# the test command includes `-m quantized`.
tinygranite = "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
tinygranite_fp8 = "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
params = [
pytest.param(tinygranite, marks=[pytest.mark.decoder], id=tinygranite),
pytest.param(tinygranite_fp8,
marks=[pytest.mark.decoder, pytest.mark.quantized],
id=tinygranite_fp8)
]
return params


def create_text_prompt(model: str, min_tokens: int, max_tokens: int) -> str:
"""Create a text prompt for the specified model that will tokenize to within
the specified token length range."""