Merged

26 commits
86424a2
♻️ use fp8 model for testing SB + CB
prashantgupta24 Aug 4, 2025
1b600fe
🎨 typo
prashantgupta24 Aug 4, 2025
b3a507f
🔥 rip out gptq stuff
prashantgupta24 Aug 4, 2025
0939fdf
🔥 rip out gptq stuff
prashantgupta24 Aug 4, 2025
8ac83d5
🚧 temporarily install fms main
prashantgupta24 Aug 4, 2025
4f69dfc
⏪ bring back quantized marker
prashantgupta24 Aug 4, 2025
4aa46f1
🎨 typo?
prashantgupta24 Aug 4, 2025
df6e3f0
🐛 typo
prashantgupta24 Aug 4, 2025
a8bd0ed
🐛 linear_type is needed
prashantgupta24 Aug 4, 2025
c284fad
⬆️ ibm-fms 1.2.0
prashantgupta24 Aug 5, 2025
7bd7a3a
⬆️ bump tolerance for CPU tests
prashantgupta24 Aug 5, 2025
891a655
🐛 fix up some params
prashantgupta24 Aug 5, 2025
baa2b80
🚧 don't run TP with FP
prashantgupta24 Aug 5, 2025
bdb09bd
➕ fms-model-optimizer[fp8]
prashantgupta24 Aug 5, 2025
7230e0e
🚧 omit cb ones for now too
prashantgupta24 Aug 5, 2025
620f98e
:recycle: Add default fp8 test model
joerunde Aug 6, 2025
d8fe05b
:bug: xfail fp8 tests on spyre
joerunde Aug 6, 2025
bfe4015
:arrow_up: bump fms packages
joerunde Aug 7, 2025
00336bc
:bug: separate out CB/SB concerns for fp8
joerunde Aug 7, 2025
545ef84
:goal_net: disable fp8 on CB
joerunde Aug 7, 2025
46ede6a
:goal_net: skip fp8 prompt logprobs test
joerunde Aug 7, 2025
5893df1
:alembic: try only basic tests for fp8
joerunde Aug 8, 2025
b1186d3
:alembic: select one test and check duration
joerunde Aug 8, 2025
1f4b8bf
:bug: update to math_fp8 attention
joerunde Aug 8, 2025
5df9f2c
:zap: limit testing to only tp2 test
joerunde Aug 8, 2025
51a3913
:art: lint
joerunde Aug 8, 2025
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
@@ -41,7 +41,7 @@ jobs:
- name: "static batching"
markers: "cpu and decoder and not cb"
flags: "--timeout=300"
- name: "quantized"
- name: "fp8"
markers: "cpu and quantized"
flags: "--timeout=300"
- name: "embedding"
@@ -178,6 +178,6 @@ jobs:
# `uv run`, to avoid having `uv run` re-sync any dependencies or
# re-install the vllm_spyre package from source
source .venv/bin/activate

Review comment (Collaborator Author): I guess a better way to do this would be to select the quantized model when using `-m quantized` 🤷

python3 -m pytest ${{ matrix.test_suite.flags }} \
tests -v -m "${{ matrix.test_suite.markers }}"
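A minimal sketch of the alternative floated in the review comment above: instead of wiring a dedicated suite name into the workflow matrix, the test model could be switched to the fp8 checkpoint whenever the run selects `-m quantized`. `pytest_configure` and `config.option.markexpr` are real pytest hooks/attributes; defaulting the model through `VLLM_SPYRE_TEST_MODEL_LIST` this way is an assumption, not something this PR implements.

import os

# Hypothetical conftest.py hook (not in this PR): when the marker expression
# selects `quantized`, default the test model list to the fp8 checkpoint so
# `pytest -m quantized` picks it up without a separate CI suite entry.
def pytest_configure(config):
    markexpr = getattr(config.option, "markexpr", "") or ""
    if "quantized" in markexpr and not os.environ.get("VLLM_SPYRE_TEST_MODEL_LIST"):
        os.environ["VLLM_SPYRE_TEST_MODEL_LIST"] = (
            "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8")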
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -11,8 +11,8 @@ description = "vLLM plugin for Spyre hardware support"
readme = "README.md"
license = {text = "Apache 2"}
dependencies = [
"fms-model-optimizer>=0.2.0",
"ibm-fms==1.1.0",
"fms-model-optimizer[fp8]>=0.6.0",
"ibm-fms>=1.2.1",
"vllm>=0.9.2",
]
requires-python = ">=3.9"
81 changes: 68 additions & 13 deletions tests/conftest.py
@@ -16,12 +16,67 @@


def pytest_collection_modifyitems(config, items):
""" Mark all tests in e2e directory"""
""" Modify tests at collection time """
_mark_all_e2e(items)

_skip_quantized_by_default(config, items)

_xfail_fp8_on_spyre(items)

_skip_all_cb_and_fp8_tests(items)


def _mark_all_e2e(items):
"""Mark all tests within the e2e package with the e2e marker"""
for item in items:
if "e2e" in str(item.nodeid):
item.add_marker(pytest.mark.e2e)


def _skip_quantized_by_default(config, items):
"""Skip tests marked with `quantized` unless the `-m` flag includes it
Ref: https://stackoverflow.com/questions/56374588/how-can-i-ensure-tests-with-a-marker-are-only-run-if-explicitly-asked-in-pytest

This will skip the quantized tests at runtime, but they will still show up
as collected when running pytest --collect-only.
"""
markexpr = config.option.markexpr
if "quantized" in markexpr:
return # let pytest handle the collection logic

skip_mymarker = pytest.mark.skip(reason='quantized not selected')
for item in items:
if "quantized" in item.keywords:
item.add_marker(skip_mymarker)


def _xfail_fp8_on_spyre(items):
"""Set an xfail marker on all tests that run quantized models on Spyre
hardware.

TODO: Relax this to only "spyre and cb" once static batching is supported
on spyre.
"""

xfail_marker = pytest.mark.xfail(
reason="fp8 is not yet supported on Spyre")
for item in items:
if "quantized" in item.keywords and "spyre" in item.keywords:
item.add_marker(xfail_marker)


def _skip_all_cb_and_fp8_tests(items):
"""Skip all tests that run fp8 with continuous batching.
This can be relaxed once the TODOs to implement fp8 paged attention are
resolved.
"""
skip_marker = pytest.mark.skip(
reason="FP8 is not supported with continuous batching yet")
for item in items:
if "quantized" in item.keywords and "cb" in item.keywords:
item.add_marker(skip_marker)


@pytest.fixture(autouse=True)
def init_test_http_connection():
# pytest_asyncio may use a different event loop per test
@@ -79,7 +134,18 @@ def remote_openai_server(request):
raise pytest.UsageError(
"Error setting up remote_openai_server params") from e

if 'cb' in params:
# Default to None if not present
quantization = params.get("quantization", None)

# Add extra server args if present in test
server_args = ["--quantization", quantization] if quantization else []

if 'tp_size' in params:
tp_size = params['tp_size']
skip_unsupported_tp_size(int(tp_size), backend)
server_args.extend(["--tensor-parallel-size", str(tp_size)])

if "cb" in params and params["cb"] == 1:
max_model_len = params["max_model_len"]
max_num_seqs = params["max_num_seqs"]
env_dict = {
@@ -108,17 +174,6 @@ def remote_openai_server(request):
backend,
}

# Default to None if not present
quantization = params.get('quantization', None)

# Add extra server args if present in test
server_args = ["--quantization", quantization] if quantization else []

if 'tp_size' in params:
tp_size = params['tp_size']
skip_unsupported_tp_size(int(tp_size), backend)
server_args.extend(["--tensor-parallel-size", str(tp_size)])

try:
with RemoteOpenAIServer(model, server_args,
env_dict=env_dict) as server:
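For context, a hedged sketch of how a test could exercise the extra server-arg handling added to the fixture above: the fixture reads the test's parametrize values (`quantization`, `tp_size`, `cb`, ...) and turns them into CLI flags for `vllm serve`. This particular test does not exist in the PR; the imports and the choice to pass `--quantization fp8` explicitly are assumptions for illustration only.

import pytest
from spyre_util import get_spyre_backend_list, get_spyre_model_list

# Illustrative only: requests a TP=2 server started with `--quantization fp8`.
# The remote_openai_server fixture picks `quantization` and `tp_size` out of
# the test's parameters and appends the corresponding server args.
@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize("backend", get_spyre_backend_list())
@pytest.mark.parametrize("warmup_shape", [[(64, 20, 1)]])
@pytest.mark.parametrize("quantization", ["fp8"])
@pytest.mark.parametrize("tp_size", [2], ids=lambda val: f"TP({val})")
def test_fp8_tp2_serving(remote_openai_server, model, backend, warmup_shape,
                         quantization, tp_size):
    client = remote_openai_server.get_client()
    completion = client.completions.create(model=model,
                                           prompt="Hello World!",
                                           max_tokens=5,
                                           temperature=0.0)
    assert len(completion.choices) == 1
    assert len(completion.choices[0].text) > 0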
127 changes: 35 additions & 92 deletions tests/e2e/test_spyre_online.py
Review comment (Collaborator Author): I did a major refactoring on this file just because we were using the same logic for the tests but different parameters. Do double check and let me know if I missed something!

@@ -4,19 +4,39 @@


@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize("tp_size", [
pytest.param(1, marks=pytest.mark.basic),
pytest.param(2, marks=pytest.mark.multi),
pytest.param(4, marks=pytest.mark.multi),
pytest.param(8, marks=pytest.mark.multi),
],
ids=lambda val: f"TP({val})")
@pytest.mark.parametrize(
"tp_size",
[
pytest.param(1, marks=pytest.mark.basic),
pytest.param(2, marks=pytest.mark.multi),
pytest.param(4, marks=pytest.mark.multi),
pytest.param(8, marks=pytest.mark.multi),
],
ids=lambda val: f"TP({val})",
)
@pytest.mark.parametrize("backend", get_spyre_backend_list())
@pytest.mark.parametrize("warmup_shape", [[
(64, 20, 1),
]])
def test_openai_serving(remote_openai_server, model, warmup_shape, backend,
tp_size):
@pytest.mark.parametrize(
"warmup_shape",
[[
(64, 20, 1),
]],
)
@pytest.mark.parametrize("cb",
[pytest.param(1, marks=pytest.mark.cb, id="cb"), 0])
@pytest.mark.parametrize("max_num_seqs", [2],
ids=lambda val: f"max_num_seqs({val})")
@pytest.mark.parametrize("max_model_len", [256],
ids=lambda val: f"max_model_len({val})")
def test_openai_serving(
remote_openai_server,
model,
warmup_shape,
backend,
tp_size,
cb,
max_num_seqs,
max_model_len,
):
"""Test online serving using the `vllm serve` CLI"""

client = remote_openai_server.get_client()
@@ -35,6 +55,9 @@ def test_openai_serving(remote_openai_server, model, warmup_shape, backend,
assert len(completion.choices) == 2
assert len(completion.choices[0].text) > 0

# rest are SB tests
Review comment (Collaborator Author): I think?

if cb:
return
# Check some basic error handling as well. This is all done in one test
# now to avoid server boot-up overhead to test each case.
# To change this we'll need:
@@ -55,83 +78,3 @@ def test_openai_serving(
max_tokens=25)
except openai.BadRequestError as e:
assert "warmup" in str(e)


@pytest.mark.skip(reason="Test disabled until a model is available")
@pytest.mark.parametrize("model", get_spyre_model_list(quantized="gptq"))
@pytest.mark.parametrize("backend", ["sendnn"])
@pytest.mark.parametrize("quantization", ["gptq"])
@pytest.mark.parametrize("warmup_shape", [[(64, 20, 1)]])
def test_openai_serving_gptq(remote_openai_server, model, backend,
warmup_shape, quantization):
"""Test online serving a GPTQ model with the sendnn backend only"""

client = remote_openai_server.get_client()
completion = client.completions.create(model=model,
prompt="Hello World!",
max_tokens=5,
temperature=0.0)
assert len(completion.choices) == 1
assert len(completion.choices[0].text) > 0

completion = client.completions.create(model=model,
prompt="Hello World!",
max_tokens=5,
temperature=1.0,
n=2)
assert len(completion.choices) == 2
assert len(completion.choices[0].text) > 0


@pytest.mark.parametrize("model", get_spyre_model_list(quantized="fp8"))
@pytest.mark.parametrize("backend", get_spyre_backend_list())
@pytest.mark.parametrize("warmup_shape", [[(64, 20, 1)]])
def test_openai_serving_fp8(remote_openai_server, model, backend,
warmup_shape):
"""Test online serving an FP8 model"""

client = remote_openai_server.get_client()
completion = client.completions.create(model=model,
prompt="Hello World!",
max_tokens=5,
temperature=0.0)
assert len(completion.choices) == 1
assert len(completion.choices[0].text) > 0

completion = client.completions.create(model=model,
prompt="Hello World!",
max_tokens=5,
temperature=1.0,
n=2)
assert len(completion.choices) == 2
assert len(completion.choices[0].text) > 0


@pytest.mark.basic
@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize("cb",
[pytest.param(1, marks=pytest.mark.cb, id="cb")])
@pytest.mark.parametrize("max_num_seqs", [2],
ids=lambda val: f"max_num_seqs({val})")
@pytest.mark.parametrize("max_model_len", [256],
ids=lambda val: f"max_model_len({val})")
@pytest.mark.parametrize("backend", get_spyre_backend_list())
def test_openai_serving_cb(remote_openai_server, model, backend, cb,
max_num_seqs, max_model_len):
"""Test online serving with CB using the `vllm serve` CLI"""

client = remote_openai_server.get_client()
completion = client.completions.create(model=model,
prompt="Hello World!",
max_tokens=5,
temperature=0.0)
assert len(completion.choices) == 1
assert len(completion.choices[0].text) > 0

completion = client.completions.create(model=model,
prompt="Hello World!",
max_tokens=5,
temperature=1.0,
n=2)
assert len(completion.choices) == 2
assert len(completion.choices[0].text) > 0
2 changes: 2 additions & 0 deletions tests/e2e/test_spyre_prompt_logprobs.py
@@ -36,6 +36,8 @@ def test_prompt_logprobs(
implementation using huggingface.
'''
skip_unsupported_tp_size(tp_size, backend)
if "FP8" in model:
pytest.skip(reason="Prompt logprobs does not support FP8")
num_prompt_logprobs = 5

prompts = get_chicken_soup_prompts(4)
60 changes: 30 additions & 30 deletions tests/spyre_util.py
@@ -24,7 +24,8 @@

DISABLE_ASSERTS = False # used for debugging

ISCLOSE_REL_TOL_CPU = 0.2
# TODO: Needs to be separate for quantized models
ISCLOSE_REL_TOL_CPU = 0.35
ISCLOSE_REL_TOL_SPYRE = 0.35
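A small sketch of where the TODO above could land once tolerances are split by model precision: keep the previous, tighter CPU tolerance for bf16 runs and only relax it for fp8-quantized checkpoints. The helper name and the exact split are assumptions, not part of this change.

# Hypothetical follow-up to the TODO above (names and values are assumptions):
# bf16 CPU runs keep the older 0.2 tolerance, fp8 runs get the looser 0.35.
ISCLOSE_REL_TOL_CPU_BF16 = 0.2
ISCLOSE_REL_TOL_CPU_FP8 = 0.35


def isclose_rel_tol_cpu(model_name: str) -> float:
    """Pick a CPU comparison tolerance based on the model under test."""
    return ISCLOSE_REL_TOL_CPU_FP8 if "FP8" in model_name else ISCLOSE_REL_TOL_CPU_BF16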


@@ -513,36 +514,14 @@ def get_spyre_backend_list():
# get model names from env, if not set then use default models for each type.
# Multiple models can be specified with a comma separated list in
# VLLM_SPYRE_TEST_MODEL_LIST
def get_spyre_model_list(isEmbeddings=False, quantized=None):
spyre_model_dir_path = get_spyre_model_dir_path()

def _get_or_default(env: str, default: str) -> str:
"""Handle empty strings in env var"""
val = os.environ.get(env, default)
if not val:
val = default
return val
def get_spyre_model_list(isEmbeddings=False):
user_test_model_list = os.environ.get("VLLM_SPYRE_TEST_MODEL_LIST")
if not user_test_model_list:
return _default_test_models(isEmbeddings)

if isEmbeddings:
user_test_model_list = _get_or_default(
"VLLM_SPYRE_TEST_MODEL_LIST",
"sentence-transformers/all-roberta-large-v1")
marks = [pytest.mark.embedding]
elif quantized == "gptq":
# TODO: need a HF hub reference here as a default
user_test_model_list = _get_or_default("VLLM_SPYRE_TEST_MODEL_LIST",
"granite-3.0-8b-instruct-gptq")
marks = [pytest.mark.decoder, pytest.mark.quantized, pytest.mark.spyre]
elif quantized == "fp8":
user_test_model_list = _get_or_default(
"VLLM_SPYRE_TEST_MODEL_LIST",
"ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8")
marks = [pytest.mark.decoder, pytest.mark.quantized]
else:
user_test_model_list = _get_or_default(
"VLLM_SPYRE_TEST_MODEL_LIST",
"ibm-ai-platform/micro-g3.3-8b-instruct-1b")
marks = [pytest.mark.decoder]
# User overridden model list
spyre_model_dir_path = get_spyre_model_dir_path()
marks = [pytest.mark.embedding] if isEmbeddings else [pytest.mark.decoder]

test_model_list = []
for model in user_test_model_list.split(","):
@@ -552,6 +531,27 @@ def _get_or_default(env: str, default: str) -> str:
return test_model_list


def _default_test_models(isEmbeddings=False):
"""Return the default set of test models as pytest parameterizations"""
if isEmbeddings:
model = "sentence-transformers/all-roberta-large-v1"
return [pytest.param(model, marks=[pytest.mark.embedding], id=model)]

# Decoders
# We run tests for both the full-precision bf16 and fp8-quantized models,
# but by default the `pytest.mark.quantized` marker is de-selected unless
# the test command includes `-m quantized`.
tinygranite = "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
tinygranite_fp8 = "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
params = [
pytest.param(tinygranite, marks=[pytest.mark.decoder], id=tinygranite),
pytest.param(tinygranite_fp8,
marks=[pytest.mark.decoder, pytest.mark.quantized],
id=tinygranite_fp8)
]
return params


def create_text_prompt(model: str, min_tokens: int, max_tokens: int) -> str:
"""Create a text prompt for the specified model that will tokenize to within
the specified token length range."""