♻️ use fp8 model for testing SB + CB #359
In the CI workflow, the "quantized" test suite now reuses the standard decoder test markers instead of a dedicated `quantized` marker (the FP8 model is selected through an environment variable further down):

```diff
@@ -42,7 +42,7 @@ jobs:
             markers: "cpu and decoder and not cb"
             flags: "--timeout=300"
           - name: "quantized"
-            markers: "cpu and quantized"
+            markers: "cpu and decoder"
             flags: "--timeout=300"
           - name: "embedding"
             markers: "cpu and embedding"
```
When that suite runs, the step exports the FP8 checkpoint before invoking the shared pytest command:

```diff
@@ -179,5 +179,9 @@ jobs:
           # re-install the vllm_sypre package from source
           source .venv/bin/activate
 
+          if [ "${{ matrix.test_suite.name }}" == "quantized" ]; then
+            export VLLM_SPYRE_TEST_MODEL_LIST="ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
+          fi
+
           python3 -m pytest ${{ matrix.test_suite.flags }} \
             tests -v -m "${{ matrix.test_suite.markers }}"
```

**Collaborator (author)**, on the new export: I guess a better way to do this would be to select the quantized model when using …

**Collaborator (author)**: I did a major refactoring on this file just because we were using the same logic for the tests but different parameters. Do double-check and let me know if I missed something!
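The diff does not show how `get_spyre_model_list()` resolves models, so the following is only a minimal sketch of how such a helper could honor the `VLLM_SPYRE_TEST_MODEL_LIST` override set by the CI step above; the default model and the parsing logic are assumptions, not the repo's actual code:

```python
# Hypothetical sketch only: how a helper like get_spyre_model_list() could pick
# up the VLLM_SPYRE_TEST_MODEL_LIST override exported by CI. The real helper in
# this repo may work differently.
import os

# Assumed default; the actual default model list is not shown in this diff.
_DEFAULT_MODELS = ["ibm-ai-platform/micro-g3.3-8b-instruct-1b"]


def get_spyre_model_list(quantized: str | None = None) -> list[str]:
    override = os.environ.get("VLLM_SPYRE_TEST_MODEL_LIST")
    if override:
        # CI sets this to the FP8 checkpoint for the "quantized" suite;
        # treat it as a comma-separated list of model IDs.
        return [m.strip() for m in override.split(",") if m.strip()]
    # Fall back to the default list (or a quantized variant, if requested).
    return list(_DEFAULT_MODELS)
```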
In the online-serving tests, `test_openai_serving` is reformatted and now also parametrized over continuous batching (`cb`), `max_num_seqs`, and `max_model_len`:

```diff
@@ -4,19 +4,39 @@
 
 @pytest.mark.parametrize("model", get_spyre_model_list())
-@pytest.mark.parametrize("tp_size", [
-    pytest.param(1, marks=pytest.mark.basic),
-    pytest.param(2, marks=pytest.mark.multi),
-    pytest.param(4, marks=pytest.mark.multi),
-    pytest.param(8, marks=pytest.mark.multi),
-],
-                         ids=lambda val: f"TP({val})")
+@pytest.mark.parametrize(
+    "tp_size",
+    [
+        pytest.param(1, marks=pytest.mark.basic),
+        pytest.param(2, marks=pytest.mark.multi),
+        pytest.param(4, marks=pytest.mark.multi),
+        pytest.param(8, marks=pytest.mark.multi),
+    ],
+    ids=lambda val: f"TP({val})",
+)
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("warmup_shape", [[
-    (64, 20, 1),
-]])
-def test_openai_serving(remote_openai_server, model, warmup_shape, backend,
-                        tp_size):
+@pytest.mark.parametrize(
+    "warmup_shape",
+    [[
+        (64, 20, 1),
+    ]],
+)
+@pytest.mark.parametrize("cb",
+                         [pytest.param(1, marks=pytest.mark.cb, id="cb"), 0])
+@pytest.mark.parametrize("max_num_seqs", [2],
+                         ids=lambda val: f"max_num_seqs({val})")
+@pytest.mark.parametrize("max_model_len", [256],
+                         ids=lambda val: f"max_model_len({val})")
+def test_openai_serving(
+    remote_openai_server,
+    model,
+    warmup_shape,
+    backend,
+    tp_size,
+    cb,
+    max_num_seqs,
+    max_model_len,
+):
     """Test online serving using the `vllm serve` CLI"""
 
     client = remote_openai_server.get_client()
```
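The `remote_openai_server` fixture itself is not part of this diff, so the following is only a rough, hypothetical sketch of how the new parameters might map onto a `vllm serve` launch. The `--max-num-seqs` and `--max-model-len` flags are standard vLLM server options, but the `VLLM_SPYRE_USE_CB` toggle and the fixture internals are assumptions:

```python
# Hypothetical sketch of how a fixture like remote_openai_server might map the
# new test parameters onto a `vllm serve` launch; the real fixture may differ.
import os
import subprocess


def launch_openai_server(model: str, cb: int, max_num_seqs: int,
                         max_model_len: int) -> subprocess.Popen:
    env = dict(os.environ)
    if cb:
        # Assumed env toggle for continuous batching on the Spyre backend.
        env["VLLM_SPYRE_USE_CB"] = "1"
    args = [
        "vllm", "serve", model,
        "--max-num-seqs", str(max_num_seqs),
        "--max-model-len", str(max_model_len),
    ]
    return subprocess.Popen(args, env=env)
```

For the `cb=0` case the fixture presumably derives the static-batching warmup configuration from the `warmup_shape` parameter instead.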
After the shared completion checks, CB runs exit early; the remaining error-handling assertions apply only to static batching:

```diff
@@ -35,6 +55,9 @@ def test_openai_serving(remote_openai_server, model, warmup_shape, backend,
     assert len(completion.choices) == 2
     assert len(completion.choices[0].text) > 0
 
+    # rest are SB tests
+    if cb:
+        return
     # Check some basic error handling as well. This is all done in one test
     # now to avoid server boot-up overhead to test each case.
     # To change this we'll need:
```

**Collaborator (author)**, on the `# rest are SB tests` comment: I think?
The dedicated GPTQ, FP8, and CB serving tests are removed; their coverage is now provided by the parametrized `test_openai_serving` above, with the model selected through `VLLM_SPYRE_TEST_MODEL_LIST` in CI:

```diff
@@ -55,83 +78,3 @@ def test_openai_serving(remote_openai_server, model, warmup_shape, backend,
                                        max_tokens=25)
     except openai.BadRequestError as e:
         assert "warmup" in str(e)
-
-
-@pytest.mark.skip(reason="Test disabled until a model is available")
-@pytest.mark.parametrize("model", get_spyre_model_list(quantized="gptq"))
-@pytest.mark.parametrize("backend", ["sendnn"])
-@pytest.mark.parametrize("quantization", ["gptq"])
-@pytest.mark.parametrize("warmup_shape", [[(64, 20, 1)]])
-def test_openai_serving_gptq(remote_openai_server, model, backend,
-                             warmup_shape, quantization):
-    """Test online serving a GPTQ model with the sendnn backend only"""
-
-    client = remote_openai_server.get_client()
-    completion = client.completions.create(model=model,
-                                           prompt="Hello World!",
-                                           max_tokens=5,
-                                           temperature=0.0)
-    assert len(completion.choices) == 1
-    assert len(completion.choices[0].text) > 0
-
-    completion = client.completions.create(model=model,
-                                           prompt="Hello World!",
-                                           max_tokens=5,
-                                           temperature=1.0,
-                                           n=2)
-    assert len(completion.choices) == 2
-    assert len(completion.choices[0].text) > 0
-
-
-@pytest.mark.parametrize("model", get_spyre_model_list(quantized="fp8"))
-@pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("warmup_shape", [[(64, 20, 1)]])
-def test_openai_serving_fp8(remote_openai_server, model, backend,
-                            warmup_shape):
-    """Test online serving an FP8 model"""
-
-    client = remote_openai_server.get_client()
-    completion = client.completions.create(model=model,
-                                           prompt="Hello World!",
-                                           max_tokens=5,
-                                           temperature=0.0)
-    assert len(completion.choices) == 1
-    assert len(completion.choices[0].text) > 0
-
-    completion = client.completions.create(model=model,
-                                           prompt="Hello World!",
-                                           max_tokens=5,
-                                           temperature=1.0,
-                                           n=2)
-    assert len(completion.choices) == 2
-    assert len(completion.choices[0].text) > 0
-
-
-@pytest.mark.basic
-@pytest.mark.parametrize("model", get_spyre_model_list())
-@pytest.mark.parametrize("cb",
-                         [pytest.param(1, marks=pytest.mark.cb, id="cb")])
-@pytest.mark.parametrize("max_num_seqs", [2],
-                         ids=lambda val: f"max_num_seqs({val})")
-@pytest.mark.parametrize("max_model_len", [256],
-                         ids=lambda val: f"max_model_len({val})")
-@pytest.mark.parametrize("backend", get_spyre_backend_list())
-def test_openai_serving_cb(remote_openai_server, model, backend, cb,
-                           max_num_seqs, max_model_len):
-    """Test online serving with CB using the `vllm serve` CLI"""
-
-    client = remote_openai_server.get_client()
-    completion = client.completions.create(model=model,
-                                           prompt="Hello World!",
-                                           max_tokens=5,
-                                           temperature=0.0)
-    assert len(completion.choices) == 1
-    assert len(completion.choices[0].text) > 0
-
-    completion = client.completions.create(model=model,
-                                           prompt="Hello World!",
-                                           max_tokens=5,
-                                           temperature=1.0,
-                                           n=2)
-    assert len(completion.choices) == 2
-    assert len(completion.choices[0].text) > 0
```
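To mirror the CI "quantized" suite locally, the workflow's two ingredients can be reused directly: export `VLLM_SPYRE_TEST_MODEL_LIST="ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"` and run `python3 -m pytest tests -v -m "cpu and decoder" --timeout=300` (the markers and flags that suite now uses), which is how the FP8 checkpoint gets exercised for both SB and CB serving.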