Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions harness/baselines/check-qwen3.5-4b.json
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
{
"captured_at": "2026-04-15T21:36:32",
"captured_at": "2026-04-16T16:34:02",
"rapid_mlx_version": "0.5.0",
"model": "qwen3.5-4b",
"metrics": {
"cold_ttft_ms": 313.62908403389156,
"cold_tps": 49.2736011322839,
"cached_ttft_ms": 393.99170805700123,
"decode_tps": 49.67459517661095,
"cold_ttft_ms": 170.7701669074595,
"cold_tps": 168.1036081769402,
"cached_ttft_ms": 223.16349996253848,
"decode_tps": 167.821708310493,
"decode_tps_stdev": 0,
"mt_ttft_ms": 402.93216705322266,
"mt_tps": 48.32873664326039,
"tc_latency_ms": 2819.6406660135835,
"mt_ttft_ms": 233.40062494389713,
"mt_tps": 165.54264366438625,
"tc_latency_ms": 1068.896499928087,
"tc_success_rate": 1.0,
"long_ttft_ms": 1274.2738330271095,
"long_tps": 48.325022009001415,
"long_cached_ttft_ms": 1233.1054580863565,
"composite_score": 109.9
"long_ttft_ms": 966.4679998531938,
"long_tps": 163.68971893952352,
"long_cached_ttft_ms": 943.7295419629663,
"composite_score": 257.2
}
}
42 changes: 2 additions & 40 deletions harness/scorecard/latest.md
Original file line number Diff line number Diff line change
@@ -1,45 +1,7 @@
# Rapid-MLX Benchmark Scorecard

_Generated: 2026-04-16T07:10:38_
_Generated: 2026-04-16T16:50:20_

| Model | Decode TPS | Cold TTFT | Cached TTFT | Tool % | Score | Status |
| --- | ---: | ---: | ---: | ---: | ---: | --- |
| deepseek-r1-32b | 8.6 | 1111ms | 418ms | 0% | 51.8 | OK |
| llama3-3b | 34.9 | 258ms | 189ms | 0% | 130.1 | OK |
| qwen3-vl-8b | 12.2 | 456ms | 505ms | 100% | 59.3 | OK |
| qwen3.5-27b | — | — | — | — | — | FAIL — server boot failed: server exited with code 1 before becoming healthy |
| qwen3.5-35b | 10.9 | 1091ms | 1063ms | 0% | 26.5 | OK |
| qwen3.5-4b | 25.1 | 448ms | 460ms | 100% | 78.5 | OK |
| qwen3.5-9b | 20.7 | 539ms | 563ms | 100% | 68.1 | OK |
| qwopus-27b | 8.8 | 1165ms | 1145ms | 100% | 41.9 | OK |
| qwopus-27b-8bit | — | — | — | — | — | FAIL — server boot failed: server exited with code 1 before becoming healthy |

## Skipped

- **deepseek-r1-8b** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **devstral-24b** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **devstral-v2-24b** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **gemma-3n-e4b** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **gemma-4-26b** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **gemma-4-31b** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **gemma3-12b** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **gemma3-1b** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **gemma3-27b** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **glm4.5-air** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **glm4.7-9b** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **gpt-oss-20b** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **hermes3-8b** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **hermes4-70b** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **kimi-48b** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **kimi-k2.5** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **minimax-m2.5** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **ministral-3b** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **mistral-24b** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **phi4-14b** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **qwen3-coder** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **qwen3-coder-30b** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **qwen3-vl-30b** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **qwen3-vl-4b** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **qwen3.5-122b** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **qwen3.5-122b-8bit** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
- **qwopus-9b** — not found in HF_HUB_CACHE / ~/.cache/huggingface / ~/.lmstudio
| qwopus-27b-8bit | 22.4 | 508ms | 513ms | 0% | 50.5 | OK |
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ vllm = [
# Guided decoding with outlines for structured JSON output
guided = [
"outlines[mlxlm]>=1.0.0",
"outlines-core>=0.2.0", # FSM engine for tool call constrained decoding
]
# Audio dependencies for TTS/STT (mlx-audio)
audio = [
Expand Down
299 changes: 299 additions & 0 deletions tests/test_fsm_tool_call.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,299 @@
# SPDX-License-Identifier: Apache-2.0
"""Tests for FSM-based tool call constrained decoding."""

from __future__ import annotations

import json
import time
from unittest.mock import MagicMock

import pytest

# Skip all tests if outlines-core not installed
pytest.importorskip("outlines_core")


class TestFSMToolCallCache:
    """Tests for FSM compilation cache."""

    # Repo id of the tokenizer used to build a real FSM vocabulary.
    # Shared by every test so it only needs updating in one place.
    _MODEL_ID = "mlx-community/Qwen3.5-4B-MLX-4bit"

    def _make_cache(self):
        """Build an FSMToolCallCache backed by the real model vocabulary."""
        from outlines_core import Vocabulary

        from vllm_mlx.api.fsm_tool_call import FSMToolCallCache

        cache = FSMToolCallCache()
        cache._vocabulary = Vocabulary.from_pretrained(self._MODEL_ID)
        return cache

    def test_precompile_success(self):
        """precompile() returns True for a well-formed tool schema."""
        cache = self._make_cache()
        tools = [
            {
                "function": {
                    "name": "get_weather",
                    "parameters": {"type": "object", "properties": {"location": {"type": "string"}}},
                }
            }
        ]
        assert cache.precompile(tools) is True

    def test_cache_hit(self):
        """A second precompile() of the same tools must hit the cache."""
        cache = self._make_cache()
        tools = [{"function": {"name": "search", "parameters": {"type": "object"}}}]

        # First call compiles the FSM from scratch.
        t0 = time.perf_counter()
        cache.precompile(tools)
        first_time = time.perf_counter() - t0

        # Subsequent calls should hit the cache. Take the best of three
        # runs so a single scheduler hiccup on a loaded CI machine does
        # not flake the timing comparison.
        second_time = float("inf")
        result = None
        for _ in range(3):
            t0 = time.perf_counter()
            result = cache.precompile(tools)
            second_time = min(second_time, time.perf_counter() - t0)

        assert result is True
        assert second_time < first_time / 10, "Cache hit should be >10x faster"

    def test_get_guide_returns_fresh_guide(self):
        """get_guide() must hand out independent Guide instances."""
        cache = self._make_cache()
        tools = [{"function": {"name": "test", "parameters": {"type": "object"}}}]

        g1 = cache.get_guide(tools)
        g2 = cache.get_guide(tools)
        assert g1 is not None
        assert g2 is not None
        # Each guide is a fresh instance (independent state)
        assert g1 is not g2

    def test_schema_builds_correct_enum(self):
        """The generated JSON schema enumerates every tool name, in order."""
        from vllm_mlx.api.fsm_tool_call import _build_tool_call_schema

        tools = [
            {"function": {"name": "get_weather"}},
            {"function": {"name": "search"}},
            {"function": {"name": "calculate"}},
        ]
        schema = json.loads(_build_tool_call_schema(tools))
        assert schema["properties"]["name"]["enum"] == [
            "get_weather",
            "search",
            "calculate",
        ]
        assert schema["required"] == ["name", "arguments"]


class TestFSMToolCallProcessor:
    """Tests for the two-mode logits processor."""

    # Repo id of the tokenizer under test, shared by all fixtures.
    _MODEL_ID = "mlx-community/Qwen3.5-4B-MLX-4bit"
    # Logits width used throughout these tests; matches the vocabulary
    # size the original tests hard-coded — TODO confirm against
    # tokenizer.vocab_size for this model.
    _VOCAB_SIZE = 248077

    @pytest.fixture
    def tokenizer(self):
        from transformers import AutoTokenizer

        return AutoTokenizer.from_pretrained(self._MODEL_ID)

    @pytest.fixture
    def tools(self):
        return [
            {
                "function": {
                    "name": "get_weather",
                    "parameters": {
                        "type": "object",
                        "properties": {"location": {"type": "string"}},
                    },
                }
            }
        ]

    @pytest.fixture
    def processor(self, tokenizer, tools):
        from outlines_core import Vocabulary

        from vllm_mlx.api.fsm_tool_call import (
            FSMToolCallCache,
            FSMToolCallProcessor,
        )

        cache = FSMToolCallCache()
        cache._vocabulary = Vocabulary.from_pretrained(self._MODEL_ID)
        cache.precompile(tools)

        return FSMToolCallProcessor(
            tokenizer=tokenizer,
            tools=tools,
            parser_name="hermes",
            cache=cache,
        )

    def test_free_mode_passes_logits_through(self, processor, tokenizer):
        """In free mode, logits should pass through unchanged."""
        import mlx.core as mx

        logits = mx.random.normal((1, self._VOCAB_SIZE))
        token_ids = mx.array(tokenizer.encode("Hello world"))

        result = processor(token_ids, logits)
        # Should be identical (no masking)
        assert mx.array_equal(result, logits)

    def test_trigger_plus_json_activates_constrained_mode(self, processor, tokenizer):
        """After seeing <tool_call>\\n + '{', processor should constrain."""
        import mlx.core as mx

        # Feed trigger + JSON start
        text = '<tool_call>\n{"'
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        logits = mx.zeros((1, self._VOCAB_SIZE))

        # Feed growing prefixes token-by-token, exactly as decode would.
        for end in range(1, len(token_ids) + 1):
            processor(mx.array(token_ids[:end]), logits)

        assert processor._constrained, "Should be in constrained mode after trigger + '{'"

    def test_trigger_plus_xml_skips_fsm(self, processor, tokenizer):
        """After <tool_call>\\n + '<', FSM should NOT activate (XML format)."""
        import mlx.core as mx

        text = "<tool_call>\n<function"
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        logits = mx.zeros((1, self._VOCAB_SIZE))

        for end in range(1, len(token_ids) + 1):
            processor(mx.array(token_ids[:end]), logits)

        assert not processor._constrained, "Should NOT constrain for XML format"

    def test_constrained_mode_masks_invalid_tokens(self, processor, tokenizer):
        """In constrained mode, most tokens should be masked to -inf."""
        import mlx.core as mx

        # Prime the processor's rolling text buffer with the trigger.
        # NOTE(review): the call below re-feeds the trigger tokens, so
        # this priming may be redundant — kept to match intended setup.
        processor._recent_text = "<tool_call>\n"
        processor._constrained = False

        trigger_ids = tokenizer.encode("<tool_call>\n", add_special_tokens=False)
        logits = mx.zeros((1, self._VOCAB_SIZE))

        # Feed the trigger tokens to (possibly) activate the FSM.
        result = processor(mx.array(trigger_ids), logits)

        if not processor._constrained:
            # Previously the test silently passed when the FSM never
            # activated; make that outcome visible instead of vacuous.
            pytest.skip("FSM did not activate after feeding the trigger")

        # Most tokens should be -inf (masked)
        result_np = result.tolist()[0]
        n_valid = sum(1 for x in result_np if x > -1e9)
        n_masked = sum(1 for x in result_np if x < -1e9)
        print(f"\n Constrained: {n_valid} valid, {n_masked} masked")
        assert n_valid < 100, f"Expected < 100 valid tokens, got {n_valid}"
        assert n_masked > 200000, "Expected most tokens masked"

    def test_reset_clears_state(self, processor):
        """reset() must clear constrained mode, text buffer, and guide."""
        processor._constrained = True
        processor._recent_text = "some text"
        processor._guide = MagicMock()

        processor.reset()

        assert not processor._constrained
        assert processor._recent_text == ""
        assert processor._guide is None


class TestFSMFactory:
    """Tests for the factory function."""

    def test_create_returns_processor_when_available(self):
        from outlines_core import Vocabulary
        from transformers import AutoTokenizer

        from vllm_mlx.api.fsm_tool_call import create_fsm_processor, get_fsm_cache

        model_id = "mlx-community/Qwen3.5-4B-MLX-4bit"
        tokenizer = AutoTokenizer.from_pretrained(model_id)

        # Wire the shared (module-level) cache to a real vocabulary.
        shared_cache = get_fsm_cache()
        shared_cache._vocabulary = Vocabulary.from_pretrained(model_id)

        tool_defs = [{"function": {"name": "test", "parameters": {"type": "object"}}}]
        result = create_fsm_processor("hermes", tokenizer, tool_defs)
        assert result is not None

    def test_create_returns_processor_with_generic_schema(self):
        """Even without specific tools, factory returns a processor
        with generic schema (any name + any arguments)."""
        from vllm_mlx.api.fsm_tool_call import create_fsm_processor

        fake_tokenizer = MagicMock()
        # Passing tools=None still yields a generic-schema processor.
        result = create_fsm_processor("hermes", fake_tokenizer, None)
        assert result is not None, "Should return generic FSM processor"

    def test_all_parsers_have_triggers(self):
        """Every parser should have a trigger pattern registered."""
        from vllm_mlx.api.fsm_tool_call import TOOL_CALL_TRIGGERS

        for parser_name in (
            "hermes", "llama", "minimax", "qwen", "deepseek",
            "glm47", "granite", "nemotron", "kimi", "gemma4",
            "functionary", "seed_oss", "mistral", "xlam",
        ):
            assert parser_name in TOOL_CALL_TRIGGERS, f"Missing trigger for parser {parser_name!r}"


class TestFSMPerformance:
    """Verify FSM overhead is negligible."""

    def test_per_token_overhead_under_10us(self):
        """FSM lookup must be < 10µs per token to not affect decode speed."""
        from outlines_core import Guide, Index, Vocabulary, json_schema
        from transformers import AutoTokenizer

        model_id = "mlx-community/Qwen3.5-4B-MLX-4bit"
        vocab = Vocabulary.from_pretrained(model_id)

        # Compile a representative single-tool schema into an FSM index.
        tool_schema = {
            "type": "object",
            "properties": {
                "name": {"type": "string", "enum": ["get_weather"]},
                "arguments": {"type": "object"},
            },
            "required": ["name", "arguments"],
        }
        pattern = json_schema.build_regex_from_schema(json.dumps(tool_schema))
        compiled_index = Index(pattern, vocab)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        sample = '{"name": "get_weather", "arguments": {}}'
        sample_ids = tokenizer.encode(sample, add_special_tokens=False)

        # Time 1000 complete FSM walks over the sample token sequence,
        # exercising one mask lookup + one state advance per token.
        n_rounds = 1000
        start = time.perf_counter()
        for _ in range(n_rounds):
            guide = Guide(compiled_index)
            for token_id in sample_ids:
                guide.get_tokens()
                guide.advance(token_id)
        elapsed = time.perf_counter() - start
        per_token_us = elapsed / (n_rounds * len(sample_ids)) * 1e6

        print(f"\n Per-token FSM overhead: {per_token_us:.1f} µs")
        assert per_token_us < 10, f"FSM overhead too high: {per_token_us:.1f} µs"
Loading
Loading