From 757af7ce14a4d24ac20556d6248e546c17d6a268 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 25 Nov 2025 12:29:24 +0800 Subject: [PATCH 01/18] FEAT: add llama_cpp json schema output --- xinference/model/llm/llama_cpp/core.py | 39 ++++++++++++++++++++++++-- xinference/model/llm/utils.py | 26 +++++++++++++++++ 2 files changed, 62 insertions(+), 3 deletions(-) diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py index 3f06d2f899..e87f10bb00 100644 --- a/xinference/model/llm/llama_cpp/core.py +++ b/xinference/model/llm/llama_cpp/core.py @@ -16,7 +16,7 @@ import os import pprint import queue -from typing import Iterator, List, Optional, Tuple, Union +from typing import Any, Dict, Iterator, List, Optional, Tuple, Union from packaging import version @@ -25,11 +25,42 @@ from ...utils import check_dependency_available from ..core import LLM, chat_context_var from ..llm_family import LLMFamilyV2, LLMSpecV1 -from ..utils import ChatModelMixin +from ..utils import ChatModelMixin, normalize_response_format logger = logging.getLogger(__name__) +def _schema_to_gbnf(schema: Dict[str, Any]) -> Optional[str]: + try: + from llama_cpp.llama_grammar import ( + json_schema_to_gbnf, # type: ignore[attr-defined] + ) + except Exception as e: # pragma: no cover - optional dependency + logger.warning( + "json_schema provided but llama_cpp grammar support missing: %s", e + ) + return None + try: + return json_schema_to_gbnf(schema) + except Exception as e: # pragma: no cover - conversion failure + logger.warning("Failed to convert json_schema to GBNF for llama.cpp: %s", e) + return None + + +def _apply_response_format(generate_config: Dict[str, Any]) -> None: + response_format = generate_config.pop("response_format", None) + normalized = normalize_response_format(response_format) + if not normalized or normalized.get("type") != "json_schema": + return + schema_dict = normalized.get("schema_dict") + if not schema_dict: + return + generate_config.setdefault("json_schema", schema_dict) + grammar = _schema_to_gbnf(schema_dict) + if grammar: + generate_config.setdefault("grammar", grammar) + + class _Done: pass @@ -49,7 +80,7 @@ def __init__( model_path: str, llamacpp_model_config: Optional[dict] = None, ): - super().__init__(model_uid, model_family, model_path) + super().__init__(model_uid, model_family, model_path) # type: ignore[call-arg] self._llamacpp_model_config = self._sanitize_model_config(llamacpp_model_config) self._llm = None self._executor: Optional[concurrent.futures.ThreadPoolExecutor] = None @@ -246,6 +277,7 @@ def generate( generate_config = generate_config or {} if not generate_config.get("max_tokens") and XINFERENCE_MAX_TOKENS: generate_config["max_tokens"] = XINFERENCE_MAX_TOKENS + _apply_response_format(generate_config) stream = generate_config.get("stream", False) q: queue.Queue = queue.Queue() @@ -305,6 +337,7 @@ def chat( generate_config = generate_config or {} if not generate_config.get("max_tokens") and XINFERENCE_MAX_TOKENS: generate_config["max_tokens"] = XINFERENCE_MAX_TOKENS + _apply_response_format(generate_config) stream = generate_config.get("stream", False) chat_template_kwargs = ( diff --git a/xinference/model/llm/utils.py b/xinference/model/llm/utils.py index 420598dbba..2fc54d9d3a 100644 --- a/xinference/model/llm/utils.py +++ b/xinference/model/llm/utils.py @@ -1235,6 +1235,32 @@ def get_stop_token_ids_from_config_file(model_path: str) -> Optional[List[int]]: return None +def normalize_response_format( + 
response_format: Optional[Dict[str, Any]], +) -> Optional[Dict[str, Any]]: + """ + Normalize OpenAI-style response_format into a simple dict. + Returns: + None if missing/unsupported, or a dict with keys: + - type: "json_schema" | "json_object" + - schema_dict: dict (only for json_schema) + """ + if not response_format or not isinstance(response_format, dict): + return None + + fmt_type = response_format.get("type") + if fmt_type not in ("json_schema", "json_object"): + return None + + normalized: Dict[str, Any] = {"type": fmt_type} + if fmt_type == "json_schema": + schema_block = response_format.get("json_schema") or {} + schema_dict = schema_block.get("schema_") or schema_block.get("schema") + if schema_dict: + normalized["schema_dict"] = schema_dict + return normalized + + def parse_messages(messages: List[Dict]) -> Tuple: """ Some older models still follow the old way of parameter passing. From 0501936a7d2d0647be51e644b9458b250a2f1069 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Thu, 27 Nov 2025 11:26:50 +0800 Subject: [PATCH 02/18] modify llama_cpp_python to xllamacpp --- xinference/model/llm/llama_cpp/core.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py index e87f10bb00..33244d4ab9 100644 --- a/xinference/model/llm/llama_cpp/core.py +++ b/xinference/model/llm/llama_cpp/core.py @@ -30,20 +30,16 @@ logger = logging.getLogger(__name__) -def _schema_to_gbnf(schema: Dict[str, Any]) -> Optional[str]: +def _schema_to_grammar(schema: Dict[str, Any]) -> Optional[str]: try: - from llama_cpp.llama_grammar import ( - json_schema_to_gbnf, # type: ignore[attr-defined] - ) + import xllamacpp except Exception as e: # pragma: no cover - optional dependency - logger.warning( - "json_schema provided but llama_cpp grammar support missing: %s", e - ) + logger.warning("json_schema provided but xllamacpp missing: %s", e) return None try: - return json_schema_to_gbnf(schema) + return xllamacpp.json_schema_to_grammar(schema) # type: ignore[attr-defined] except Exception as e: # pragma: no cover - conversion failure - logger.warning("Failed to convert json_schema to GBNF for llama.cpp: %s", e) + logger.warning("Failed to convert json_schema to grammar for xllamacpp: %s", e) return None @@ -56,7 +52,7 @@ def _apply_response_format(generate_config: Dict[str, Any]) -> None: if not schema_dict: return generate_config.setdefault("json_schema", schema_dict) - grammar = _schema_to_gbnf(schema_dict) + grammar = _schema_to_grammar(schema_dict) if grammar: generate_config.setdefault("grammar", grammar) From 0c0eead87fbeb96d823553235d53933405167aed Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Thu, 27 Nov 2025 14:56:55 +0800 Subject: [PATCH 03/18] add a test python --- .../llm/llama_cpp/tests/test_structured.py | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 xinference/model/llm/llama_cpp/tests/test_structured.py diff --git a/xinference/model/llm/llama_cpp/tests/test_structured.py b/xinference/model/llm/llama_cpp/tests/test_structured.py new file mode 100644 index 0000000000..ed64b3a84b --- /dev/null +++ b/xinference/model/llm/llama_cpp/tests/test_structured.py @@ -0,0 +1,66 @@ +import sys +from types import SimpleNamespace + + +def test_apply_response_format_sets_grammar(monkeypatch): + from xinference.model.llm.llama_cpp.core import _apply_response_format + + fake_xllamacpp = SimpleNamespace(json_schema_to_grammar=lambda 
schema: "GRAMMAR") + monkeypatch.setitem(sys.modules, "xllamacpp", fake_xllamacpp) + + cfg = { + "response_format": { + "type": "json_schema", + "json_schema": { + "schema": { + "type": "object", + "properties": {"a": {"type": "string"}}, + "required": ["a"], + } + }, + } + } + + _apply_response_format(cfg) + + assert "response_format" not in cfg + assert cfg["json_schema"]["required"] == ["a"] + assert cfg["grammar"] == "GRAMMAR" + + +def test_apply_response_format_handles_conversion_failure(monkeypatch): + from xinference.model.llm.llama_cpp.core import _apply_response_format + + def _raise(_): + raise ValueError("bad schema") + + fake_xllamacpp = SimpleNamespace(json_schema_to_grammar=_raise) + monkeypatch.setitem(sys.modules, "xllamacpp", fake_xllamacpp) + + cfg = { + "response_format": { + "type": "json_schema", + "json_schema": { + "schema": { + "type": "object", + "properties": {"b": {"type": "string"}}, + "required": ["b"], + } + }, + } + } + + _apply_response_format(cfg) + + assert "response_format" not in cfg + assert cfg["json_schema"]["required"] == ["b"] + assert "grammar" not in cfg + + +def test_apply_response_format_ignores_non_schema(monkeypatch): + from xinference.model.llm.llama_cpp.core import _apply_response_format + + cfg = {"response_format": {"type": "json_object"}} + _apply_response_format(cfg) + assert "grammar" not in cfg + assert "json_schema" not in cfg From 288c99ae7c5c27fd92c3c1abcc21ff9dad03aa30 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 1 Dec 2025 10:56:09 +0800 Subject: [PATCH 04/18] add new xllamacpp unit test --- .../llm/llama_cpp/tests/test_structured.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/xinference/model/llm/llama_cpp/tests/test_structured.py b/xinference/model/llm/llama_cpp/tests/test_structured.py index ed64b3a84b..6a1e5367c4 100644 --- a/xinference/model/llm/llama_cpp/tests/test_structured.py +++ b/xinference/model/llm/llama_cpp/tests/test_structured.py @@ -64,3 +64,40 @@ def test_apply_response_format_ignores_non_schema(monkeypatch): _apply_response_format(cfg) assert "grammar" not in cfg assert "json_schema" not in cfg + + +def test_apply_response_format_uses_real_xllamacpp_if_available(): + import importlib.util + + import pytest + + if importlib.util.find_spec("xllamacpp") is None: + pytest.skip("xllamacpp not installed") + + import importlib + + xllamacpp = importlib.import_module("xllamacpp") + if not hasattr(xllamacpp, "json_schema_to_grammar"): + pytest.skip("xllamacpp does not expose json_schema_to_grammar") + + from xinference.model.llm.llama_cpp.core import _apply_response_format + + cfg = { + "response_format": { + "type": "json_schema", + "json_schema": { + "schema": { + "type": "object", + "properties": {"c": {"type": "integer"}}, + "required": ["c"], + } + }, + } + } + + _apply_response_format(cfg) + + assert "response_format" not in cfg + # Real xllamacpp should attach grammar alongside json_schema + assert "json_schema" in cfg + assert "grammar" in cfg and cfg["grammar"] From 1dbc12df45940e30c00164a57417fec326753606 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 8 Dec 2025 15:56:06 +0800 Subject: [PATCH 05/18] test CI thread error --- .github/workflows/python.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index c8a0728e0e..b1f8ed3d25 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -135,7 +135,7 @@ jobs: sudo rm -rf 
"$AGENT_TOOLSDIRECTORY" fi pip install -e ".[dev]" - pip install "xllamacpp>=0.2.0" + pip install "xllamacpp==0.2.5" if [ "$MODULE" == "metal" ]; then conda install -c conda-forge "ffmpeg<7" pip install "mlx>=0.22.0" From 464ca84738f4738d204af4d9f148b59396d3c235 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 8 Dec 2025 16:22:29 +0800 Subject: [PATCH 06/18] test CI thread error --- .github/workflows/python.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index b1f8ed3d25..574e5c122a 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -135,7 +135,8 @@ jobs: sudo rm -rf "$AGENT_TOOLSDIRECTORY" fi pip install -e ".[dev]" - pip install "xllamacpp==0.2.5" + CMAKE_ARGS="-DGGML_NATIVE=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" \ + pip install --no-binary xllamacpp xllamacpp>=0.2.6 if [ "$MODULE" == "metal" ]; then conda install -c conda-forge "ffmpeg<7" pip install "mlx>=0.22.0" From 4739b39f4a30abe622c3c1c1cfce146c6f404c15 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 8 Dec 2025 16:32:31 +0800 Subject: [PATCH 07/18] test CI thread error --- .github/workflows/python.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 574e5c122a..e4ae21c3ef 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -133,10 +133,12 @@ jobs: sudo rm -rf /opt/ghc sudo rm -rf "/usr/local/share/boost" sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo apt-get update + sudo apt-get install -y build-essential cmake ninja-build fi pip install -e ".[dev]" - CMAKE_ARGS="-DGGML_NATIVE=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" \ - pip install --no-binary xllamacpp xllamacpp>=0.2.6 + CMAKE_ARGS="-DLLAMA_NATIVE=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_FMA=ON -DLLAMA_F16C=ON" \ + pip install --no-binary xllamacpp xllamacpp>=0.2.0 if [ "$MODULE" == "metal" ]; then conda install -c conda-forge "ffmpeg<7" pip install "mlx>=0.22.0" From 7ad1c71b2ce62abbee9cf46c7e37c00edd404006 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 8 Dec 2025 16:53:00 +0800 Subject: [PATCH 08/18] test CI thread error --- .github/workflows/python.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index e4ae21c3ef..45ebf4c708 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -136,8 +136,10 @@ jobs: sudo apt-get update sudo apt-get install -y build-essential cmake ninja-build fi + pip install -U pip setuptools wheel scikit-build-core ninja cmake pip install -e ".[dev]" - CMAKE_ARGS="-DLLAMA_NATIVE=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_FMA=ON -DLLAMA_F16C=ON" \ + # Build xllamacpp 0.2.6 from source with safe CPU flags (no AVX512/native) to avoid SIGILL + CMAKE_ARGS="-DLLAMA_NATIVE=OFF -DLLAMA_AVX512=OFF" \ pip install --no-binary xllamacpp xllamacpp>=0.2.0 if [ "$MODULE" == "metal" ]; then conda install -c conda-forge "ffmpeg<7" From de7c64e268e521dac7d85b3641ed61d19973fd62 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 8 Dec 2025 17:04:37 +0800 Subject: [PATCH 09/18] test CI thread error --- .github/workflows/python.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml 
index 45ebf4c708..49ecbf1fca 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -138,9 +138,8 @@ jobs: fi pip install -U pip setuptools wheel scikit-build-core ninja cmake pip install -e ".[dev]" - # Build xllamacpp 0.2.6 from source with safe CPU flags (no AVX512/native) to avoid SIGILL CMAKE_ARGS="-DLLAMA_NATIVE=OFF -DLLAMA_AVX512=OFF" \ - pip install --no-binary xllamacpp xllamacpp>=0.2.0 + pip install --no-binary xllamacpp "xllamacpp>=0.2.0" if [ "$MODULE" == "metal" ]; then conda install -c conda-forge "ffmpeg<7" pip install "mlx>=0.22.0" From 1762822889be8c3dbd5ca9a97ceeadae0e71317e Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 8 Dec 2025 17:11:39 +0800 Subject: [PATCH 10/18] test CI thread error --- .github/workflows/python.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 49ecbf1fca..ed1e370c6c 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -138,8 +138,7 @@ jobs: fi pip install -U pip setuptools wheel scikit-build-core ninja cmake pip install -e ".[dev]" - CMAKE_ARGS="-DLLAMA_NATIVE=OFF -DLLAMA_AVX512=OFF" \ - pip install --no-binary xllamacpp "xllamacpp>=0.2.0" + pip install -e xllamacpp>=0.2.0 if [ "$MODULE" == "metal" ]; then conda install -c conda-forge "ffmpeg<7" pip install "mlx>=0.22.0" From 6314f43cbd338db246f437ef2f307782195de47a Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 8 Dec 2025 17:20:49 +0800 Subject: [PATCH 11/18] test CI thread error --- .github/workflows/python.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index ed1e370c6c..869817e7a6 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -138,7 +138,13 @@ jobs: fi pip install -U pip setuptools wheel scikit-build-core ninja cmake pip install -e ".[dev]" - pip install -e xllamacpp>=0.2.0 + # Linux builds from source with safe CPU flags; other OS use prebuilt wheels + if [ "$OS" == "ubuntu-latest" ]; then + CMAKE_ARGS="-DLLAMA_NATIVE=OFF -DLLAMA_AVX512=OFF" \ + pip install --no-binary xllamacpp "xllamacpp>=0.2.0" + else + pip install "xllamacpp>=0.2.0" + fi if [ "$MODULE" == "metal" ]; then conda install -c conda-forge "ffmpeg<7" pip install "mlx>=0.22.0" From e3f4f0f3a6b4511ec83b46579bbd201a457bb646 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 8 Dec 2025 17:32:05 +0800 Subject: [PATCH 12/18] test CI thread error --- .github/workflows/python.yaml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 869817e7a6..ed1e370c6c 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -138,13 +138,7 @@ jobs: fi pip install -U pip setuptools wheel scikit-build-core ninja cmake pip install -e ".[dev]" - # Linux builds from source with safe CPU flags; other OS use prebuilt wheels - if [ "$OS" == "ubuntu-latest" ]; then - CMAKE_ARGS="-DLLAMA_NATIVE=OFF -DLLAMA_AVX512=OFF" \ - pip install --no-binary xllamacpp "xllamacpp>=0.2.0" - else - pip install "xllamacpp>=0.2.0" - fi + pip install -e xllamacpp>=0.2.0 if [ "$MODULE" == "metal" ]; then conda install -c conda-forge "ffmpeg<7" pip install "mlx>=0.22.0" From 3bd495ad7c7d2f1e91607bf8e5773e457e716fb0 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 8 Dec 2025 17:37:32 +0800 Subject: 
[PATCH 13/18] test CI thread error

---
 .github/workflows/python.yaml | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
index ed1e370c6c..c8a0728e0e 100644
--- a/.github/workflows/python.yaml
+++ b/.github/workflows/python.yaml
@@ -133,12 +133,9 @@ jobs:
             sudo rm -rf /opt/ghc
             sudo rm -rf "/usr/local/share/boost"
             sudo rm -rf "$AGENT_TOOLSDIRECTORY"
-            sudo apt-get update
-            sudo apt-get install -y build-essential cmake ninja-build
           fi
-          pip install -U pip setuptools wheel scikit-build-core ninja cmake
           pip install -e ".[dev]"
-          pip install -e xllamacpp>=0.2.0
+          pip install "xllamacpp>=0.2.0"
           if [ "$MODULE" == "metal" ]; then
             conda install -c conda-forge "ffmpeg<7"
             pip install "mlx>=0.22.0"

From 3cae81e854b7d8896b1ea165b3f8e5935324986c Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Thu, 11 Dec 2025 15:06:16 +0800
Subject: [PATCH 14/18] add real test for llama cpp json schema

---
 .../llm/llama_cpp/tests/test_structured.py   | 122 +++++++++++++++---
 1 file changed, 107 insertions(+), 15 deletions(-)

diff --git a/xinference/model/llm/llama_cpp/tests/test_structured.py b/xinference/model/llm/llama_cpp/tests/test_structured.py
index 6a1e5367c4..d1b50b82ee 100644
--- a/xinference/model/llm/llama_cpp/tests/test_structured.py
+++ b/xinference/model/llm/llama_cpp/tests/test_structured.py
@@ -1,10 +1,63 @@
+import importlib
+import importlib.util
+import json
 import sys
+from enum import Enum
 from types import SimpleNamespace
+import pytest
+from pydantic import BaseModel
+
+from ..core import _apply_response_format
+
+
+class CarType(str, Enum):
+    sedan = "sedan"
+    suv = "SuV"
+    truck = "Truck"
+    coupe = "Coupe"
+
+
+class CarDescription(BaseModel):
+    brand: str
+    model: str
+    car_type: CarType
+
+
+def _load_json_from_message(message) -> dict:
+    def _strip_think(text: str) -> str:
+        stripped = text.lstrip()
+        if stripped.startswith("<think>"):
+            if "</think>" in stripped:
+                stripped = stripped.split("</think>", 1)[1]
+            else:
+                stripped = stripped.split("<think>", 1)[1]
+        return stripped.lstrip()
+
+    raw_content = message.content
+    if isinstance(raw_content, str):
+        return json.loads(_strip_think(raw_content))
+
+    if isinstance(raw_content, list):
+        text_blocks = []
+        for block in raw_content:
+            if isinstance(block, dict):
+                if block.get("type") == "text" and "text" in block:
+                    text_blocks.append(_strip_think(block["text"]))
+                continue
+
+            block_type = getattr(block, "type", None)
+            block_text = getattr(block, "text", None)
+            if block_type == "text" and block_text:
+                text_blocks.append(_strip_think(block_text))
+
+        if text_blocks:
+            return json.loads("".join(text_blocks))
+
+    pytest.fail(f"Unexpected message content format: {raw_content!r}")
-def test_apply_response_format_sets_grammar(monkeypatch):
-    from xinference.model.llm.llama_cpp.core import _apply_response_format
+def test_apply_response_format_sets_grammar(monkeypatch):
     fake_xllamacpp = SimpleNamespace(json_schema_to_grammar=lambda schema: "GRAMMAR")
     monkeypatch.setitem(sys.modules, "xllamacpp", fake_xllamacpp)
@@ -29,8 +82,6 @@ def test_apply_response_format_sets_grammar(monkeypatch):
 def test_apply_response_format_handles_conversion_failure(monkeypatch):
-    from xinference.model.llm.llama_cpp.core import _apply_response_format
-
     def _raise(_):
         raise ValueError("bad schema")
     fake_xllamacpp = SimpleNamespace(json_schema_to_grammar=_raise)
     monkeypatch.setitem(sys.modules, "xllamacpp", fake_xllamacpp)
@@ -58,8 +109,6 @@ def _raise(_):
 def test_apply_response_format_ignores_non_schema(monkeypatch):
-    from xinference.model.llm.llama_cpp.core import _apply_response_format
-
     cfg = {"response_format":
{"type": "json_object"}} _apply_response_format(cfg) assert "grammar" not in cfg @@ -67,21 +116,12 @@ def test_apply_response_format_ignores_non_schema(monkeypatch): def test_apply_response_format_uses_real_xllamacpp_if_available(): - import importlib.util - - import pytest - if importlib.util.find_spec("xllamacpp") is None: pytest.skip("xllamacpp not installed") - - import importlib - xllamacpp = importlib.import_module("xllamacpp") if not hasattr(xllamacpp, "json_schema_to_grammar"): pytest.skip("xllamacpp does not expose json_schema_to_grammar") - from xinference.model.llm.llama_cpp.core import _apply_response_format - cfg = { "response_format": { "type": "json_schema", @@ -101,3 +141,55 @@ def test_apply_response_format_uses_real_xllamacpp_if_available(): # Real xllamacpp should attach grammar alongside json_schema assert "json_schema" in cfg assert "grammar" in cfg and cfg["grammar"] + + +def test_llamacpp_qwen3_json_schema(setup): + pytest.importorskip( + "llama_cpp", reason="llama_cpp backend is required for this test" + ) + import openai + + from xinference.client import Client + + endpoint, _ = setup + client = Client(endpoint) + model_uid = client.launch_model( + model_name="qwen3", + model_engine="llama.cpp", + model_size_in_billions="0_6", + model_format="ggufv2", + quantization="Q4_K_M", + n_gpu=None, + ) + + try: + api_client = openai.Client(api_key="not empty", base_url=f"{endpoint}/v1") + completion = api_client.chat.completions.create( + model=model_uid, + messages=[ + { + "role": "user", + "content": ( + "Generate a JSON containing the brand, model, and car_type of" + " an iconic 90s car." + ), + } + ], + temperature=0, + max_tokens=128, + response_format={ + "type": "json_schema", + "json_schema": { + "name": "car-description", + "schema": CarDescription.model_json_schema(), + }, + }, + ) + + parsed = _load_json_from_message(completion.choices[0].message) + car_description = CarDescription.model_validate(parsed) + assert car_description.brand + assert car_description.model + finally: + if model_uid is not None: + client.terminate_model(model_uid) From 7cb457bdc1db69c157c71fd8643e1c0d1d5f31b1 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Thu, 11 Dec 2025 15:13:17 +0800 Subject: [PATCH 15/18] add real test for llama cpp json schema --- xinference/model/llm/llama_cpp/tests/test_structured.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xinference/model/llm/llama_cpp/tests/test_structured.py b/xinference/model/llm/llama_cpp/tests/test_structured.py index d1b50b82ee..1f6b54d06a 100644 --- a/xinference/model/llm/llama_cpp/tests/test_structured.py +++ b/xinference/model/llm/llama_cpp/tests/test_structured.py @@ -4,6 +4,7 @@ import sys from enum import Enum from types import SimpleNamespace +from typing import Any, Dict import pytest from pydantic import BaseModel @@ -24,7 +25,7 @@ class CarDescription(BaseModel): car_type: CarType -def _load_json_from_message(message) -> dict: +def _load_json_from_message(message: Any) -> Dict[str, Any]: def _strip_think(text: str) -> str: stripped = text.lstrip() if stripped.startswith(""): @@ -55,6 +56,7 @@ def _strip_think(text: str) -> str: return json.loads("".join(text_blocks)) pytest.fail(f"Unexpected message content format: {raw_content!r}") + raise AssertionError("Unreachable") def test_apply_response_format_sets_grammar(monkeypatch): From fb244c54a89621315bcda30d8957f22cff243b1a Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Fri, 12 Dec 2025 10:49:57 +0800 Subject: 
[PATCH 16/18] fix test --- xinference/model/llm/llama_cpp/tests/test_structured.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/xinference/model/llm/llama_cpp/tests/test_structured.py b/xinference/model/llm/llama_cpp/tests/test_structured.py index 1f6b54d06a..c8fd45739a 100644 --- a/xinference/model/llm/llama_cpp/tests/test_structured.py +++ b/xinference/model/llm/llama_cpp/tests/test_structured.py @@ -6,9 +6,12 @@ from types import SimpleNamespace from typing import Any, Dict +import openai import pytest from pydantic import BaseModel +from xinference.client import Client + from ..core import _apply_response_format @@ -146,12 +149,6 @@ def test_apply_response_format_uses_real_xllamacpp_if_available(): def test_llamacpp_qwen3_json_schema(setup): - pytest.importorskip( - "llama_cpp", reason="llama_cpp backend is required for this test" - ) - import openai - - from xinference.client import Client endpoint, _ = setup client = Client(endpoint) From 3642ea08b73a0bab0507962079d0954992237ccd Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Fri, 12 Dec 2025 10:50:28 +0800 Subject: [PATCH 17/18] fix test --- xinference/model/llm/llama_cpp/tests/test_structured.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xinference/model/llm/llama_cpp/tests/test_structured.py b/xinference/model/llm/llama_cpp/tests/test_structured.py index c8fd45739a..94c15fd059 100644 --- a/xinference/model/llm/llama_cpp/tests/test_structured.py +++ b/xinference/model/llm/llama_cpp/tests/test_structured.py @@ -149,7 +149,6 @@ def test_apply_response_format_uses_real_xllamacpp_if_available(): def test_llamacpp_qwen3_json_schema(setup): - endpoint, _ = setup client = Client(endpoint) model_uid = client.launch_model( From 268b227eeff12492074f8d844fa690372e294e82 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Fri, 12 Dec 2025 11:40:21 +0800 Subject: [PATCH 18/18] fix ci error --- xinference/model/llm/llama_cpp/core.py | 7 +++++-- xinference/model/llm/llama_cpp/tests/test_structured.py | 6 +++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py index 33244d4ab9..3e619f0e42 100644 --- a/xinference/model/llm/llama_cpp/core.py +++ b/xinference/model/llm/llama_cpp/core.py @@ -51,10 +51,13 @@ def _apply_response_format(generate_config: Dict[str, Any]) -> None: schema_dict = normalized.get("schema_dict") if not schema_dict: return - generate_config.setdefault("json_schema", schema_dict) grammar = _schema_to_grammar(schema_dict) if grammar: - generate_config.setdefault("grammar", grammar) + # xllamacpp rejects configs containing both json_schema and grammar + generate_config.pop("json_schema", None) + generate_config["grammar"] = grammar + else: + generate_config.setdefault("json_schema", schema_dict) class _Done: diff --git a/xinference/model/llm/llama_cpp/tests/test_structured.py b/xinference/model/llm/llama_cpp/tests/test_structured.py index 94c15fd059..1f2c5bc499 100644 --- a/xinference/model/llm/llama_cpp/tests/test_structured.py +++ b/xinference/model/llm/llama_cpp/tests/test_structured.py @@ -82,7 +82,7 @@ def test_apply_response_format_sets_grammar(monkeypatch): _apply_response_format(cfg) assert "response_format" not in cfg - assert cfg["json_schema"]["required"] == ["a"] + assert "json_schema" not in cfg assert cfg["grammar"] == "GRAMMAR" @@ -143,8 +143,8 @@ def test_apply_response_format_uses_real_xllamacpp_if_available(): _apply_response_format(cfg) assert 
"response_format" not in cfg - # Real xllamacpp should attach grammar alongside json_schema - assert "json_schema" in cfg + # Real xllamacpp should prefer grammar to avoid passing both + assert "json_schema" not in cfg assert "grammar" in cfg and cfg["grammar"]