From 757af7ce14a4d24ac20556d6248e546c17d6a268 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Tue, 25 Nov 2025 12:29:24 +0800
Subject: [PATCH 01/18] FEAT: add llama_cpp json schema output
---
xinference/model/llm/llama_cpp/core.py | 39 ++++++++++++++++++++++++--
xinference/model/llm/utils.py | 26 +++++++++++++++++
2 files changed, 62 insertions(+), 3 deletions(-)
diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py
index 3f06d2f899..e87f10bb00 100644
--- a/xinference/model/llm/llama_cpp/core.py
+++ b/xinference/model/llm/llama_cpp/core.py
@@ -16,7 +16,7 @@
import os
import pprint
import queue
-from typing import Iterator, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
from packaging import version
@@ -25,11 +25,42 @@
from ...utils import check_dependency_available
from ..core import LLM, chat_context_var
from ..llm_family import LLMFamilyV2, LLMSpecV1
-from ..utils import ChatModelMixin
+from ..utils import ChatModelMixin, normalize_response_format
logger = logging.getLogger(__name__)
+def _schema_to_gbnf(schema: Dict[str, Any]) -> Optional[str]:
+ try:
+ from llama_cpp.llama_grammar import (
+ json_schema_to_gbnf, # type: ignore[attr-defined]
+ )
+ except Exception as e: # pragma: no cover - optional dependency
+ logger.warning(
+ "json_schema provided but llama_cpp grammar support missing: %s", e
+ )
+ return None
+ try:
+ return json_schema_to_gbnf(schema)
+ except Exception as e: # pragma: no cover - conversion failure
+ logger.warning("Failed to convert json_schema to GBNF for llama.cpp: %s", e)
+ return None
+
+
+def _apply_response_format(generate_config: Dict[str, Any]) -> None:
+ response_format = generate_config.pop("response_format", None)
+ normalized = normalize_response_format(response_format)
+ if not normalized or normalized.get("type") != "json_schema":
+ return
+ schema_dict = normalized.get("schema_dict")
+ if not schema_dict:
+ return
+ generate_config.setdefault("json_schema", schema_dict)
+ grammar = _schema_to_gbnf(schema_dict)
+ if grammar:
+ generate_config.setdefault("grammar", grammar)
+
+
class _Done:
pass
@@ -49,7 +80,7 @@ def __init__(
model_path: str,
llamacpp_model_config: Optional[dict] = None,
):
- super().__init__(model_uid, model_family, model_path)
+ super().__init__(model_uid, model_family, model_path) # type: ignore[call-arg]
self._llamacpp_model_config = self._sanitize_model_config(llamacpp_model_config)
self._llm = None
self._executor: Optional[concurrent.futures.ThreadPoolExecutor] = None
@@ -246,6 +277,7 @@ def generate(
generate_config = generate_config or {}
if not generate_config.get("max_tokens") and XINFERENCE_MAX_TOKENS:
generate_config["max_tokens"] = XINFERENCE_MAX_TOKENS
+ _apply_response_format(generate_config)
stream = generate_config.get("stream", False)
q: queue.Queue = queue.Queue()
@@ -305,6 +337,7 @@ def chat(
generate_config = generate_config or {}
if not generate_config.get("max_tokens") and XINFERENCE_MAX_TOKENS:
generate_config["max_tokens"] = XINFERENCE_MAX_TOKENS
+ _apply_response_format(generate_config)
stream = generate_config.get("stream", False)
chat_template_kwargs = (
diff --git a/xinference/model/llm/utils.py b/xinference/model/llm/utils.py
index 420598dbba..2fc54d9d3a 100644
--- a/xinference/model/llm/utils.py
+++ b/xinference/model/llm/utils.py
@@ -1235,6 +1235,32 @@ def get_stop_token_ids_from_config_file(model_path: str) -> Optional[List[int]]:
return None
+def normalize_response_format(
+ response_format: Optional[Dict[str, Any]],
+) -> Optional[Dict[str, Any]]:
+ """
+ Normalize OpenAI-style response_format into a simple dict.
+ Returns:
+ None if missing/unsupported, or a dict with keys:
+ - type: "json_schema" | "json_object"
+ - schema_dict: dict (only for json_schema)
+ """
+ if not response_format or not isinstance(response_format, dict):
+ return None
+
+ fmt_type = response_format.get("type")
+ if fmt_type not in ("json_schema", "json_object"):
+ return None
+
+ normalized: Dict[str, Any] = {"type": fmt_type}
+ if fmt_type == "json_schema":
+ schema_block = response_format.get("json_schema") or {}
+ schema_dict = schema_block.get("schema_") or schema_block.get("schema")
+ if schema_dict:
+ normalized["schema_dict"] = schema_dict
+ return normalized
+
+
def parse_messages(messages: List[Dict]) -> Tuple:
"""
Some older models still follow the old way of parameter passing.
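Editor's note: for orientation, a minimal usage sketch (not part of the patch) of the two helpers introduced above. The response_format payload mirrors the OpenAI shape that normalize_response_format handles; the effect on generate_config is described per _apply_response_format in the core.py hunk.

    # Sketch only: composing the helpers added by this patch.
    from xinference.model.llm.utils import normalize_response_format

    response_format = {
        "type": "json_schema",
        "json_schema": {
            "name": "person",
            "schema": {
                "type": "object",
                "properties": {"name": {"type": "string"}},
                "required": ["name"],
            },
        },
    }

    normalized = normalize_response_format(response_format)
    # -> {"type": "json_schema", "schema_dict": {"type": "object", ...}}
    assert normalized["schema_dict"]["required"] == ["name"]

    # In core.py, _apply_response_format(generate_config) pops "response_format",
    # sets generate_config["json_schema"] to the schema dict and, when llama_cpp's
    # json_schema_to_gbnf is importable and conversion succeeds, also sets ["grammar"].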
From 0501936a7d2d0647be51e644b9458b250a2f1069 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Thu, 27 Nov 2025 11:26:50 +0800
Subject: [PATCH 02/18] switch from llama_cpp_python to xllamacpp
---
xinference/model/llm/llama_cpp/core.py | 16 ++++++----------
1 file changed, 6 insertions(+), 10 deletions(-)
diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py
index e87f10bb00..33244d4ab9 100644
--- a/xinference/model/llm/llama_cpp/core.py
+++ b/xinference/model/llm/llama_cpp/core.py
@@ -30,20 +30,16 @@
logger = logging.getLogger(__name__)
-def _schema_to_gbnf(schema: Dict[str, Any]) -> Optional[str]:
+def _schema_to_grammar(schema: Dict[str, Any]) -> Optional[str]:
try:
- from llama_cpp.llama_grammar import (
- json_schema_to_gbnf, # type: ignore[attr-defined]
- )
+ import xllamacpp
except Exception as e: # pragma: no cover - optional dependency
- logger.warning(
- "json_schema provided but llama_cpp grammar support missing: %s", e
- )
+ logger.warning("json_schema provided but xllamacpp missing: %s", e)
return None
try:
- return json_schema_to_gbnf(schema)
+ return xllamacpp.json_schema_to_grammar(schema) # type: ignore[attr-defined]
except Exception as e: # pragma: no cover - conversion failure
- logger.warning("Failed to convert json_schema to GBNF for llama.cpp: %s", e)
+ logger.warning("Failed to convert json_schema to grammar for xllamacpp: %s", e)
return None
@@ -56,7 +52,7 @@ def _apply_response_format(generate_config: Dict[str, Any]) -> None:
if not schema_dict:
return
generate_config.setdefault("json_schema", schema_dict)
- grammar = _schema_to_gbnf(schema_dict)
+ grammar = _schema_to_grammar(schema_dict)
if grammar:
generate_config.setdefault("grammar", grammar)
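Editor's note: a hedged sketch (not part of the patch) exercising the helper this commit reroutes through xllamacpp. If xllamacpp or its json_schema_to_grammar attribute is missing, or conversion fails, the helper logs a warning and returns None, and callers fall back to json_schema alone.

    # Sketch only: the conversion path after the switch to xllamacpp.
    from xinference.model.llm.llama_cpp.core import _schema_to_grammar

    schema = {
        "type": "object",
        "properties": {"answer": {"type": "string"}},
        "required": ["answer"],
    }
    grammar = _schema_to_grammar(schema)
    if grammar is None:
        print("xllamacpp unavailable or conversion failed; json_schema is used instead")
    else:
        print(grammar.splitlines()[0])  # first line of the GBNF-style grammar text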
From 0c0eead87fbeb96d823553235d53933405167aed Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Thu, 27 Nov 2025 14:56:55 +0800
Subject: [PATCH 03/18] add a test for structured output
---
.../llm/llama_cpp/tests/test_structured.py | 66 +++++++++++++++++++
1 file changed, 66 insertions(+)
create mode 100644 xinference/model/llm/llama_cpp/tests/test_structured.py
diff --git a/xinference/model/llm/llama_cpp/tests/test_structured.py b/xinference/model/llm/llama_cpp/tests/test_structured.py
new file mode 100644
index 0000000000..ed64b3a84b
--- /dev/null
+++ b/xinference/model/llm/llama_cpp/tests/test_structured.py
@@ -0,0 +1,66 @@
+import sys
+from types import SimpleNamespace
+
+
+def test_apply_response_format_sets_grammar(monkeypatch):
+ from xinference.model.llm.llama_cpp.core import _apply_response_format
+
+ fake_xllamacpp = SimpleNamespace(json_schema_to_grammar=lambda schema: "GRAMMAR")
+ monkeypatch.setitem(sys.modules, "xllamacpp", fake_xllamacpp)
+
+ cfg = {
+ "response_format": {
+ "type": "json_schema",
+ "json_schema": {
+ "schema": {
+ "type": "object",
+ "properties": {"a": {"type": "string"}},
+ "required": ["a"],
+ }
+ },
+ }
+ }
+
+ _apply_response_format(cfg)
+
+ assert "response_format" not in cfg
+ assert cfg["json_schema"]["required"] == ["a"]
+ assert cfg["grammar"] == "GRAMMAR"
+
+
+def test_apply_response_format_handles_conversion_failure(monkeypatch):
+ from xinference.model.llm.llama_cpp.core import _apply_response_format
+
+ def _raise(_):
+ raise ValueError("bad schema")
+
+ fake_xllamacpp = SimpleNamespace(json_schema_to_grammar=_raise)
+ monkeypatch.setitem(sys.modules, "xllamacpp", fake_xllamacpp)
+
+ cfg = {
+ "response_format": {
+ "type": "json_schema",
+ "json_schema": {
+ "schema": {
+ "type": "object",
+ "properties": {"b": {"type": "string"}},
+ "required": ["b"],
+ }
+ },
+ }
+ }
+
+ _apply_response_format(cfg)
+
+ assert "response_format" not in cfg
+ assert cfg["json_schema"]["required"] == ["b"]
+ assert "grammar" not in cfg
+
+
+def test_apply_response_format_ignores_non_schema(monkeypatch):
+ from xinference.model.llm.llama_cpp.core import _apply_response_format
+
+ cfg = {"response_format": {"type": "json_object"}}
+ _apply_response_format(cfg)
+ assert "grammar" not in cfg
+ assert "json_schema" not in cfg
From 288c99ae7c5c27fd92c3c1abcc21ff9dad03aa30 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Mon, 1 Dec 2025 10:56:09 +0800
Subject: [PATCH 04/18] add new xllamacpp unit test
---
.../llm/llama_cpp/tests/test_structured.py | 37 +++++++++++++++++++
1 file changed, 37 insertions(+)
diff --git a/xinference/model/llm/llama_cpp/tests/test_structured.py b/xinference/model/llm/llama_cpp/tests/test_structured.py
index ed64b3a84b..6a1e5367c4 100644
--- a/xinference/model/llm/llama_cpp/tests/test_structured.py
+++ b/xinference/model/llm/llama_cpp/tests/test_structured.py
@@ -64,3 +64,40 @@ def test_apply_response_format_ignores_non_schema(monkeypatch):
_apply_response_format(cfg)
assert "grammar" not in cfg
assert "json_schema" not in cfg
+
+
+def test_apply_response_format_uses_real_xllamacpp_if_available():
+ import importlib.util
+
+ import pytest
+
+ if importlib.util.find_spec("xllamacpp") is None:
+ pytest.skip("xllamacpp not installed")
+
+ import importlib
+
+ xllamacpp = importlib.import_module("xllamacpp")
+ if not hasattr(xllamacpp, "json_schema_to_grammar"):
+ pytest.skip("xllamacpp does not expose json_schema_to_grammar")
+
+ from xinference.model.llm.llama_cpp.core import _apply_response_format
+
+ cfg = {
+ "response_format": {
+ "type": "json_schema",
+ "json_schema": {
+ "schema": {
+ "type": "object",
+ "properties": {"c": {"type": "integer"}},
+ "required": ["c"],
+ }
+ },
+ }
+ }
+
+ _apply_response_format(cfg)
+
+ assert "response_format" not in cfg
+ # Real xllamacpp should attach grammar alongside json_schema
+ assert "json_schema" in cfg
+ assert "grammar" in cfg and cfg["grammar"]
From 1dbc12df45940e30c00164a57417fec326753606 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Mon, 8 Dec 2025 15:56:06 +0800
Subject: [PATCH 05/18] test CI thread error
---
.github/workflows/python.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
index c8a0728e0e..b1f8ed3d25 100644
--- a/.github/workflows/python.yaml
+++ b/.github/workflows/python.yaml
@@ -135,7 +135,7 @@ jobs:
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
fi
pip install -e ".[dev]"
- pip install "xllamacpp>=0.2.0"
+ pip install "xllamacpp==0.2.5"
if [ "$MODULE" == "metal" ]; then
conda install -c conda-forge "ffmpeg<7"
pip install "mlx>=0.22.0"
From 464ca84738f4738d204af4d9f148b59396d3c235 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Mon, 8 Dec 2025 16:22:29 +0800
Subject: [PATCH 06/18] test CI thread error
---
.github/workflows/python.yaml | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
index b1f8ed3d25..574e5c122a 100644
--- a/.github/workflows/python.yaml
+++ b/.github/workflows/python.yaml
@@ -135,7 +135,8 @@ jobs:
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
fi
pip install -e ".[dev]"
- pip install "xllamacpp==0.2.5"
+ CMAKE_ARGS="-DGGML_NATIVE=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" \
+ pip install --no-binary xllamacpp xllamacpp>=0.2.6
if [ "$MODULE" == "metal" ]; then
conda install -c conda-forge "ffmpeg<7"
pip install "mlx>=0.22.0"
From 4739b39f4a30abe622c3c1c1cfce146c6f404c15 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Mon, 8 Dec 2025 16:32:31 +0800
Subject: [PATCH 07/18] test CI thread error
---
.github/workflows/python.yaml | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
index 574e5c122a..e4ae21c3ef 100644
--- a/.github/workflows/python.yaml
+++ b/.github/workflows/python.yaml
@@ -133,10 +133,12 @@ jobs:
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+ sudo apt-get update
+ sudo apt-get install -y build-essential cmake ninja-build
fi
pip install -e ".[dev]"
- CMAKE_ARGS="-DGGML_NATIVE=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" \
- pip install --no-binary xllamacpp xllamacpp>=0.2.6
+ CMAKE_ARGS="-DLLAMA_NATIVE=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_FMA=ON -DLLAMA_F16C=ON" \
+ pip install --no-binary xllamacpp xllamacpp>=0.2.0
if [ "$MODULE" == "metal" ]; then
conda install -c conda-forge "ffmpeg<7"
pip install "mlx>=0.22.0"
From 7ad1c71b2ce62abbee9cf46c7e37c00edd404006 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Mon, 8 Dec 2025 16:53:00 +0800
Subject: [PATCH 08/18] test CI thread error
---
.github/workflows/python.yaml | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
index e4ae21c3ef..45ebf4c708 100644
--- a/.github/workflows/python.yaml
+++ b/.github/workflows/python.yaml
@@ -136,8 +136,10 @@ jobs:
sudo apt-get update
sudo apt-get install -y build-essential cmake ninja-build
fi
+ pip install -U pip setuptools wheel scikit-build-core ninja cmake
pip install -e ".[dev]"
- CMAKE_ARGS="-DLLAMA_NATIVE=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_FMA=ON -DLLAMA_F16C=ON" \
+ # Build xllamacpp 0.2.6 from source with safe CPU flags (no AVX512/native) to avoid SIGILL
+ CMAKE_ARGS="-DLLAMA_NATIVE=OFF -DLLAMA_AVX512=OFF" \
pip install --no-binary xllamacpp xllamacpp>=0.2.0
if [ "$MODULE" == "metal" ]; then
conda install -c conda-forge "ffmpeg<7"
From de7c64e268e521dac7d85b3641ed61d19973fd62 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Mon, 8 Dec 2025 17:04:37 +0800
Subject: [PATCH 09/18] test CI thread error
---
.github/workflows/python.yaml | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
index 45ebf4c708..49ecbf1fca 100644
--- a/.github/workflows/python.yaml
+++ b/.github/workflows/python.yaml
@@ -138,9 +138,8 @@ jobs:
fi
pip install -U pip setuptools wheel scikit-build-core ninja cmake
pip install -e ".[dev]"
- # Build xllamacpp 0.2.6 from source with safe CPU flags (no AVX512/native) to avoid SIGILL
CMAKE_ARGS="-DLLAMA_NATIVE=OFF -DLLAMA_AVX512=OFF" \
- pip install --no-binary xllamacpp xllamacpp>=0.2.0
+ pip install --no-binary xllamacpp "xllamacpp>=0.2.0"
if [ "$MODULE" == "metal" ]; then
conda install -c conda-forge "ffmpeg<7"
pip install "mlx>=0.22.0"
From 1762822889be8c3dbd5ca9a97ceeadae0e71317e Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Mon, 8 Dec 2025 17:11:39 +0800
Subject: [PATCH 10/18] test CI thread error
---
.github/workflows/python.yaml | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
index 49ecbf1fca..ed1e370c6c 100644
--- a/.github/workflows/python.yaml
+++ b/.github/workflows/python.yaml
@@ -138,8 +138,7 @@ jobs:
fi
pip install -U pip setuptools wheel scikit-build-core ninja cmake
pip install -e ".[dev]"
- CMAKE_ARGS="-DLLAMA_NATIVE=OFF -DLLAMA_AVX512=OFF" \
- pip install --no-binary xllamacpp "xllamacpp>=0.2.0"
+ pip install -e xllamacpp>=0.2.0
if [ "$MODULE" == "metal" ]; then
conda install -c conda-forge "ffmpeg<7"
pip install "mlx>=0.22.0"
From 6314f43cbd338db246f437ef2f307782195de47a Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Mon, 8 Dec 2025 17:20:49 +0800
Subject: [PATCH 11/18] test CI thread error
---
.github/workflows/python.yaml | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
index ed1e370c6c..869817e7a6 100644
--- a/.github/workflows/python.yaml
+++ b/.github/workflows/python.yaml
@@ -138,7 +138,13 @@ jobs:
fi
pip install -U pip setuptools wheel scikit-build-core ninja cmake
pip install -e ".[dev]"
- pip install -e xllamacpp>=0.2.0
+ # Linux builds from source with safe CPU flags; other OS use prebuilt wheels
+ if [ "$OS" == "ubuntu-latest" ]; then
+ CMAKE_ARGS="-DLLAMA_NATIVE=OFF -DLLAMA_AVX512=OFF" \
+ pip install --no-binary xllamacpp "xllamacpp>=0.2.0"
+ else
+ pip install "xllamacpp>=0.2.0"
+ fi
if [ "$MODULE" == "metal" ]; then
conda install -c conda-forge "ffmpeg<7"
pip install "mlx>=0.22.0"
From e3f4f0f3a6b4511ec83b46579bbd201a457bb646 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Mon, 8 Dec 2025 17:32:05 +0800
Subject: [PATCH 12/18] test CI thread error
---
.github/workflows/python.yaml | 8 +-------
1 file changed, 1 insertion(+), 7 deletions(-)
diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
index 869817e7a6..ed1e370c6c 100644
--- a/.github/workflows/python.yaml
+++ b/.github/workflows/python.yaml
@@ -138,13 +138,7 @@ jobs:
fi
pip install -U pip setuptools wheel scikit-build-core ninja cmake
pip install -e ".[dev]"
- # Linux builds from source with safe CPU flags; other OS use prebuilt wheels
- if [ "$OS" == "ubuntu-latest" ]; then
- CMAKE_ARGS="-DLLAMA_NATIVE=OFF -DLLAMA_AVX512=OFF" \
- pip install --no-binary xllamacpp "xllamacpp>=0.2.0"
- else
- pip install "xllamacpp>=0.2.0"
- fi
+ pip install -e xllamacpp>=0.2.0
if [ "$MODULE" == "metal" ]; then
conda install -c conda-forge "ffmpeg<7"
pip install "mlx>=0.22.0"
From 3bd495ad7c7d2f1e91607bf8e5773e457e716fb0 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Mon, 8 Dec 2025 17:37:32 +0800
Subject: [PATCH 13/18] test CI thread error
---
.github/workflows/python.yaml | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
index ed1e370c6c..c8a0728e0e 100644
--- a/.github/workflows/python.yaml
+++ b/.github/workflows/python.yaml
@@ -133,12 +133,9 @@ jobs:
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- sudo apt-get update
- sudo apt-get install -y build-essential cmake ninja-build
fi
- pip install -U pip setuptools wheel scikit-build-core ninja cmake
pip install -e ".[dev]"
- pip install -e xllamacpp>=0.2.0
+ pip install "xllamacpp>=0.2.0"
if [ "$MODULE" == "metal" ]; then
conda install -c conda-forge "ffmpeg<7"
pip install "mlx>=0.22.0"
From 3cae81e854b7d8896b1ea165b3f8e5935324986c Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Thu, 11 Dec 2025 15:06:16 +0800
Subject: [PATCH 14/18] add a real test for llama.cpp json schema
---
.../llm/llama_cpp/tests/test_structured.py | 122 +++++++++++++++---
1 file changed, 107 insertions(+), 15 deletions(-)
diff --git a/xinference/model/llm/llama_cpp/tests/test_structured.py b/xinference/model/llm/llama_cpp/tests/test_structured.py
index 6a1e5367c4..d1b50b82ee 100644
--- a/xinference/model/llm/llama_cpp/tests/test_structured.py
+++ b/xinference/model/llm/llama_cpp/tests/test_structured.py
@@ -1,10 +1,63 @@
+import importlib
+import importlib.util
+import json
import sys
+from enum import Enum
from types import SimpleNamespace
+import pytest
+from pydantic import BaseModel
+
+from ..core import _apply_response_format
+
+
+class CarType(str, Enum):
+ sedan = "sedan"
+ suv = "SuV"
+ truck = "Truck"
+ coupe = "Coupe"
+
+
+class CarDescription(BaseModel):
+ brand: str
+ model: str
+ car_type: CarType
+
+
+def _load_json_from_message(message) -> dict:
+ def _strip_think(text: str) -> str:
+ stripped = text.lstrip()
+        if stripped.startswith("<think>"):
+            if "</think>" in stripped:
+                stripped = stripped.split("</think>", 1)[1]
+            else:
+                stripped = stripped.split("<think>", 1)[1]
+ return stripped.lstrip()
+
+ raw_content = message.content
+ if isinstance(raw_content, str):
+ return json.loads(_strip_think(raw_content))
+
+ if isinstance(raw_content, list):
+ text_blocks = []
+ for block in raw_content:
+ if isinstance(block, dict):
+ if block.get("type") == "text" and "text" in block:
+ text_blocks.append(_strip_think(block["text"]))
+ continue
+
+ block_type = getattr(block, "type", None)
+ block_text = getattr(block, "text", None)
+ if block_type == "text" and block_text:
+ text_blocks.append(_strip_think(block_text))
+
+ if text_blocks:
+ return json.loads("".join(text_blocks))
+
+ pytest.fail(f"Unexpected message content format: {raw_content!r}")
-def test_apply_response_format_sets_grammar(monkeypatch):
- from xinference.model.llm.llama_cpp.core import _apply_response_format
+def test_apply_response_format_sets_grammar(monkeypatch):
fake_xllamacpp = SimpleNamespace(json_schema_to_grammar=lambda schema: "GRAMMAR")
monkeypatch.setitem(sys.modules, "xllamacpp", fake_xllamacpp)
@@ -29,8 +82,6 @@ def test_apply_response_format_sets_grammar(monkeypatch):
def test_apply_response_format_handles_conversion_failure(monkeypatch):
- from xinference.model.llm.llama_cpp.core import _apply_response_format
-
def _raise(_):
raise ValueError("bad schema")
@@ -58,8 +109,6 @@ def _raise(_):
def test_apply_response_format_ignores_non_schema(monkeypatch):
- from xinference.model.llm.llama_cpp.core import _apply_response_format
-
cfg = {"response_format": {"type": "json_object"}}
_apply_response_format(cfg)
assert "grammar" not in cfg
@@ -67,21 +116,12 @@ def test_apply_response_format_ignores_non_schema(monkeypatch):
def test_apply_response_format_uses_real_xllamacpp_if_available():
- import importlib.util
-
- import pytest
-
if importlib.util.find_spec("xllamacpp") is None:
pytest.skip("xllamacpp not installed")
-
- import importlib
-
xllamacpp = importlib.import_module("xllamacpp")
if not hasattr(xllamacpp, "json_schema_to_grammar"):
pytest.skip("xllamacpp does not expose json_schema_to_grammar")
- from xinference.model.llm.llama_cpp.core import _apply_response_format
-
cfg = {
"response_format": {
"type": "json_schema",
@@ -101,3 +141,55 @@ def test_apply_response_format_uses_real_xllamacpp_if_available():
# Real xllamacpp should attach grammar alongside json_schema
assert "json_schema" in cfg
assert "grammar" in cfg and cfg["grammar"]
+
+
+def test_llamacpp_qwen3_json_schema(setup):
+ pytest.importorskip(
+ "llama_cpp", reason="llama_cpp backend is required for this test"
+ )
+ import openai
+
+ from xinference.client import Client
+
+ endpoint, _ = setup
+ client = Client(endpoint)
+ model_uid = client.launch_model(
+ model_name="qwen3",
+ model_engine="llama.cpp",
+ model_size_in_billions="0_6",
+ model_format="ggufv2",
+ quantization="Q4_K_M",
+ n_gpu=None,
+ )
+
+ try:
+ api_client = openai.Client(api_key="not empty", base_url=f"{endpoint}/v1")
+ completion = api_client.chat.completions.create(
+ model=model_uid,
+ messages=[
+ {
+ "role": "user",
+ "content": (
+ "Generate a JSON containing the brand, model, and car_type of"
+ " an iconic 90s car."
+ ),
+ }
+ ],
+ temperature=0,
+ max_tokens=128,
+ response_format={
+ "type": "json_schema",
+ "json_schema": {
+ "name": "car-description",
+ "schema": CarDescription.model_json_schema(),
+ },
+ },
+ )
+
+ parsed = _load_json_from_message(completion.choices[0].message)
+ car_description = CarDescription.model_validate(parsed)
+ assert car_description.brand
+ assert car_description.model
+ finally:
+ if model_uid is not None:
+ client.terminate_model(model_uid)
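Editor's note: as a reference point, a hedged sketch of the content shape the grammar-constrained request above should yield, validated with the pydantic models defined in this test. The concrete car values are illustrative, not real model output.

    # Illustrative only: the JSON shape the json_schema constrains the model to emit.
    import json

    example_content = '{"brand": "Mazda", "model": "MX-5", "car_type": "Coupe"}'
    car = CarDescription.model_validate(json.loads(example_content))
    assert car.car_type is CarType.coupe  # the enum value "Coupe" must match exactly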
From 7cb457bdc1db69c157c71fd8643e1c0d1d5f31b1 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Thu, 11 Dec 2025 15:13:17 +0800
Subject: [PATCH 15/18] add a real test for llama.cpp json schema
---
xinference/model/llm/llama_cpp/tests/test_structured.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/xinference/model/llm/llama_cpp/tests/test_structured.py b/xinference/model/llm/llama_cpp/tests/test_structured.py
index d1b50b82ee..1f6b54d06a 100644
--- a/xinference/model/llm/llama_cpp/tests/test_structured.py
+++ b/xinference/model/llm/llama_cpp/tests/test_structured.py
@@ -4,6 +4,7 @@
import sys
from enum import Enum
from types import SimpleNamespace
+from typing import Any, Dict
import pytest
from pydantic import BaseModel
@@ -24,7 +25,7 @@ class CarDescription(BaseModel):
car_type: CarType
-def _load_json_from_message(message) -> dict:
+def _load_json_from_message(message: Any) -> Dict[str, Any]:
def _strip_think(text: str) -> str:
stripped = text.lstrip()
if stripped.startswith("<think>"):
@@ -55,6 +56,7 @@ def _strip_think(text: str) -> str:
return json.loads("".join(text_blocks))
pytest.fail(f"Unexpected message content format: {raw_content!r}")
+ raise AssertionError("Unreachable")
def test_apply_response_format_sets_grammar(monkeypatch):
From fb244c54a89621315bcda30d8957f22cff243b1a Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Fri, 12 Dec 2025 10:49:57 +0800
Subject: [PATCH 16/18] fix test
---
xinference/model/llm/llama_cpp/tests/test_structured.py | 9 +++------
1 file changed, 3 insertions(+), 6 deletions(-)
diff --git a/xinference/model/llm/llama_cpp/tests/test_structured.py b/xinference/model/llm/llama_cpp/tests/test_structured.py
index 1f6b54d06a..c8fd45739a 100644
--- a/xinference/model/llm/llama_cpp/tests/test_structured.py
+++ b/xinference/model/llm/llama_cpp/tests/test_structured.py
@@ -6,9 +6,12 @@
from types import SimpleNamespace
from typing import Any, Dict
+import openai
import pytest
from pydantic import BaseModel
+from xinference.client import Client
+
from ..core import _apply_response_format
@@ -146,12 +149,6 @@ def test_apply_response_format_uses_real_xllamacpp_if_available():
def test_llamacpp_qwen3_json_schema(setup):
- pytest.importorskip(
- "llama_cpp", reason="llama_cpp backend is required for this test"
- )
- import openai
-
- from xinference.client import Client
endpoint, _ = setup
client = Client(endpoint)
From 3642ea08b73a0bab0507962079d0954992237ccd Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Fri, 12 Dec 2025 10:50:28 +0800
Subject: [PATCH 17/18] fix test
---
xinference/model/llm/llama_cpp/tests/test_structured.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/xinference/model/llm/llama_cpp/tests/test_structured.py b/xinference/model/llm/llama_cpp/tests/test_structured.py
index c8fd45739a..94c15fd059 100644
--- a/xinference/model/llm/llama_cpp/tests/test_structured.py
+++ b/xinference/model/llm/llama_cpp/tests/test_structured.py
@@ -149,7 +149,6 @@ def test_apply_response_format_uses_real_xllamacpp_if_available():
def test_llamacpp_qwen3_json_schema(setup):
-
endpoint, _ = setup
client = Client(endpoint)
model_uid = client.launch_model(
From 268b227eeff12492074f8d844fa690372e294e82 Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Fri, 12 Dec 2025 11:40:21 +0800
Subject: [PATCH 18/18] fix CI error
---
xinference/model/llm/llama_cpp/core.py | 7 +++++--
xinference/model/llm/llama_cpp/tests/test_structured.py | 6 +++---
2 files changed, 8 insertions(+), 5 deletions(-)
diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py
index 33244d4ab9..3e619f0e42 100644
--- a/xinference/model/llm/llama_cpp/core.py
+++ b/xinference/model/llm/llama_cpp/core.py
@@ -51,10 +51,13 @@ def _apply_response_format(generate_config: Dict[str, Any]) -> None:
schema_dict = normalized.get("schema_dict")
if not schema_dict:
return
- generate_config.setdefault("json_schema", schema_dict)
grammar = _schema_to_grammar(schema_dict)
if grammar:
- generate_config.setdefault("grammar", grammar)
+ # xllamacpp rejects configs containing both json_schema and grammar
+ generate_config.pop("json_schema", None)
+ generate_config["grammar"] = grammar
+ else:
+ generate_config.setdefault("json_schema", schema_dict)
class _Done:
diff --git a/xinference/model/llm/llama_cpp/tests/test_structured.py b/xinference/model/llm/llama_cpp/tests/test_structured.py
index 94c15fd059..1f2c5bc499 100644
--- a/xinference/model/llm/llama_cpp/tests/test_structured.py
+++ b/xinference/model/llm/llama_cpp/tests/test_structured.py
@@ -82,7 +82,7 @@ def test_apply_response_format_sets_grammar(monkeypatch):
_apply_response_format(cfg)
assert "response_format" not in cfg
- assert cfg["json_schema"]["required"] == ["a"]
+ assert "json_schema" not in cfg
assert cfg["grammar"] == "GRAMMAR"
@@ -143,8 +143,8 @@ def test_apply_response_format_uses_real_xllamacpp_if_available():
_apply_response_format(cfg)
assert "response_format" not in cfg
- # Real xllamacpp should attach grammar alongside json_schema
- assert "json_schema" in cfg
+ # Real xllamacpp should prefer grammar to avoid passing both
+ assert "json_schema" not in cfg
assert "grammar" in cfg and cfg["grammar"]