From 757af7ce14a4d24ac20556d6248e546c17d6a268 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Tue, 25 Nov 2025 12:29:24 +0800 Subject: [PATCH 01/18] FEAT: add llama_cpp json schema output --- xinference/model/llm/llama_cpp/core.py | 39 ++++++++++++++++++++++++-- xinference/model/llm/utils.py | 26 +++++++++++++++++ 2 files changed, 62 insertions(+), 3 deletions(-) diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py index 3f06d2f899..e87f10bb00 100644 --- a/xinference/model/llm/llama_cpp/core.py +++ b/xinference/model/llm/llama_cpp/core.py @@ -16,7 +16,7 @@ import os import pprint import queue -from typing import Iterator, List, Optional, Tuple, Union +from typing import Any, Dict, Iterator, List, Optional, Tuple, Union from packaging import version @@ -25,11 +25,42 @@ from ...utils import check_dependency_available from ..core import LLM, chat_context_var from ..llm_family import LLMFamilyV2, LLMSpecV1 -from ..utils import ChatModelMixin +from ..utils import ChatModelMixin, normalize_response_format logger = logging.getLogger(__name__) +def _schema_to_gbnf(schema: Dict[str, Any]) -> Optional[str]: + try: + from llama_cpp.llama_grammar import ( + json_schema_to_gbnf, # type: ignore[attr-defined] + ) + except Exception as e: # pragma: no cover - optional dependency + logger.warning( + "json_schema provided but llama_cpp grammar support missing: %s", e + ) + return None + try: + return json_schema_to_gbnf(schema) + except Exception as e: # pragma: no cover - conversion failure + logger.warning("Failed to convert json_schema to GBNF for llama.cpp: %s", e) + return None + + +def _apply_response_format(generate_config: Dict[str, Any]) -> None: + response_format = generate_config.pop("response_format", None) + normalized = normalize_response_format(response_format) + if not normalized or normalized.get("type") != "json_schema": + return + schema_dict = normalized.get("schema_dict") + if not schema_dict: + return + generate_config.setdefault("json_schema", schema_dict) + grammar = _schema_to_gbnf(schema_dict) + if grammar: + generate_config.setdefault("grammar", grammar) + + class _Done: pass @@ -49,7 +80,7 @@ def __init__( model_path: str, llamacpp_model_config: Optional[dict] = None, ): - super().__init__(model_uid, model_family, model_path) + super().__init__(model_uid, model_family, model_path) # type: ignore[call-arg] self._llamacpp_model_config = self._sanitize_model_config(llamacpp_model_config) self._llm = None self._executor: Optional[concurrent.futures.ThreadPoolExecutor] = None @@ -246,6 +277,7 @@ def generate( generate_config = generate_config or {} if not generate_config.get("max_tokens") and XINFERENCE_MAX_TOKENS: generate_config["max_tokens"] = XINFERENCE_MAX_TOKENS + _apply_response_format(generate_config) stream = generate_config.get("stream", False) q: queue.Queue = queue.Queue() @@ -305,6 +337,7 @@ def chat( generate_config = generate_config or {} if not generate_config.get("max_tokens") and XINFERENCE_MAX_TOKENS: generate_config["max_tokens"] = XINFERENCE_MAX_TOKENS + _apply_response_format(generate_config) stream = generate_config.get("stream", False) chat_template_kwargs = ( diff --git a/xinference/model/llm/utils.py b/xinference/model/llm/utils.py index 420598dbba..2fc54d9d3a 100644 --- a/xinference/model/llm/utils.py +++ b/xinference/model/llm/utils.py @@ -1235,6 +1235,32 @@ def get_stop_token_ids_from_config_file(model_path: str) -> Optional[List[int]]: return None +def normalize_response_format( + 
response_format: Optional[Dict[str, Any]], +) -> Optional[Dict[str, Any]]: + """ + Normalize OpenAI-style response_format into a simple dict. + Returns: + None if missing/unsupported, or a dict with keys: + - type: "json_schema" | "json_object" + - schema_dict: dict (only for json_schema) + """ + if not response_format or not isinstance(response_format, dict): + return None + + fmt_type = response_format.get("type") + if fmt_type not in ("json_schema", "json_object"): + return None + + normalized: Dict[str, Any] = {"type": fmt_type} + if fmt_type == "json_schema": + schema_block = response_format.get("json_schema") or {} + schema_dict = schema_block.get("schema_") or schema_block.get("schema") + if schema_dict: + normalized["schema_dict"] = schema_dict + return normalized + + def parse_messages(messages: List[Dict]) -> Tuple: """ Some older models still follow the old way of parameter passing. From 0501936a7d2d0647be51e644b9458b250a2f1069 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Thu, 27 Nov 2025 11:26:50 +0800 Subject: [PATCH 02/18] modify llama_cpp_python to xllamacpp --- xinference/model/llm/llama_cpp/core.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py index e87f10bb00..33244d4ab9 100644 --- a/xinference/model/llm/llama_cpp/core.py +++ b/xinference/model/llm/llama_cpp/core.py @@ -30,20 +30,16 @@ logger = logging.getLogger(__name__) -def _schema_to_gbnf(schema: Dict[str, Any]) -> Optional[str]: +def _schema_to_grammar(schema: Dict[str, Any]) -> Optional[str]: try: - from llama_cpp.llama_grammar import ( - json_schema_to_gbnf, # type: ignore[attr-defined] - ) + import xllamacpp except Exception as e: # pragma: no cover - optional dependency - logger.warning( - "json_schema provided but llama_cpp grammar support missing: %s", e - ) + logger.warning("json_schema provided but xllamacpp missing: %s", e) return None try: - return json_schema_to_gbnf(schema) + return xllamacpp.json_schema_to_grammar(schema) # type: ignore[attr-defined] except Exception as e: # pragma: no cover - conversion failure - logger.warning("Failed to convert json_schema to GBNF for llama.cpp: %s", e) + logger.warning("Failed to convert json_schema to grammar for xllamacpp: %s", e) return None @@ -56,7 +52,7 @@ def _apply_response_format(generate_config: Dict[str, Any]) -> None: if not schema_dict: return generate_config.setdefault("json_schema", schema_dict) - grammar = _schema_to_gbnf(schema_dict) + grammar = _schema_to_grammar(schema_dict) if grammar: generate_config.setdefault("grammar", grammar) From 0c0eead87fbeb96d823553235d53933405167aed Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Thu, 27 Nov 2025 14:56:55 +0800 Subject: [PATCH 03/18] add a test python --- .../llm/llama_cpp/tests/test_structured.py | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 xinference/model/llm/llama_cpp/tests/test_structured.py diff --git a/xinference/model/llm/llama_cpp/tests/test_structured.py b/xinference/model/llm/llama_cpp/tests/test_structured.py new file mode 100644 index 0000000000..ed64b3a84b --- /dev/null +++ b/xinference/model/llm/llama_cpp/tests/test_structured.py @@ -0,0 +1,66 @@ +import sys +from types import SimpleNamespace + + +def test_apply_response_format_sets_grammar(monkeypatch): + from xinference.model.llm.llama_cpp.core import _apply_response_format + + fake_xllamacpp = SimpleNamespace(json_schema_to_grammar=lambda 
schema: "GRAMMAR") + monkeypatch.setitem(sys.modules, "xllamacpp", fake_xllamacpp) + + cfg = { + "response_format": { + "type": "json_schema", + "json_schema": { + "schema": { + "type": "object", + "properties": {"a": {"type": "string"}}, + "required": ["a"], + } + }, + } + } + + _apply_response_format(cfg) + + assert "response_format" not in cfg + assert cfg["json_schema"]["required"] == ["a"] + assert cfg["grammar"] == "GRAMMAR" + + +def test_apply_response_format_handles_conversion_failure(monkeypatch): + from xinference.model.llm.llama_cpp.core import _apply_response_format + + def _raise(_): + raise ValueError("bad schema") + + fake_xllamacpp = SimpleNamespace(json_schema_to_grammar=_raise) + monkeypatch.setitem(sys.modules, "xllamacpp", fake_xllamacpp) + + cfg = { + "response_format": { + "type": "json_schema", + "json_schema": { + "schema": { + "type": "object", + "properties": {"b": {"type": "string"}}, + "required": ["b"], + } + }, + } + } + + _apply_response_format(cfg) + + assert "response_format" not in cfg + assert cfg["json_schema"]["required"] == ["b"] + assert "grammar" not in cfg + + +def test_apply_response_format_ignores_non_schema(monkeypatch): + from xinference.model.llm.llama_cpp.core import _apply_response_format + + cfg = {"response_format": {"type": "json_object"}} + _apply_response_format(cfg) + assert "grammar" not in cfg + assert "json_schema" not in cfg From 288c99ae7c5c27fd92c3c1abcc21ff9dad03aa30 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 1 Dec 2025 10:56:09 +0800 Subject: [PATCH 04/18] add new xllamacpp unit test --- .../llm/llama_cpp/tests/test_structured.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/xinference/model/llm/llama_cpp/tests/test_structured.py b/xinference/model/llm/llama_cpp/tests/test_structured.py index ed64b3a84b..6a1e5367c4 100644 --- a/xinference/model/llm/llama_cpp/tests/test_structured.py +++ b/xinference/model/llm/llama_cpp/tests/test_structured.py @@ -64,3 +64,40 @@ def test_apply_response_format_ignores_non_schema(monkeypatch): _apply_response_format(cfg) assert "grammar" not in cfg assert "json_schema" not in cfg + + +def test_apply_response_format_uses_real_xllamacpp_if_available(): + import importlib.util + + import pytest + + if importlib.util.find_spec("xllamacpp") is None: + pytest.skip("xllamacpp not installed") + + import importlib + + xllamacpp = importlib.import_module("xllamacpp") + if not hasattr(xllamacpp, "json_schema_to_grammar"): + pytest.skip("xllamacpp does not expose json_schema_to_grammar") + + from xinference.model.llm.llama_cpp.core import _apply_response_format + + cfg = { + "response_format": { + "type": "json_schema", + "json_schema": { + "schema": { + "type": "object", + "properties": {"c": {"type": "integer"}}, + "required": ["c"], + } + }, + } + } + + _apply_response_format(cfg) + + assert "response_format" not in cfg + # Real xllamacpp should attach grammar alongside json_schema + assert "json_schema" in cfg + assert "grammar" in cfg and cfg["grammar"] From 1dbc12df45940e30c00164a57417fec326753606 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 8 Dec 2025 15:56:06 +0800 Subject: [PATCH 05/18] test CI thread error --- .github/workflows/python.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index c8a0728e0e..b1f8ed3d25 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -135,7 +135,7 @@ jobs: sudo rm -rf 
"$AGENT_TOOLSDIRECTORY" fi pip install -e ".[dev]" - pip install "xllamacpp>=0.2.0" + pip install "xllamacpp==0.2.5" if [ "$MODULE" == "metal" ]; then conda install -c conda-forge "ffmpeg<7" pip install "mlx>=0.22.0" From 464ca84738f4738d204af4d9f148b59396d3c235 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 8 Dec 2025 16:22:29 +0800 Subject: [PATCH 06/18] test CI thread error --- .github/workflows/python.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index b1f8ed3d25..574e5c122a 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -135,7 +135,8 @@ jobs: sudo rm -rf "$AGENT_TOOLSDIRECTORY" fi pip install -e ".[dev]" - pip install "xllamacpp==0.2.5" + CMAKE_ARGS="-DGGML_NATIVE=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" \ + pip install --no-binary xllamacpp xllamacpp>=0.2.6 if [ "$MODULE" == "metal" ]; then conda install -c conda-forge "ffmpeg<7" pip install "mlx>=0.22.0" From 4739b39f4a30abe622c3c1c1cfce146c6f404c15 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 8 Dec 2025 16:32:31 +0800 Subject: [PATCH 07/18] test CI thread error --- .github/workflows/python.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 574e5c122a..e4ae21c3ef 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -133,10 +133,12 @@ jobs: sudo rm -rf /opt/ghc sudo rm -rf "/usr/local/share/boost" sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo apt-get update + sudo apt-get install -y build-essential cmake ninja-build fi pip install -e ".[dev]" - CMAKE_ARGS="-DGGML_NATIVE=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" \ - pip install --no-binary xllamacpp xllamacpp>=0.2.6 + CMAKE_ARGS="-DLLAMA_NATIVE=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_FMA=ON -DLLAMA_F16C=ON" \ + pip install --no-binary xllamacpp xllamacpp>=0.2.0 if [ "$MODULE" == "metal" ]; then conda install -c conda-forge "ffmpeg<7" pip install "mlx>=0.22.0" From 7ad1c71b2ce62abbee9cf46c7e37c00edd404006 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 8 Dec 2025 16:53:00 +0800 Subject: [PATCH 08/18] test CI thread error --- .github/workflows/python.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index e4ae21c3ef..45ebf4c708 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -136,8 +136,10 @@ jobs: sudo apt-get update sudo apt-get install -y build-essential cmake ninja-build fi + pip install -U pip setuptools wheel scikit-build-core ninja cmake pip install -e ".[dev]" - CMAKE_ARGS="-DLLAMA_NATIVE=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_FMA=ON -DLLAMA_F16C=ON" \ + # Build xllamacpp 0.2.6 from source with safe CPU flags (no AVX512/native) to avoid SIGILL + CMAKE_ARGS="-DLLAMA_NATIVE=OFF -DLLAMA_AVX512=OFF" \ pip install --no-binary xllamacpp xllamacpp>=0.2.0 if [ "$MODULE" == "metal" ]; then conda install -c conda-forge "ffmpeg<7" From de7c64e268e521dac7d85b3641ed61d19973fd62 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 8 Dec 2025 17:04:37 +0800 Subject: [PATCH 09/18] test CI thread error --- .github/workflows/python.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml 
index 45ebf4c708..49ecbf1fca 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -138,9 +138,8 @@ jobs: fi pip install -U pip setuptools wheel scikit-build-core ninja cmake pip install -e ".[dev]" - # Build xllamacpp 0.2.6 from source with safe CPU flags (no AVX512/native) to avoid SIGILL CMAKE_ARGS="-DLLAMA_NATIVE=OFF -DLLAMA_AVX512=OFF" \ - pip install --no-binary xllamacpp xllamacpp>=0.2.0 + pip install --no-binary xllamacpp "xllamacpp>=0.2.0" if [ "$MODULE" == "metal" ]; then conda install -c conda-forge "ffmpeg<7" pip install "mlx>=0.22.0" From 1762822889be8c3dbd5ca9a97ceeadae0e71317e Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 8 Dec 2025 17:11:39 +0800 Subject: [PATCH 10/18] test CI thread error --- .github/workflows/python.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 49ecbf1fca..ed1e370c6c 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -138,8 +138,7 @@ jobs: fi pip install -U pip setuptools wheel scikit-build-core ninja cmake pip install -e ".[dev]" - CMAKE_ARGS="-DLLAMA_NATIVE=OFF -DLLAMA_AVX512=OFF" \ - pip install --no-binary xllamacpp "xllamacpp>=0.2.0" + pip install -e xllamacpp>=0.2.0 if [ "$MODULE" == "metal" ]; then conda install -c conda-forge "ffmpeg<7" pip install "mlx>=0.22.0" From 6314f43cbd338db246f437ef2f307782195de47a Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 8 Dec 2025 17:20:49 +0800 Subject: [PATCH 11/18] test CI thread error --- .github/workflows/python.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index ed1e370c6c..869817e7a6 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -138,7 +138,13 @@ jobs: fi pip install -U pip setuptools wheel scikit-build-core ninja cmake pip install -e ".[dev]" - pip install -e xllamacpp>=0.2.0 + # Linux builds from source with safe CPU flags; other OS use prebuilt wheels + if [ "$OS" == "ubuntu-latest" ]; then + CMAKE_ARGS="-DLLAMA_NATIVE=OFF -DLLAMA_AVX512=OFF" \ + pip install --no-binary xllamacpp "xllamacpp>=0.2.0" + else + pip install "xllamacpp>=0.2.0" + fi if [ "$MODULE" == "metal" ]; then conda install -c conda-forge "ffmpeg<7" pip install "mlx>=0.22.0" From e3f4f0f3a6b4511ec83b46579bbd201a457bb646 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 8 Dec 2025 17:32:05 +0800 Subject: [PATCH 12/18] test CI thread error --- .github/workflows/python.yaml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 869817e7a6..ed1e370c6c 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -138,13 +138,7 @@ jobs: fi pip install -U pip setuptools wheel scikit-build-core ninja cmake pip install -e ".[dev]" - # Linux builds from source with safe CPU flags; other OS use prebuilt wheels - if [ "$OS" == "ubuntu-latest" ]; then - CMAKE_ARGS="-DLLAMA_NATIVE=OFF -DLLAMA_AVX512=OFF" \ - pip install --no-binary xllamacpp "xllamacpp>=0.2.0" - else - pip install "xllamacpp>=0.2.0" - fi + pip install -e xllamacpp>=0.2.0 if [ "$MODULE" == "metal" ]; then conda install -c conda-forge "ffmpeg<7" pip install "mlx>=0.22.0" From 3bd495ad7c7d2f1e91607bf8e5773e457e716fb0 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 8 Dec 2025 17:37:32 +0800 Subject: 
[PATCH 13/18] test CI thread error

---
 .github/workflows/python.yaml | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
index ed1e370c6c..c8a0728e0e 100644
--- a/.github/workflows/python.yaml
+++ b/.github/workflows/python.yaml
@@ -133,12 +133,9 @@ jobs:
             sudo rm -rf /opt/ghc
             sudo rm -rf "/usr/local/share/boost"
             sudo rm -rf "$AGENT_TOOLSDIRECTORY"
-            sudo apt-get update
-            sudo apt-get install -y build-essential cmake ninja-build
           fi
-          pip install -U pip setuptools wheel scikit-build-core ninja cmake
           pip install -e ".[dev]"
-          pip install -e xllamacpp>=0.2.0
+          pip install "xllamacpp>=0.2.0"
           if [ "$MODULE" == "metal" ]; then
             conda install -c conda-forge "ffmpeg<7"
             pip install "mlx>=0.22.0"

From 3cae81e854b7d8896b1ea165b3f8e5935324986c Mon Sep 17 00:00:00 2001
From: OliverBryant <2713999266@qq.com>
Date: Thu, 11 Dec 2025 15:06:16 +0800
Subject: [PATCH 14/18] add real test for llama cpp json schema

---
 .../llm/llama_cpp/tests/test_structured.py   | 122 +++++++++++++++---
 1 file changed, 107 insertions(+), 15 deletions(-)

diff --git a/xinference/model/llm/llama_cpp/tests/test_structured.py b/xinference/model/llm/llama_cpp/tests/test_structured.py
index 6a1e5367c4..d1b50b82ee 100644
--- a/xinference/model/llm/llama_cpp/tests/test_structured.py
+++ b/xinference/model/llm/llama_cpp/tests/test_structured.py
@@ -1,10 +1,63 @@
+import importlib
+import importlib.util
+import json
 import sys
+from enum import Enum
 from types import SimpleNamespace
+import pytest
+from pydantic import BaseModel
+
+from ..core import _apply_response_format
+
+
+class CarType(str, Enum):
+    sedan = "sedan"
+    suv = "SuV"
+    truck = "Truck"
+    coupe = "Coupe"
+
+
+class CarDescription(BaseModel):
+    brand: str
+    model: str
+    car_type: CarType
+
+
+def _load_json_from_message(message) -> dict:
+    def _strip_think(text: str) -> str:
+        stripped = text.lstrip()
+        if stripped.startswith("<think>"):
+            if "</think>" in stripped:
+                stripped = stripped.split("</think>", 1)[1]
+            else:
+                stripped = stripped.split("<think>", 1)[1]
+        return stripped.lstrip()
+
+    raw_content = message.content
+    if isinstance(raw_content, str):
+        return json.loads(_strip_think(raw_content))
+
+    if isinstance(raw_content, list):
+        text_blocks = []
+        for block in raw_content:
+            if isinstance(block, dict):
+                if block.get("type") == "text" and "text" in block:
+                    text_blocks.append(_strip_think(block["text"]))
+                continue
+
+            block_type = getattr(block, "type", None)
+            block_text = getattr(block, "text", None)
+            if block_type == "text" and block_text:
+                text_blocks.append(_strip_think(block_text))
+
+        if text_blocks:
+            return json.loads("".join(text_blocks))
+
+    pytest.fail(f"Unexpected message content format: {raw_content!r}")
-def test_apply_response_format_sets_grammar(monkeypatch):
-    from xinference.model.llm.llama_cpp.core import _apply_response_format
+def test_apply_response_format_sets_grammar(monkeypatch):
     fake_xllamacpp = SimpleNamespace(json_schema_to_grammar=lambda schema: "GRAMMAR")
     monkeypatch.setitem(sys.modules, "xllamacpp", fake_xllamacpp)
@@ -29,8 +82,6 @@ def test_apply_response_format_sets_grammar(monkeypatch):
 def test_apply_response_format_handles_conversion_failure(monkeypatch):
-    from xinference.model.llm.llama_cpp.core import _apply_response_format
-
     def _raise(_):
         raise ValueError("bad schema")
     fake_xllamacpp = SimpleNamespace(json_schema_to_grammar=_raise)
     monkeypatch.setitem(sys.modules, "xllamacpp", fake_xllamacpp)
@@ -58,8 +109,6 @@ def _raise(_):
 def test_apply_response_format_ignores_non_schema(monkeypatch):
-    from xinference.model.llm.llama_cpp.core import _apply_response_format
-
     cfg = {"response_format":
{"type": "json_object"}} _apply_response_format(cfg) assert "grammar" not in cfg @@ -67,21 +116,12 @@ def test_apply_response_format_ignores_non_schema(monkeypatch): def test_apply_response_format_uses_real_xllamacpp_if_available(): - import importlib.util - - import pytest - if importlib.util.find_spec("xllamacpp") is None: pytest.skip("xllamacpp not installed") - - import importlib - xllamacpp = importlib.import_module("xllamacpp") if not hasattr(xllamacpp, "json_schema_to_grammar"): pytest.skip("xllamacpp does not expose json_schema_to_grammar") - from xinference.model.llm.llama_cpp.core import _apply_response_format - cfg = { "response_format": { "type": "json_schema", @@ -101,3 +141,55 @@ def test_apply_response_format_uses_real_xllamacpp_if_available(): # Real xllamacpp should attach grammar alongside json_schema assert "json_schema" in cfg assert "grammar" in cfg and cfg["grammar"] + + +def test_llamacpp_qwen3_json_schema(setup): + pytest.importorskip( + "llama_cpp", reason="llama_cpp backend is required for this test" + ) + import openai + + from xinference.client import Client + + endpoint, _ = setup + client = Client(endpoint) + model_uid = client.launch_model( + model_name="qwen3", + model_engine="llama.cpp", + model_size_in_billions="0_6", + model_format="ggufv2", + quantization="Q4_K_M", + n_gpu=None, + ) + + try: + api_client = openai.Client(api_key="not empty", base_url=f"{endpoint}/v1") + completion = api_client.chat.completions.create( + model=model_uid, + messages=[ + { + "role": "user", + "content": ( + "Generate a JSON containing the brand, model, and car_type of" + " an iconic 90s car." + ), + } + ], + temperature=0, + max_tokens=128, + response_format={ + "type": "json_schema", + "json_schema": { + "name": "car-description", + "schema": CarDescription.model_json_schema(), + }, + }, + ) + + parsed = _load_json_from_message(completion.choices[0].message) + car_description = CarDescription.model_validate(parsed) + assert car_description.brand + assert car_description.model + finally: + if model_uid is not None: + client.terminate_model(model_uid) From 7cb457bdc1db69c157c71fd8643e1c0d1d5f31b1 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Thu, 11 Dec 2025 15:13:17 +0800 Subject: [PATCH 15/18] add real test for llama cpp json schema --- xinference/model/llm/llama_cpp/tests/test_structured.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xinference/model/llm/llama_cpp/tests/test_structured.py b/xinference/model/llm/llama_cpp/tests/test_structured.py index d1b50b82ee..1f6b54d06a 100644 --- a/xinference/model/llm/llama_cpp/tests/test_structured.py +++ b/xinference/model/llm/llama_cpp/tests/test_structured.py @@ -4,6 +4,7 @@ import sys from enum import Enum from types import SimpleNamespace +from typing import Any, Dict import pytest from pydantic import BaseModel @@ -24,7 +25,7 @@ class CarDescription(BaseModel): car_type: CarType -def _load_json_from_message(message) -> dict: +def _load_json_from_message(message: Any) -> Dict[str, Any]: def _strip_think(text: str) -> str: stripped = text.lstrip() if stripped.startswith(""): @@ -55,6 +56,7 @@ def _strip_think(text: str) -> str: return json.loads("".join(text_blocks)) pytest.fail(f"Unexpected message content format: {raw_content!r}") + raise AssertionError("Unreachable") def test_apply_response_format_sets_grammar(monkeypatch): From fb244c54a89621315bcda30d8957f22cff243b1a Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Fri, 12 Dec 2025 10:49:57 +0800 Subject: 
[PATCH 16/18] fix test --- xinference/model/llm/llama_cpp/tests/test_structured.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/xinference/model/llm/llama_cpp/tests/test_structured.py b/xinference/model/llm/llama_cpp/tests/test_structured.py index 1f6b54d06a..c8fd45739a 100644 --- a/xinference/model/llm/llama_cpp/tests/test_structured.py +++ b/xinference/model/llm/llama_cpp/tests/test_structured.py @@ -6,9 +6,12 @@ from types import SimpleNamespace from typing import Any, Dict +import openai import pytest from pydantic import BaseModel +from xinference.client import Client + from ..core import _apply_response_format @@ -146,12 +149,6 @@ def test_apply_response_format_uses_real_xllamacpp_if_available(): def test_llamacpp_qwen3_json_schema(setup): - pytest.importorskip( - "llama_cpp", reason="llama_cpp backend is required for this test" - ) - import openai - - from xinference.client import Client endpoint, _ = setup client = Client(endpoint) From 3642ea08b73a0bab0507962079d0954992237ccd Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Fri, 12 Dec 2025 10:50:28 +0800 Subject: [PATCH 17/18] fix test --- xinference/model/llm/llama_cpp/tests/test_structured.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xinference/model/llm/llama_cpp/tests/test_structured.py b/xinference/model/llm/llama_cpp/tests/test_structured.py index c8fd45739a..94c15fd059 100644 --- a/xinference/model/llm/llama_cpp/tests/test_structured.py +++ b/xinference/model/llm/llama_cpp/tests/test_structured.py @@ -149,7 +149,6 @@ def test_apply_response_format_uses_real_xllamacpp_if_available(): def test_llamacpp_qwen3_json_schema(setup): - endpoint, _ = setup client = Client(endpoint) model_uid = client.launch_model( From 268b227eeff12492074f8d844fa690372e294e82 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Fri, 12 Dec 2025 11:40:21 +0800 Subject: [PATCH 18/18] fix ci error --- xinference/model/llm/llama_cpp/core.py | 7 +++++-- xinference/model/llm/llama_cpp/tests/test_structured.py | 6 +++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py index 33244d4ab9..3e619f0e42 100644 --- a/xinference/model/llm/llama_cpp/core.py +++ b/xinference/model/llm/llama_cpp/core.py @@ -51,10 +51,13 @@ def _apply_response_format(generate_config: Dict[str, Any]) -> None: schema_dict = normalized.get("schema_dict") if not schema_dict: return - generate_config.setdefault("json_schema", schema_dict) grammar = _schema_to_grammar(schema_dict) if grammar: - generate_config.setdefault("grammar", grammar) + # xllamacpp rejects configs containing both json_schema and grammar + generate_config.pop("json_schema", None) + generate_config["grammar"] = grammar + else: + generate_config.setdefault("json_schema", schema_dict) class _Done: diff --git a/xinference/model/llm/llama_cpp/tests/test_structured.py b/xinference/model/llm/llama_cpp/tests/test_structured.py index 94c15fd059..1f2c5bc499 100644 --- a/xinference/model/llm/llama_cpp/tests/test_structured.py +++ b/xinference/model/llm/llama_cpp/tests/test_structured.py @@ -82,7 +82,7 @@ def test_apply_response_format_sets_grammar(monkeypatch): _apply_response_format(cfg) assert "response_format" not in cfg - assert cfg["json_schema"]["required"] == ["a"] + assert "json_schema" not in cfg assert cfg["grammar"] == "GRAMMAR" @@ -143,8 +143,8 @@ def test_apply_response_format_uses_real_xllamacpp_if_available(): _apply_response_format(cfg) assert 
"response_format" not in cfg - # Real xllamacpp should attach grammar alongside json_schema - assert "json_schema" in cfg + # Real xllamacpp should prefer grammar to avoid passing both + assert "json_schema" not in cfg assert "grammar" in cfg and cfg["grammar"]