Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,31 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

---

## [Unreleased]

### Fixed

- **NERExtractor LLM method returning pattern-based output on custom gateways** (#554, PR #556) by @KaifAhmad1

`NERExtractor(method="llm")` silently fell back to regex/pattern extraction when used with OpenAI-compatible enterprise or self-hosted gateways (Qwen, LLaMA proxies, internal routing layers). Returned entities carried `extraction_method='pattern'` even though the LLM itself was producing correct tool-call output. Three root causes fixed:

- **Silent exception swallowing** — `exc_info=True` was missing from the method-failure `WARNING` in `NERExtractor.extract_entities`. The full gateway-rejection traceback was invisible in logs even with `DEBUG` level enabled, making the failure impossible to diagnose without reading source code.

- **`response_format=json_object` sent to incompatible gateways** — `OpenAIProvider.generate_structured` unconditionally included `response_format={"type": "json_object"}` in every API call. Custom/enterprise gateways frequently reject this parameter, causing both the `instructor` path and the manual repair loop to fail with the same error on every retry, eventually triggering `_extract_fallback` (pattern extraction).

- **No fallback in the `generate_typed` manual repair loop** — when `generate_structured` itself raised (due to gateway rejection), the repair loop retried the identical failing call up to `max_retries` times before giving up. There was no path to recover via plain `generate()` + JSON parsing.

**Additional fixes applied during PR review:**

- Mode.JSON retry in `generate_typed` now strips `response_format` from `create_kwargs` before forwarding to the retry client, preventing incompatible kwargs from being sent to a client configured for a different instructor mode.
- `exc_info=True` added to the `generate_structured` fallback warning in the manual repair loop for consistent observability across all failure paths.
- Removed dead duplicate `is_available` definition in `GroqProvider` — Python silently kept only the second definition; the first was unreachable.
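- `OpenAIProvider._init_client` now validates the `base_url` scheme at construction time. Non-HTTP(S) schemes (`file://`, `ftp://`, `javascript:`, etc.) raise `ValueError` immediately, reducing SSRF exposure if `base_url` originates from configuration rather than hardcoded values. Only the scheme is checked; HTTP(S) URLs that point at internal hosts are still accepted, so untrusted `base_url` values should additionally be restricted by host.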

**17 regression tests** added in `tests/test_issue_554_fixes.py` covering all bug paths, including harshalizode's exact gateway configuration.

---

## [0.5.0] - 2026-05-11

### Added
Expand Down
10 changes: 9 additions & 1 deletion semantica/semantic_extract/ner_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,12 @@ def __init__(
- huggingface_model: HuggingFace model name
- provider: LLM provider (for LLM method)
- llm_model: LLM model name
- base_url: Custom base URL for OpenAI-compatible endpoints
(e.g. ``"https://my-gateway/v1"``). When set, the
provider automatically switches to ``Mode.JSON`` so that
third-party servers (Qwen, LLaMA gateways, etc.) that do
not implement the full function-calling protocol still
return correctly structured results.
- device: Device for HuggingFace models ("cuda" or "cpu")
- min_confidence: Minimum confidence threshold
- ensemble_voting: Enable ensemble voting (default: False)
Expand Down Expand Up @@ -423,7 +429,9 @@ def extract_entities(
return filtered

except Exception as e:
self.logger.warning(f"Method {method_name} failed: {e}")
self.logger.warning(
"Method %s failed: %s", method_name, e, exc_info=True
)
continue

# Ensemble voting if enabled
Expand Down
128 changes: 101 additions & 27 deletions semantica/semantic_extract/providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,11 +249,18 @@ def generate_typed(
mode = instructor.Mode.TOOLS # Default mode

if provider_name == "OpenAIProvider" and self.client:
if hasattr(instructor, "from_provider"):
custom_base_url = getattr(self, "base_url", None)
if custom_base_url:
# OpenAI-compatible custom endpoint: Mode.TOOLS is not reliably
# supported by third-party servers (Qwen, LLaMA gateways, etc.).
# Mode.JSON asks the model to return plain JSON and is broadly
# supported across all OpenAI-compatible APIs.
client = instructor.from_openai(self.client, mode=instructor.Mode.JSON)
elif hasattr(instructor, "from_provider"):
try:
client = instructor.from_provider(
provider=f"openai/{kwargs.get('model', self.model)}",
api_key=self.api_key
provider=f"openai/{kwargs.get('model', self.model)}",
api_key=self.api_key,
)
except Exception:
client = instructor.from_openai(self.client)
Expand Down Expand Up @@ -395,25 +402,71 @@ def generate_typed(

if provider_name == "GroqProvider":
create_kwargs["response_format"] = {"type": "json_object"}

response = client.chat.completions.create(**create_kwargs)

try:
response = client.chat.completions.create(**create_kwargs)
except Exception as primary_err:
# Mode.TOOLS can fail on standard OpenAI endpoints for certain
# models (streaming quirks, schema binding issues). Retry once
# with Mode.JSON before giving up entirely.
# Custom-base_url providers already use Mode.JSON from the start,
# so we only retry here for the standard OpenAI path.
if (
provider_name == "OpenAIProvider"
and not getattr(self, "base_url", None)
and hasattr(self, "client")
and self.client
):
self.logger.warning(
"instructor Mode.TOOLS failed for %s (%s); retrying with Mode.JSON.",
provider_name,
primary_err,
exc_info=True,
)
json_client = instructor.from_openai(self.client, mode=instructor.Mode.JSON)
# Build a clean kwargs dict for the Mode.JSON retry: drop
# response_format (Mode.JSON handles schema differently)
# but keep response_model/max_retries so instructor still
# validates the typed output.
retry_kwargs = {
k: v for k, v in create_kwargs.items()
if k != "response_format"
}
response = json_client.chat.completions.create(**retry_kwargs)
else:
raise

verbose_mode = kwargs.get("verbose", False) or self.config.get("verbose", False)
if verbose_mode:
import sys
print(f" [BaseProvider.generate_typed] Typed response received via instructor ({provider_name}).", flush=True, file=sys.stdout)
return response
except Exception as e:
self.logger.warning(f"Instructor generation failed ({e}), falling back to manual repair loop.")
self.logger.warning(
"Instructor generation failed (%s), falling back to manual repair loop.",
e,
exc_info=True,
)

# Fallback: Manual repair loop
last_error = None
current_prompt = prompt

for attempt in range(max_retries):
try:
# 1. Generate JSON
# We use generate_structured to get the dict/list
json_result = self.generate_structured(current_prompt, max_retries=1, **kwargs)
# 1. Generate JSON – try structured mode first, then fall back to
# plain generate() + parse. Custom gateways that reject
# response_format=json_object would otherwise fail identically on
# every one of the max_retries attempts here.
try:
json_result = self.generate_structured(current_prompt, max_retries=1, **kwargs)
except Exception as struct_err:
self.logger.warning(
"generate_structured failed (%s); retrying with plain generate() + JSON parse.",
struct_err,
exc_info=True,
)
raw_content = self.generate(current_prompt, **kwargs)
json_result = self._parse_json(raw_content)

# 2. Validate with Schema
# If the result is a list and schema expects a wrapper, or vice versa, we might need adjustment
Expand Down Expand Up @@ -509,22 +562,51 @@ class OpenAIProvider(BaseProvider):
"""OpenAI provider implementation."""

def __init__(
self, api_key: Optional[str] = None, model: str = "gpt-3.5-turbo", **kwargs
self,
api_key: Optional[str] = None,
model: str = "gpt-3.5-turbo",
base_url: Optional[str] = None,
**kwargs,
):
"""Initialize OpenAI provider."""
"""Initialize OpenAI provider.

Args:
api_key: OpenAI API key (or OPENAI_API_KEY env var).
model: Default model name.
base_url: Optional custom base URL for OpenAI-compatible endpoints
(e.g. local gateways, Qwen, LLaMA proxies). When set,
``instructor`` will use ``Mode.JSON`` instead of
``Mode.TOOLS`` because most OpenAI-compatible servers do not
implement the full function-calling protocol.
"""
super().__init__(**kwargs)
self.api_key = api_key or config.get_api_key("openai")
self.model = model
self.base_url = base_url # None → standard OpenAI; set → custom endpoint
self.client = None
self._init_client()

def _init_client(self):
"""Initialize OpenAI client."""
"""Initialize OpenAI client, respecting a custom base_url if provided."""
if self.base_url:
# Reject non-HTTP(S) schemes (file://, ftp://, etc.) to limit SSRF exposure
# when base_url originates from configuration rather than hardcoded values.
# Only the scheme is validated here; the target host is not restricted.
from urllib.parse import urlparse
scheme = urlparse(self.base_url).scheme
if scheme not in ("http", "https"):
raise ValueError(
f"OpenAIProvider base_url must use http or https, got scheme {scheme!r}. "
f"Only HTTP(S) endpoints are permitted."
)

try:
from openai import OpenAI

if self.api_key:
self.client = OpenAI(api_key=self.api_key)
init_kwargs: Dict[str, Any] = {"api_key": self.api_key}
if self.base_url:
init_kwargs["base_url"] = self.base_url
self.client = OpenAI(**init_kwargs)
except (ImportError, OSError):
self.client = None
self.logger.warning(
Expand Down Expand Up @@ -560,8 +642,13 @@ def generate_structured(self, prompt: str, **kwargs) -> dict:
create_kwargs = {
"model": kwargs.get("model", self.model),
"messages": [{"role": "user", "content": prompt}],
"response_format": {"type": "json_object"},
}
# response_format=json_object is only safe for standard OpenAI endpoints.
# Custom gateways (base_url set) often reject or mishandle this parameter,
# causing silent fallback to pattern extraction.
if not self.base_url:
create_kwargs["response_format"] = {"type": "json_object"}

self._add_if_set(create_kwargs, kwargs, "temperature", "max_completion_tokens", "max_tokens",
"top_p", "frequency_penalty", "presence_penalty", "seed", "stop", "logit_bias", "user")

Expand Down Expand Up @@ -696,19 +783,6 @@ def _init_client(self):
self.client = None
self.logger.error(f"Failed to initialize Groq client: {e}")

def is_available(self) -> bool:
"""Check if provider is available and return diagnostic info."""
if self.client is None:
if not self.api_key:
return False # Missing API key
try:
from groq import Groq
except ImportError:
return False # Library not installed
return False

return True

def _test_connection(self):
"""Internal method to verify connection."""
if not self.client:
Expand Down
Loading
Loading