Skip to content

Commit afbda95

Browse files
feat: custom fallback for language detection (#4238)
Closes #4091

Implements a custom fallback for language detection so that short text is not forced to English and callers can control or disable detection.

## Changes:

- `language_fallback`: Optional callable used when text is short (<5 words) and ASCII. It receives the text and can return a list of ISO 639-3 codes, or `None` to leave the language unspecified. If not provided, short text still defaults to `["eng"]` (backward compatible).
- `detect_languages()` / `apply_lang_metadata()`: New parameter `language_fallback`; applied in the short-text path only.
- `partition()` (auto): New parameter `language_fallback`; passed through to all partitioners via the metadata decorator.
- `partition_md()`: New parameter `languages`, so callers can pass `languages=[""]` to disable language detection (aligned with other partitioners).

## Usage:

- Return `None` for short text: `partition(..., language_fallback=lambda text: None)`
- Custom short-text language: `partition(..., language_fallback=my_detector)`
- Disable detection: `partition_md(..., languages=[""])` or `partition(..., languages=[""])`
1 parent 4a77a8c commit afbda95

File tree

11 files changed

+1138
-795
lines changed

11 files changed

+1138
-795
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
## 0.21.3
2+
3+
### Enhancements
4+
- **Custom fallback for language detection (fixes #4091)**: Add optional `language_fallback` callable for short ASCII text (e.g. when detection would default to English). Callable receives the text and may return a list of ISO 639-3 codes or `None` to leave language unspecified; return value is validated and invalid entries are filtered out. `language_fallback` is passed through `partition()`, PDF/image partitioners, and `partition_html`; `partition_md` now accepts `languages` (use `[""]` to disable detection). Language-related parameters (`languages`, `detect_language_per_element`, `language_fallback`) are documented as top-level options and exposed explicitly on `partition_html`.
5+
16
## 0.21.2
27

38
### Fixes

test_unstructured/partition/common/test_lang.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,45 @@ def test_detect_languages_handles_spelled_out_languages():
196196
assert languages == ["spa"]
197197

198198

199+
def test_detect_languages_short_text_fallback_returns_none():
200+
"""Short ASCII text with language_fallback returning None leaves language unspecified."""
201+
result = detect_languages(
202+
text="Hi there.",
203+
language_fallback=lambda t: None,
204+
)
205+
assert result is None
206+
207+
208+
def test_detect_languages_short_text_fallback_returns_custom():
209+
"""Short ASCII text triggers fallback; we assert the fallback's return is used as-is."""
210+
# Any short (<5 word) ASCII text would hit the fallback; content is irrelevant.
211+
result = detect_languages(
212+
text="Hi there.",
213+
language_fallback=lambda t: ["fra"],
214+
)
215+
assert result == ["fra"]
216+
217+
218+
def test_detect_languages_short_text_default_eng_without_fallback():
219+
"""Short ASCII text without fallback still defaults to English (backward compat)."""
220+
result = detect_languages(text="Hi there.")
221+
assert result == ["eng"]
222+
223+
224+
def test_apply_lang_metadata_with_language_fallback():
225+
"""apply_lang_metadata passes language_fallback so short text can return None."""
226+
elements = [NarrativeText("Hi.")]
227+
result = list(
228+
apply_lang_metadata(
229+
elements=elements,
230+
languages=["auto"],
231+
language_fallback=lambda t: None,
232+
)
233+
)
234+
assert len(result) == 1
235+
assert result[0].metadata.languages is None
236+
237+
199238
@pytest.mark.parametrize(
200239
("languages", "ocr_languages", "expected_langs"),
201240
[

test_unstructured/partition/test_auto.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -563,6 +563,7 @@ def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest):
563563
languages=None,
564564
metadata_filename=None,
565565
detect_language_per_element=False,
566+
language_fallback=None,
566567
infer_table_structure=False,
567568
extract_images_in_pdf=False,
568569
extract_image_block_types=None,
@@ -1074,11 +1075,34 @@ def test_auto_partition_respects_detect_language_per_element_arg():
10741075
)
10751076
def test_auto_partition_respects_language_arg(file_extension: str):
10761077
elements = partition(
1077-
example_doc_path(f"language-docs/eng_spa_mult.{file_extension}"), languages=["deu"]
1078+
example_doc_path(f"language-docs/eng_spa_mult.{file_extension}"),
1079+
languages=["deu"],
10781080
)
10791081
assert all(element.metadata.languages == ["deu"] for element in elements)
10801082

10811083

1084+
def test_auto_partition_language_fallback_flows_through_call_chain():
1085+
"""Integration test: language_fallback must flow partition() -> partitioner -> apply_metadata
1086+
-> apply_lang_metadata -> detect_languages(). A fallback returning None yields no language.
1087+
"""
1088+
with tempfile.NamedTemporaryFile(
1089+
mode="w",
1090+
suffix=".txt",
1091+
delete=False,
1092+
encoding="utf-8",
1093+
) as f:
1094+
f.write("Hi.")
1095+
path = f.name
1096+
try:
1097+
elements = partition(filename=path, language_fallback=lambda t: None)
1098+
assert elements, "expected at least one element"
1099+
assert all(e.metadata.languages is None for e in elements), (
1100+
"language_fallback=lambda t: None should leave metadata.languages unset"
1101+
)
1102+
finally:
1103+
os.unlink(path)
1104+
1105+
10821106
# -- include_page_breaks --------------------------------------------------
10831107

10841108

test_unstructured/partition/test_md.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,15 @@ def test_partition_md_respects_detect_language_per_element():
248248
assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
249249

250250

251+
def test_partition_md_languages_empty_disables_detection():
252+
"""Passing languages=[\"\"] disables language detection (no metadata.languages set)."""
253+
filename = "example-docs/README.md"
254+
elements = partition_md(filename=filename, languages=[""])
255+
# When detection is disabled, metadata.languages should not be set (None)
256+
for el in elements:
257+
assert el.metadata.languages is None
258+
259+
251260
def test_partition_md_parse_table():
252261
filename = example_doc_path("simple-table.md")
253262
elements = partition_md(filename=filename)

0 commit comments

Comments
 (0)