Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

feat: add "auto" language for TesseractOcr #759

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 57 additions & 13 deletions docling/models/tesseract_ocr_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def __init__(self, enabled: bool, options: TesseractOcrOptions):

self.scale = 3 # multiplier for 72 dpi == 216 dpi.
self.reader = None
self.script_readers = None

if self.enabled:
install_errmsg = (
Expand Down Expand Up @@ -54,21 +55,33 @@ def __init__(self, enabled: bool, options: TesseractOcrOptions):
# Initialize the tesseractAPI
_log.debug("Initializing TesserOCR: %s", tesseract_version)
lang = "+".join(self.options.lang)

tesserocr_kwargs = {
"psm": tesserocr.PSM.AUTO,
"init": True,
"oem": tesserocr.OEM.DEFAULT,
}

if self.options.path is not None:
tesserocr_kwargs["path"] = self.options.path

if lang == "auto":
self.reader = tesserocr.PyTessBaseAPI(
path=self.options.path,
lang=lang,
psm=tesserocr.PSM.AUTO,
init=True,
oem=tesserocr.OEM.DEFAULT,
**{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
)

self.script_readers = {}
scripts = [l for l in tesserocr_languages if l.startswith("script")]

for script in scripts:
self.script_readers[script] = tesserocr.PyTessBaseAPI(
**{"lang": script} | tesserocr_kwargs,
)
nikos-livathinos marked this conversation as resolved.
Show resolved Hide resolved
else:
self.reader = tesserocr.PyTessBaseAPI(
lang=lang,
psm=tesserocr.PSM.AUTO,
init=True,
oem=tesserocr.OEM.DEFAULT,
**{"lang": lang} | tesserocr_kwargs,
)

self.reader_RIL = tesserocr.RIL

def __del__(self):
Expand Down Expand Up @@ -106,20 +119,51 @@ def __call__(

# Retrieve text snippets with their bounding boxes
self.reader.SetImage(high_res_image)
boxes = self.reader.GetComponentImages(

if self.script_readers is not None:
osd = self.reader.DetectOrientationScript()

# Orientation/script detection returned nothing, most likely because the image has no text; skip it
if osd is None:
continue

script = osd["script_name"]

if script == "Katakana" or script == "Hiragana":
script = "Japanese"
elif script == "Han":
script = "HanS"
elif script == "Korean":
script = "Hangul"
nikos-livathinos marked this conversation as resolved.
Show resolved Hide resolved

if f"script/{script}" in self.script_readers:
_log.debug(
f'Using model for the detected script "{script}"'
)
local_reader = self.script_readers[f"script/{script}"]
local_reader.SetImage(high_res_image)
else:
_log.warning(
f'No model for the detected script "{script}"'
)
continue
else:
local_reader = self.reader

boxes = local_reader.GetComponentImages(
self.reader_RIL.TEXTLINE, True
)

cells = []
for ix, (im, box, _, _) in enumerate(boxes):
# Set the area of interest. Tesseract uses Bottom-Left for the origin
self.reader.SetRectangle(
local_reader.SetRectangle(
box["x"], box["y"], box["w"], box["h"]
)

# Extract text within the bounding box
text = self.reader.GetUTF8Text().strip()
confidence = self.reader.MeanTextConf()
text = local_reader.GetUTF8Text().strip()
confidence = local_reader.MeanTextConf()
left = box["x"] / self.scale
bottom = box["y"] / self.scale
right = (box["x"] + box["w"]) / self.scale
Expand Down
5 changes: 4 additions & 1 deletion tests/test_e2e_ocr_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def test_e2e_conversions():
RapidOcrOptions(),
EasyOcrOptions(force_full_page_ocr=True),
TesseractOcrOptions(force_full_page_ocr=True),
TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]),
TesseractCliOcrOptions(force_full_page_ocr=True),
RapidOcrOptions(force_full_page_ocr=True),
]
Expand All @@ -70,7 +71,9 @@ def test_e2e_conversions():
engines.append(OcrMacOptions(force_full_page_ocr=True))

for ocr_options in engines:
print(f"Converting with ocr_engine: {ocr_options.kind}")
print(
f"Converting with ocr_engine: {ocr_options.kind}, language: {ocr_options.lang}"
)
converter = get_converter(ocr_options=ocr_options)
for pdf_path in pdf_paths:
print(f"converting {pdf_path}")
Expand Down
Loading