File tree Expand file tree Collapse file tree 2 files changed +36
-0
lines changed Expand file tree Collapse file tree 2 files changed +36
-0
lines changed Original file line number Diff line number Diff line change
1
+ from pathlib import Path
2
+
3
+ from docling .datamodel .base_models import InputFormat
4
+ from docling .datamodel .pipeline_options import (
5
+ PdfPipelineOptions ,
6
+ TesseractCliOcrOptions ,
7
+ TesseractOcrOptions ,
8
+ )
9
+ from docling .document_converter import DocumentConverter , PdfFormatOption
10
+
11
+
12
def main():
    """Convert the sample PDF with OCR enabled and print it as Markdown.

    The OCR options are created with ``lang=["auto"]``. Per this example's
    own note, that setting is meant to be paired with one of the Tesseract
    engines: ``TesseractOcrOptions`` or ``TesseractCliOcrOptions``.
    """
    source_pdf = Path("./tests/data/2206.01062.pdf")

    # lang=["auto"] goes with a Tesseract OCR engine. This example uses the
    # CLI variant; TesseractOcrOptions(lang=["auto"]) is the alternative.
    tesseract_options = TesseractCliOcrOptions(lang=["auto"])
    pdf_options = PdfPipelineOptions(do_ocr=True, ocr_options=tesseract_options)

    # Register the OCR-enabled pipeline options for PDF inputs only.
    pdf_format = PdfFormatOption(pipeline_options=pdf_options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    result = converter.convert(source_pdf)
    print(result.document.export_to_markdown())
32
+
33
+
34
if __name__ == "__main__":
    # Run the example only when executed as a script, not when imported.
    main()
Original file line number Diff line number Diff line change 75
75
- " Table export " : examples/export_tables.py
76
76
- " Multimodal export " : examples/export_multimodal.py
77
77
- " Force full page OCR " : examples/full_page_ocr.py
78
+ - "Automatic OCR language detection with Tesseract": examples/tesseract_lang_detection.py
78
79
- " Accelerator options " : examples/run_with_accelerator.py
79
80
- " Simple translation " : examples/translate.py
80
81
- ✂️ Chunking :
You can’t perform that action at this time.
0 commit comments