pluggable ocr dataset options (#148)

goatrocks · web-flow · commit c63a15d7da99 · 2022-03-16T10:34:43.000-04:00
diff --git a/indico/queries/datasets.py b/indico/queries/datasets.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 
 import json
+import jsons
 import tempfile
 from pathlib import Path
 from typing import List
@@ -13,9 +14,9 @@
     HTTPRequest,
     RequestChain,
 )
-from indico.errors import IndicoNotFound
+from indico.errors import IndicoNotFound, IndicoInputError
 from indico.queries.storage import UploadBatched, UploadImages
-from indico.types.dataset import Dataset
+from indico.types.dataset import Dataset, OcrEngine, OmnipageOcrOptionsInput, ReadApiOcrOptionsInput, OcrInputLanguage
 
 
 class ListDatasets(GraphQLRequest):
@@ -184,14 +185,17 @@ class CreateDataset(RequestChain):
     previous = None
 
     def __init__(
-        self,
-        name: str,
-        files: List[str],
-        wait: bool = True,
-        dataset_type: str = "TEXT",
-        from_local_images: bool = False,
-        image_filename_col: str = "filename",
-        batch_size: int = 20,
+            self,
+            name: str,
+            files: List[str],
+            wait: bool = True,
+            dataset_type: str = "TEXT",
+            from_local_images: bool = False,
+            image_filename_col: str = "filename",
+            batch_size: int = 20,
+            ocr_engine: OcrEngine = None,
+            omnipage_ocr_options: OmnipageOcrOptionsInput = None,
+            read_api_ocr_options: ReadApiOcrOptionsInput = None
     ):
         self.files = files
         self.name = name
@@ -200,6 +204,8 @@ def __init__(
         self.from_local_images = from_local_images
         self.image_filename_col = image_filename_col
         self.batch_size = batch_size
+        if omnipage_ocr_options is not None and read_api_ocr_options is not None:
+            raise IndicoInputError("Must supply either omnipage or readapi options but not both.")
         super().__init__()
 
     def requests(self):
@@ -235,7 +241,7 @@ def requests(self):
         yield GetDatasetFileStatus(id=dataset_id)
         debouncer = Debouncer()
         while not all(
-            f.status in ["DOWNLOADED", "FAILED"] for f in self.previous.files
+                f.status in ["DOWNLOADED", "FAILED"] for f in self.previous.files
         ):
             yield GetDatasetFileStatus(id=self.previous.id)
             debouncer.backoff()
@@ -250,7 +256,7 @@ def requests(self):
         debouncer = Debouncer()
         if self.wait is True:
             while not all(
-                [f.status in ["PROCESSED", "FAILED"] for f in self.previous.files]
+                    [f.status in ["PROCESSED", "FAILED"] for f in self.previous.files]
             ):
                 yield GetDatasetFileStatus(id=dataset_id)
                 debouncer.backoff()
@@ -295,20 +301,32 @@ def process_response(self, response):
 
 class CreateEmptyDataset(GraphQLRequest):
     query = """
-    mutation($name: String!, $datasetType: DatasetType) {
-        createDataset(name: $name, datasetType: $datasetType) {
+    mutation($name: String!, $datasetType: DatasetType, $config: DataConfigInput) {
+        createDataset(name: $name, datasetType: $datasetType, config: $config ) {
             id
             name
         }
     }
     """
 
-    def __init__(self, name: str, dataset_type: str = None):
+    def __init__(self, name: str, dataset_type: str = None, ocr_engine: OcrEngine = None,
+                 omnipage_ocr_options: OmnipageOcrOptionsInput = None,
+                 readapi_ocr_options: ReadApiOcrOptionsInput = None):
         if not dataset_type:
             dataset_type = "TEXT"
-
+        config = None
+        if ocr_engine is not None:
+            config = {
+                "ocrOptions": {
+                    "ocrEngine": ocr_engine.name,
+                    "omnipageOptions": omnipage_ocr_options,
+                    "readapiOptions": readapi_ocr_options
+                }
+            }
         super().__init__(
-            self.query, variables={"name": name, "datasetType": dataset_type}
+            self.query, variables={"name": name, "datasetType": dataset_type,
+                                   "config": jsons.dump(config, key_transformer=jsons.KEY_TRANSFORMER_CAMELCASE,
+                                                        strip_nulls=True)}
         )
 
     def process_response(self, response):
@@ -324,7 +342,6 @@ class _AddFiles(GraphQLRequest):
         }
     }
     """
-    
 
     def __init__(self, dataset_id: int, metadata: List[str]):
         super().__init__(
@@ -358,11 +375,11 @@ class AddFiles(RequestChain):
     previous = None
 
     def __init__(
-        self,
-        dataset_id: int,
-        files: List[str],
-        wait: bool = True,
-        batch_size: int = 20,
+            self,
+            dataset_id: int,
+            files: List[str],
+            wait: bool = True,
+            batch_size: int = 20,
     ):
         self.dataset_id = dataset_id
         self.files = files
@@ -380,8 +397,8 @@ def requests(self):
         yield GetDatasetFileStatus(id=self.dataset_id)
         debouncer = Debouncer()
         while not all(
-            f.status in ["DOWNLOADED", "FAILED", "PROCESSED"]
-            for f in self.previous.files
+                f.status in ["DOWNLOADED", "FAILED", "PROCESSED"]
+                for f in self.previous.files
         ):
             yield GetDatasetFileStatus(id=self.previous.id)
             debouncer.backoff()
@@ -448,10 +465,10 @@ class ProcessFiles(RequestChain):
     """
 
     def __init__(
-        self,
-        dataset_id: int,
-        datafile_ids: List[int],
-        wait: bool = True,
+            self,
+            dataset_id: int,
+            datafile_ids: List[int],
+            wait: bool = True,
     ):
         self.dataset_id = dataset_id
         self.datafile_ids = datafile_ids
@@ -463,7 +480,7 @@ def requests(self):
         yield GetDatasetFileStatus(id=self.dataset_id)
         if self.wait:
             while not all(
-                f.status in ["PROCESSED", "FAILED"] for f in self.previous.files
+                    f.status in ["PROCESSED", "FAILED"] for f in self.previous.files
             ):
                 yield GetDatasetFileStatus(id=self.dataset_id)
                 debouncer.backoff()
@@ -497,7 +514,36 @@ def requests(self):
         yield GetDatasetFileStatus(id=self.dataset_id)
         if self.wait:
             while not all(
-                f.status in ["PROCESSED", "FAILED"] for f in self.previous.files
+                    f.status in ["PROCESSED", "FAILED"] for f in self.previous.files
             ):
                 yield GetDatasetFileStatus(id=self.dataset_id)
                 debouncer.backoff()
+
+
+class GetOcrEngineLanguageCodes(GraphQLRequest):
+    """
+    Fetches and lists the available languages by name and code for the given OCR Engine
+
+    Args:
+        engine(OcrEngine): The engine to fetch languages for.
+    """
+    query = """query{
+        ocrOptions {
+            engines{
+            name
+            languages {
+                name
+                code
+                }
+            }
+        }
+    }"""
+
+    def __init__(self, engine: OcrEngine):
+        self.engine = engine
+        super().__init__(self.query)
+
+    def process_response(self, response):
+        data = super().process_response(response)["ocrOptions"]["engines"]
+        engine_laguages = next(x["languages"] for x in data if x["name"] == self.engine.name)
+        return [OcrInputLanguage(**option) for option in engine_laguages]
diff --git a/indico/types/dataset.py b/indico/types/dataset.py
@@ -1,3 +1,4 @@
+from enum import Enum
 from typing import List
 
 from indico.types.base import BaseType
@@ -58,3 +59,69 @@ def labelset_by_name(self, name: str) -> LabelSet:
 
     def datacolumn_by_name(self, name: str) -> DataColumn:
         return next(l for l in self.datacolumns if l.name == name)
+
+
+class TableReadOrder(Enum):
+    ROW = 0
+    COLUMN = 1
+
+class OcrEngine(Enum):
+    """
+    Enum representing available OCR engines.
+    """
+    OMNIPAGE = 0
+    READAPI = 1
+    pass
+
+class OmnipageOcrOptionsInput(BaseType):
+    """
+    Omnipage specific OCR options for dataset creation.
+
+    Args:
+        auto_rotate(bool): auto rotate.
+        single_column(bool): Read table as a single column.
+        upscale_images(bool): Scale up low-resolution images.
+        languages(List[str]): List of languages to use in OCR.
+        cells(bool): Return table information for post-processing rules
+        force_render(bool): Force rendering.
+        native_layout(bool): Native layout.
+        native_pdf(bool): Native pdf.
+        table_read_order(TableReadOrder): Read table by row or column.
+
+    """
+    auto_rotate: bool
+    single_column: bool
+    upscale_images: bool
+    languages: List[str]
+    cells: bool
+    force_render: bool
+    native_layout: bool
+    native_pdf: bool
+    table_read_order: TableReadOrder
+
+class ReadApiOcrOptionsInput(BaseType):
+    """
+    Read API OCR options.
+
+    Args:
+        auto_rotate(bool): Auto rotate
+        single_column(bool): Read table as a single column.
+        upscale_images(bool): Scale up low resolution images.
+        languages(List[str]): List of languages to use.
+    """
+    auto_rotate: bool
+    single_column: bool
+    upscale_images: bool
+    languages: List[str]
+
+class OcrInputLanguage(BaseType):
+    name: str
+    code: str
+
+class OcrOptionsInput():
+    """
+    Input options for OCR engine.
+    """
+    ocr_engine: OcrEngine
+    omnipage_options: OmnipageOcrOptionsInput
+    readapi_options: ReadApiOcrOptionsInput
diff --git a/tests/integration/queries/test_dataset.py b/tests/integration/queries/test_dataset.py
@@ -17,7 +17,7 @@
     ProcessCSV,
 )
 from indico.queries.export import CreateExport, DownloadExport
-from indico.types.dataset import Dataset
+from indico.types.dataset import Dataset, OmnipageOcrOptionsInput, TableReadOrder, OcrEngine
 from indico.errors import IndicoRequestError
 from tests.integration.data.datasets import airlines_dataset
 
@@ -175,6 +175,24 @@ def _dataset_complete(dataset):
     assert dataset.status == "COMPLETE"
 
 
+def test_create_with_options(indico):
+    client = IndicoClient()
+    config: OmnipageOcrOptionsInput = {
+        "auto_rotate": True,
+        "single_column": True,
+        "upscale_images": True,
+        "languages": ["ENG", "FIN"],
+        "force_render": False,
+        "native_layout": False,
+        "native_pdf": False,
+        "table_read_order": TableReadOrder.ROW
+    }
+    dataset = client.call(CreateEmptyDataset(name=f"dataset-{int(time.time())}", ocr_engine=OcrEngine.OMNIPAGE,
+                                             omnipage_ocr_options=config))
+
+
+
+
 def test_create_from_files_document(indico):
     client = IndicoClient()
     dataset = client.call(CreateEmptyDataset(name=f"dataset-{int(time.time())}"))