Commit

structure repo for toolchain
DiTo97 committed Oct 17, 2023
1 parent b2afe93 commit 3d60fa0
Showing 15 changed files with 888 additions and 547 deletions.
5 changes: 5 additions & 0 deletions .flake8
@@ -0,0 +1,5 @@
[flake8]
ignore = E203, E266, E501, W503, F403, F401
max-complexity = 18
max-line-length = 88
select = B,C,E,F,W,T4,B9
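The ignore list and the 88-character line limit match the flake8 setup commonly recommended alongside Black (E203 and W503 are produced by Black's own formatting, so flagging them would make the linter fight the formatter), while the select list adds flake8-bugbear's B/B9 codes. A hypothetical snippet, not part of the commit, showing Black-style output that would be flagged without those ignores:

# Black puts spaces around ":" in slices with complex bounds (E203 by default)
# and breaks long expressions *before* binary operators (W503 by default)
values = list(range(10))
chunk = values[2 : len(values) - 1]

total = (
    sum(chunk)
    + len(chunk)
)
print(total)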
8 changes: 8 additions & 0 deletions .gitignore
@@ -0,0 +1,8 @@
# cache
__pycache__/

# IDE
.vscode/

# PIP
**/*.egg-info/
23 changes: 23 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,23 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.4.0
    hooks:
      - id: check-yaml
      - id: end-of-file-fixer
      - id: trailing-whitespace

  - repo: https://github.com/pycqa/isort
    rev: 5.12.0
    hooks:
      - id: isort
        name: isort (python)

  - repo: https://github.com/psf/black
    rev: 23.9.1
    hooks:
      - id: black

  - repo: https://github.com/pycqa/flake8
    rev: 6.1.0
    hooks:
      - id: flake8
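These hooks only take effect once pre-commit has been wired into the local clone (the tool itself is pinned in dataset/requirements.txt as pre-commit~=3.4.0). A minimal sketch of that setup, written here as a Python wrapper around the standard pre-commit CLI commands rather than anything defined in this commit:

import subprocess

# register .pre-commit-config.yaml as the git pre-commit hook for this clone
subprocess.run(["pre-commit", "install"], check=True)

# run every hook (check-yaml, end-of-file-fixer, trailing-whitespace,
# isort, black, flake8) against all tracked files, not just staged ones
subprocess.run(["pre-commit", "run", "--all-files"], check=True)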
Empty file added __init__.py
Empty file.
Empty file added augmentation/augmentation.py
Empty file.
Empty file added augmentation/requirements.txt
Empty file.
Empty file added benchmark/.gitkeep
Empty file.
47 changes: 47 additions & 0 deletions configs/deepspeed-zero2.json
@@ -0,0 +1,47 @@
{
    "fp16": {
        "enabled": "auto",
        "hysteresis": 2,
        "initial_scale_power": 16,
        "loss_scale_window": 1000,
        "loss_scale": 0,
        "min_loss_scale": 1
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "optimizer": {
        "params": {
            "betas": "auto",
            "eps": "auto",
            "lr": "auto",
            "weight_decay": "auto"
        },
        "type": "AdamW"
    },
    "scheduler": {
        "params": {
            "warmup_max_lr": "auto",
            "warmup_min_lr": "auto",
            "warmup_num_steps": "auto"
        },
        "type": "WarmupLR"
    },
    "steps_per_print": 10,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "zero_optimization": {
        "contiguous_gradients": true,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "reduce_bucket_size": "auto",
        "stage": 2,
        "sub_group_size": 1e9
    }
}
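Every "auto" value in this ZeRO stage 2 config (with CPU offload for optimizer state and parameters) is meant to be filled in by the training framework at launch time rather than hard-coded here. A minimal sketch of how such a file is typically consumed, assuming a Hugging Face transformers Trainer setup; the trainer itself is not part of this commit, so the paths and values below are placeholders:

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="checkpoints",                   # hypothetical checkpoint dir
    deepspeed="configs/deepspeed-zero2.json",   # the config added above
    fp16=True,                                  # resolves fp16.enabled = "auto"
    per_device_train_batch_size=8,              # resolves train_micro_batch_size_per_gpu
    gradient_accumulation_steps=4,              # resolves gradient_accumulation_steps
    learning_rate=5e-5,                         # resolves the optimizer/scheduler "auto" lr fields
)

# Trainer(model=..., args=training_args, train_dataset=...).train()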
120 changes: 120 additions & 0 deletions dataset/__main__.py
@@ -0,0 +1,120 @@
import os
import sys
import tempfile
from typing import Any

import datasets
import numpy as np
from absl import app
from absl import flags
from PIL import Image
from tqdm.auto import tqdm


FLAGS = flags.FLAGS


flags.DEFINE_string("output-dir", os.getcwd(), "The DIBCO dataset dir")
flags.DEFINE_integer("seed", 0, "The random state seed")
flags.DEFINE_float("eval-size", 0.2, "The eval split size")
flags.DEFINE_float("test-size", 0.2, "The test split size")


def normalize(image: Image.Image) -> Image.Image:
    image = image.convert("L")

    array = np.array(image).astype(np.uint8)
    condition = array < np.max(array)
    array = np.where(condition, 1, 0).astype(bool)

    image = Image.fromarray(array)
    return image


def preprocessing(batch: dict[str, list[Any]]) -> dict[str, list[Any]]:
    """It prepares a batch of examples for semantic segmentation"""
    sources = batch["source"]
    targets = batch["target"]

    batch = {
        "labelmap": [normalize(Image.open(src)) for src in targets],
        "pixelmap": [Image.open(src) for src in sources]
    }

    return batch


def main(argv):
    del argv

    URL = "https://github.com/Leedeng/SauvolaNet.git"

    assert FLAGS.eval_size + FLAGS.test_size < 1.0, "The splits must sum to less than 1.0"

    with tempfile.TemporaryDirectory() as sauvolanet:
        os.system(f"git clone {URL} {sauvolanet}")

        dataset_dir = os.path.join(sauvolanet, "Dataset")
        src_dir = os.path.join(sauvolanet, "SauvolaDocBin")

        sys.path.insert(0, src_dir)

        from dataUtils import collect_binarization_by_dataset
        collection = collect_binarization_by_dataset(dataset_dir)
        del collect_binarization_by_dataset

        sys.path.remove(src_dir)

        del src_dir
        del dataset_dir

        features = datasets.Features({
            "ensemble": datasets.Value("string"),
            "source": datasets.Value("string"),
            "target": datasets.Value("string"),
        })

        for key, examples in tqdm(collection.items(), desc="DIBCO benchmark"):
            sources, targets = zip(*examples)

            sources = sorted(sources)
            targets = sorted(targets)

            dataset = {"source": sources, "target": targets, "ensemble": [key] * len(sources)}
            dataset = datasets.Dataset.from_dict(dataset, features)

            collection[key] = dataset

        collection = datasets.concatenate_datasets([
            dataset for _, dataset in collection.items()
        ])

        features = datasets.Features({
            "ensemble": datasets.Value("string"),
            "labelmap": datasets.Image(),
            "pixelmap": datasets.Image(),
        })

        collection = collection.map(
            preprocessing,
            batched=True,
            features=features,
            remove_columns=["source", "target"]
        )

        collection = collection.class_encode_column("ensemble")

        train_size = 1.0 - FLAGS.test_size

        collection = collection.train_test_split(
            seed=FLAGS.seed,
            shuffle=True,
            stratify_by_column="ensemble",
            train_size=train_size
        )

        collection.save_to_disk(FLAGS.output_dir)


if __name__ == "__main__":
    app.run(main)
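Running the module clones the SauvolaNet repository for its DIBCO collection, builds labelmap/pixelmap pairs, and writes stratified train/test splits in the Hugging Face datasets on-disk format. A minimal sketch of reading the result back, with a hypothetical output directory name:

import datasets

dibco = datasets.load_from_disk("dibco")  # DatasetDict with "train" and "test" splits

example = dibco["train"][0]
example["pixelmap"]   # source document page, decoded to a PIL image
example["labelmap"]   # binary ground-truth mask produced by normalize()
example["ensemble"]   # integer class label for the originating DIBCO ensemble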
3 changes: 3 additions & 0 deletions dataset/requirements.txt
@@ -0,0 +1,3 @@
absl-py~=2.0.0
datasets~=2.14.5
pre-commit~=3.4.0