Commit

structure repo for toolchain
DiTo97 committed Oct 17, 2023
1 parent b2afe93 commit 3d60fa0
Showing 15 changed files with 888 additions and 547 deletions.
5 changes: 5 additions & 0 deletions .flake8
@@ -0,0 +1,5 @@
[flake8]
ignore = E203, E266, E501, W503, F403, F401
max-complexity = 18
max-line-length = 88
select = B,C,E,F,W,T4,B9
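The ignore list and the 88-character line limit match the flake8 setup commonly recommended alongside Black (E203 and W503 are produced by Black's own formatting, so flagging them would make the linter fight the formatter), while the select list adds flake8-bugbear's B/B9 codes. A hypothetical snippet, not part of the commit, showing Black-style output that would be flagged without those ignores:

# Black puts spaces around ":" in slices with complex bounds (E203 by default)
# and breaks long expressions *before* binary operators (W503 by default)
values = list(range(10))
chunk = values[2 : len(values) - 1]

total = (
    sum(chunk)
    + len(chunk)
)
print(total)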
8 changes: 8 additions & 0 deletions .gitignore
@@ -0,0 +1,8 @@
# cache
__pycache__/

# IDE
.vscode/

# PIP
**/*.egg-info/
23 changes: 23 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,23 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.4.0
    hooks:
      - id: check-yaml
      - id: end-of-file-fixer
      - id: trailing-whitespace

  - repo: https://github.com/pycqa/isort
    rev: 5.12.0
    hooks:
      - id: isort
        name: isort (python)

  - repo: https://github.com/psf/black
    rev: 23.9.1
    hooks:
      - id: black

  - repo: https://github.com/pycqa/flake8
    rev: 6.1.0
    hooks:
      - id: flake8
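These hooks only take effect once pre-commit has been wired into the local clone (the tool itself is pinned in dataset/requirements.txt as pre-commit~=3.4.0). A minimal sketch of that setup, written here as a Python wrapper around the standard pre-commit CLI commands rather than anything defined in this commit:

import subprocess

# register .pre-commit-config.yaml as the git pre-commit hook for this clone
subprocess.run(["pre-commit", "install"], check=True)

# run every hook (check-yaml, end-of-file-fixer, trailing-whitespace,
# isort, black, flake8) against all tracked files, not just staged ones
subprocess.run(["pre-commit", "run", "--all-files"], check=True)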
Empty file added __init__.py
Empty file.
Empty file added augmentation/augmentation.py
Empty file.
Empty file added augmentation/requirements.txt
Empty file.
Empty file added benchmark/.gitkeep
Empty file.
47 changes: 47 additions & 0 deletions configs/deepspeed-zero2.json
@@ -0,0 +1,47 @@
{
    "fp16": {
        "enabled": "auto",
        "hysteresis": 2,
        "initial_scale_power": 16,
        "loss_scale_window": 1000,
        "loss_scale": 0,
        "min_loss_scale": 1
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "optimizer": {
        "params": {
            "betas": "auto",
            "eps": "auto",
            "lr": "auto",
            "weight_decay": "auto"
        },
        "type": "AdamW"
    },
    "scheduler": {
        "params": {
            "warmup_max_lr": "auto",
            "warmup_min_lr": "auto",
            "warmup_num_steps": "auto"
        },
        "type": "WarmupLR"
    },
    "steps_per_print": 10,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "zero_optimization": {
        "contiguous_gradients": true,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "reduce_bucket_size": "auto",
        "stage": 2,
        "sub_group_size": 1e9
    }
}
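Every "auto" value in this ZeRO stage 2 config (with CPU offload for optimizer state and parameters) is meant to be filled in by the training framework at launch time rather than hard-coded here. A minimal sketch of how such a file is typically consumed, assuming a Hugging Face transformers Trainer setup; the trainer itself is not part of this commit, so the paths and values below are placeholders:

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="checkpoints",                   # hypothetical checkpoint dir
    deepspeed="configs/deepspeed-zero2.json",   # the config added above
    fp16=True,                                  # resolves fp16.enabled = "auto"
    per_device_train_batch_size=8,              # resolves train_micro_batch_size_per_gpu
    gradient_accumulation_steps=4,              # resolves gradient_accumulation_steps
    learning_rate=5e-5,                         # resolves the optimizer/scheduler "auto" lr fields
)

# Trainer(model=..., args=training_args, train_dataset=...).train()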
120 changes: 120 additions & 0 deletions dataset/__main__.py
@@ -0,0 +1,120 @@
import os
import sys
import tempfile
from typing import Any

import datasets
import numpy as np
from absl import app
from absl import flags
from PIL import Image
from tqdm.auto import tqdm


FLAGS = flags.FLAGS


flags.DEFINE_string("output-dir", os.getcwd(), "The DIBCO dataset dir")
flags.DEFINE_integer("seed", 0, "The random state seed")
flags.DEFINE_float("eval-size", 0.2, "The eval split size")
flags.DEFINE_float("test-size", 0.2, "The test split size")


def normalize(image: Image.Image) -> Image.Image:
    image = image.convert("L")

    array = np.array(image).astype(np.uint8)
    condition = array < np.max(array)
    array = np.where(condition, 1, 0).astype(bool)

    image = Image.fromarray(array)
    return image


def preprocessing(batch: dict[str, list[Any]]) -> dict[str, list[Any]]:
    """It prepares a batch of examples for semantic segmentation"""
    sources = batch["source"]
    targets = batch["target"]

    batch = {
        "labelmap": [normalize(Image.open(src)) for src in targets],
        "pixelmap": [Image.open(src) for src in sources]
    }

    return batch


def main(argv):
    del argv

    URL = "https://github.com/Leedeng/SauvolaNet.git"

    assert FLAGS.eval_size + FLAGS.test_size < 1.0, "The splits must sum to less than 1.0"

    with tempfile.TemporaryDirectory() as sauvolanet:
        os.system(f"git clone {URL} {sauvolanet}")

        dataset_dir = os.path.join(sauvolanet, "Dataset")
        src_dir = os.path.join(sauvolanet, "SauvolaDocBin")

        sys.path.insert(0, src_dir)

        from dataUtils import collect_binarization_by_dataset
        collection = collect_binarization_by_dataset(dataset_dir)
        del collect_binarization_by_dataset

        sys.path.remove(src_dir)

        del src_dir
        del dataset_dir

        features = datasets.Features({
            "ensemble": datasets.Value("string"),
            "source": datasets.Value("string"),
            "target": datasets.Value("string"),
        })

        for key, examples in tqdm(collection.items(), desc="DIBCO benchmark"):
            sources, targets = zip(*examples)

            sources = sorted(sources)
            targets = sorted(targets)

            dataset = {"source": sources, "target": targets, "ensemble": [key] * len(sources)}
            dataset = datasets.Dataset.from_dict(dataset, features)

            collection[key] = dataset

        collection = datasets.concatenate_datasets([
            dataset for _, dataset in collection.items()
        ])

        features = datasets.Features({
            "ensemble": datasets.Value("string"),
            "labelmap": datasets.Image(),
            "pixelmap": datasets.Image(),
        })

        collection = collection.map(
            preprocessing,
            batched=True,
            features=features,
            remove_columns=["source", "target"]
        )

        collection = collection.class_encode_column("ensemble")

        train_size = 1.0 - FLAGS.test_size

        collection = collection.train_test_split(
            seed=FLAGS.seed,
            shuffle=True,
            stratify_by_column="ensemble",
            train_size=train_size
        )

        collection.save_to_disk(FLAGS.output_dir)


if __name__ == "__main__":
    app.run(main)
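Running the module clones the SauvolaNet repository for its DIBCO collection, builds labelmap/pixelmap pairs, and writes stratified train/test splits in the Hugging Face datasets on-disk format. A minimal sketch of reading the result back, with a hypothetical output directory name:

import datasets

dibco = datasets.load_from_disk("dibco")  # DatasetDict with "train" and "test" splits

example = dibco["train"][0]
example["pixelmap"]   # source document page, decoded to a PIL image
example["labelmap"]   # binary ground-truth mask produced by normalize()
example["ensemble"]   # integer class label for the originating DIBCO ensemble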
3 changes: 3 additions & 0 deletions dataset/requirements.txt
@@ -0,0 +1,3 @@
absl-py~=2.0.0
datasets~=2.14.5
pre-commit~=3.4.0