From 5d60b0681c3ae8abcba65e31ef433c04913a8fb7 Mon Sep 17 00:00:00 2001
From: Jasper <j.siebring92@gmail.com>
Date: Tue, 12 Sep 2023 23:38:40 +0200
Subject: [PATCH 1/8] Added option to specify category_attribute (defaults to
 'category_id'). Also bumped package version.

---
 geococo/coco_processing.py | 6 ++++--
 pyproject.toml             | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/geococo/coco_processing.py b/geococo/coco_processing.py
index 3cfdc10..3406ad4 100644
--- a/geococo/coco_processing.py
+++ b/geococo/coco_processing.py
@@ -27,6 +27,7 @@ def labels_to_dataset(
     src: DatasetReader,
     labels: gpd.GeoDataFrame,
     window_bounds: List[Tuple[int, int]],
+    category_attribute: str = "category_id"
 ) -> CocoDataset:
     """Move across a given geotiff, converting all intersecting labels to COCO
     annotations and appending them to a COCODataset model. This is done through
@@ -50,6 +51,7 @@ def labels_to_dataset(
     :param labels: GeoDataFrame containing labels and class_info
         ('category_id')
     :param window_bounds: a list of window_bounds to attempt to use ()
+    :param category_attribute: Column containing category_id values
     :return: The COCO dataset with appended Images and Annotations
     """
 
@@ -129,7 +131,7 @@ def labels_to_dataset(
 
         # Iteratively add Annotation models to dataset (also bumps next_annotation_id)
         with rasterio.open(window_image_path) as windowed_src:
-            for _, window_label in window_labels.sort_values("category_id").iterrows():
+            for _, window_label in window_labels.sort_values(category_attribute).iterrows():
                 label_mask = mask_label(
                     input_raster=windowed_src, label=window_label.geometry
                 )
@@ -144,7 +146,7 @@ def labels_to_dataset(
                 annotation_instance = Annotation(
                     id=dataset.next_annotation_id,
                     image_id=dataset.next_image_id,
-                    category_id=window_label["category_id"],
+                    category_id=window_label[category_attribute],
                     segmentation=rle,  # type: ignore
                     area=area,
                     bbox=bounding_box,
diff --git a/pyproject.toml b/pyproject.toml
index 0f74671..579a0ca 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "geococo"
-version = "0.2.1"
+version = "0.3.0"
 description = "Converts GIS annotations to Microsoft's Common Objects In Context (COCO) dataset format"
 authors = ["Jasper <j.siebring92@gmail.com>"]
 readme = "README.md"

From 4e652c70e81f24c24b12605521b9939f7da33513 Mon Sep 17 00:00:00 2001
From: Jasper <j.siebring92@gmail.com>
Date: Wed, 13 Sep 2023 00:32:30 +0200
Subject: [PATCH 2/8] Dropped 'Planned features' ref for brevity's sake (there
 are too many planned features this early in development and I don't want to
 update it every release)

---
 README.md | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/README.md b/README.md
index bc36288..5e1be37 100644
--- a/README.md
+++ b/README.md
@@ -158,8 +158,3 @@ session = fo.launch_app(coco_dataset, port=5151)
   <img src="https://github.com/jaspersiebring/GeoCOCO/assets/25051531/f8ab55da-b3cd-4beb-b082-7946e712ea5c" width="45%" height = 250/>
   <img src="https://github.com/jaspersiebring/GeoCOCO/assets/25051531/9a796a54-ffc2-49c3-95bc-59e5c0dd1d7c" width="45%" height = 250 />
 </p>
-
-
-# Planned features
-- [QGIS plugin](https://github.com/jaspersiebring/geococo-qgis-plugin).
-- Data visualization with `pycocotool`'s plotting functionality

From c26527f8c3f1f05e07c51a5d2c44393fd90457dc Mon Sep 17 00:00:00 2001
From: Jasper <j.siebring92@gmail.com>
Date: Wed, 13 Sep 2023 00:59:46 +0200
Subject: [PATCH 3/8] labels_to_dataset now also adds Category instances to the
 COCO dataset. The ids for these instances are set by a category_mapper which
 is built from any existing categories in said dataset.

---
 geococo/coco_models.py     | 28 +++++++++++++++++++++++++++-
 geococo/coco_processing.py | 10 +++++++---
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/geococo/coco_models.py b/geococo/coco_models.py
index 6d38e98..69e93ff 100644
--- a/geococo/coco_models.py
+++ b/geococo/coco_models.py
@@ -2,8 +2,9 @@
 import numpy as np
 import pathlib
 from datetime import datetime
-from typing import List, Optional
+from typing import List, Optional, Union, Dict
 from typing_extensions import TypedDict
+from numpy.typing import ArrayLike
 
 from pydantic import BaseModel, ConfigDict, InstanceOf, model_validator
 from semver.version import Version
@@ -18,14 +19,21 @@ class CocoDataset(BaseModel):
     _next_image_id: int = 1
     _next_annotation_id: int = 1
     _next_source_id: int = 1
+    _category_mapper: Dict = {}
 
     @model_validator(mode="after")
     def _set_ids(self) -> CocoDataset:
         self._next_image_id = len(self.images) + 1
         self._next_annotation_id = len(self.annotations) + 1
         self._next_source_id = len(self.sources)
+        self._category_mapper = self._get_category_mapper()
         return self
 
+    def _get_category_mapper(self) -> Dict:
+        category_data = [(category.name, category.id) for category in self.categories]
+        category_mapper = dict(category_data) if category_data else {}
+        return category_mapper
+
     def add_annotation(self, annotation: Annotation) -> None:
         self.annotations.append(annotation)
         self._next_annotation_id += 1
@@ -47,6 +55,24 @@ def add_source(self, source_path: pathlib.Path) -> None:
 
         self._next_source_id = source.id
 
+    def add_categories(self, categories: ArrayLike[Union[int, str]] np.ndarray) -> None:
+        # filtering existing categories 
+        category_mask = np.isin(categories, self._category_mapper.keys())
+        new_categories = categories[~category_mask]
+
+        # generating mapper from new categories
+        start = len(self._category_mapper.values()) + 1
+        end = start + new_categories.size
+        category_dict = dict(zip(new_categories, np.arange(start, end)))
+
+        # instance and append new Category objects to dataset
+        for category_name, category_id in category_dict.items():
+            category = Category(id = category_id, name = category_name, supercategory="1")
+            self.categories.append(category)
+
+        # update existing category_mapper with new categories
+        self._category_mapper.update(category_dict)
+
     def bump_version(self, bump_method: str) -> None:
         bump_methods = ["patch", "minor", "major"]
         version = Version.parse(self.info.version)
diff --git a/geococo/coco_processing.py b/geococo/coco_processing.py
index 3406ad4..d0205dd 100644
--- a/geococo/coco_processing.py
+++ b/geococo/coco_processing.py
@@ -54,7 +54,7 @@ def labels_to_dataset(
     :param category_attribute: Column containing category_id values
     :return: The COCO dataset with appended Images and Annotations
     """
-
+    
     # Setting nodata and estimating window configuration
     parent_window = window_intersect(input_raster=src, input_vector=labels)
     nodata_value = src.nodata if src.nodata else 0
@@ -68,6 +68,9 @@ def labels_to_dataset(
     
     # bumps major version if images_dir has been used in this dataset before
     dataset.verify_new_output_dir(images_dir=images_dir)
+
+    # sets dataset._category_mapper
+    dataset.add_categories(categories=labels[category_attribute].unique())
     
     for child_window in tqdm(
         window_factory(parent_window=parent_window, schema=schema), total=n_windows
@@ -142,11 +145,12 @@ def labels_to_dataset(
                 bounding_box = cv2.boundingRect(label_mask.astype(np.uint8))
                 area = np.sum(label_mask)
                 iscrowd = 1 if isinstance(window_label.geometry, MultiPolygon) else 0
-
+                category_id= dataset._category_mapper[window_label[category_attribute]]
+                
                 annotation_instance = Annotation(
                     id=dataset.next_annotation_id,
                     image_id=dataset.next_image_id,
-                    category_id=window_label[category_attribute],
+                    category_id=category_id,
                     segmentation=rle,  # type: ignore
                     area=area,
                     bbox=bounding_box,

From 4dd4fd440ce5dd6fa2149e49e300c2d43be6b8f8 Mon Sep 17 00:00:00 2001
From: Jasper <j.siebring92@gmail.com>
Date: Wed, 13 Sep 2023 01:12:20 +0200
Subject: [PATCH 4/8] Added class_names attribute with string values to
 gpd.GeoDataFrame objects returned by fixtures.

---
 tests/conftest.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 8abfcf5..fb921db 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -77,6 +77,8 @@ def overlapping_labels() -> gpd.GeoDataFrame:
 
     crs = CRS.from_epsg(3857)
     classes = [1, 2, 2, 5, 5]
+    class_names = ["One", "Two", "Two", "Five", "Five"]
+
     points = [
         Point(10, -10),
         Point(30, -30),
@@ -88,7 +90,7 @@ def overlapping_labels() -> gpd.GeoDataFrame:
     polygons = [p.buffer(distance=buffers[i]) for i, p in enumerate(points)]
 
     labels = gpd.GeoDataFrame(
-        geometry=polygons, data={"category_id": classes}, crs=crs
+        geometry=polygons, data={"category_id": classes, "class_names": class_names}, crs=crs
     )  # type: ignore
     return labels
 
@@ -100,6 +102,7 @@ def nonoverlapping_labels() -> gpd.GeoDataFrame:
 
     crs = CRS.from_epsg(3857)
     classes = [1, 2, 2, 5, 5]
+    class_names = ["One", "Two", "Two", "Five", "Five"]
     points = [
         Point(510, -510),
         Point(530, -530),
@@ -111,7 +114,7 @@ def nonoverlapping_labels() -> gpd.GeoDataFrame:
     polygons = [p.buffer(distance=buffers[i]) for i, p in enumerate(points)]
 
     labels = gpd.GeoDataFrame(
-        geometry=polygons, data={"category_id": classes}, crs=crs
+        geometry=polygons, data={"category_id": classes, "class_names": class_names}, crs=crs
     )  # type: ignore
     return labels
 

From db3e8f0c2bb05173493a0cd84ef6fd88f354a0af Mon Sep 17 00:00:00 2001
From: Jasper <j.siebring92@gmail.com>
Date: Wed, 13 Sep 2023 03:26:36 +0200
Subject: [PATCH 5/8] Added and refactored tests related to category_attribute
 values.

---
 geococo/coco_models.py        | 11 +++++++----
 geococo/coco_processing.py    |  3 ++-
 geococo/utils.py              | 25 +++++++++++++++++++++++++
 tests/test_coco_models.py     | 29 ++++++++++++++++++++++++++++-
 tests/test_coco_processing.py | 18 ++++++++++--------
 tests/test_utils.py           | 30 ++++++++++++++++++++++++++++++
 6 files changed, 102 insertions(+), 14 deletions(-)

diff --git a/geococo/coco_models.py b/geococo/coco_models.py
index 69e93ff..77c2578 100644
--- a/geococo/coco_models.py
+++ b/geococo/coco_models.py
@@ -8,9 +8,9 @@
 
 from pydantic import BaseModel, ConfigDict, InstanceOf, model_validator
 from semver.version import Version
+from geococo.utils import assert_valid_categories
 
-
-class CocoDataset(BaseModel):
+class CocoDataset(BaseModel):    
     info: Info
     images: List[InstanceOf[Image]] = []
     annotations: List[InstanceOf[Annotation]] = []
@@ -55,7 +55,10 @@ def add_source(self, source_path: pathlib.Path) -> None:
 
         self._next_source_id = source.id
 
-    def add_categories(self, categories: ArrayLike[Union[int, str]] np.ndarray) -> None:
+    def add_categories(self, categories: np.ndarray) -> None:
+        # checking if categories are castable to str and under a certain size
+        categories = assert_valid_categories(categories=np.unique(categories))
+        
         # filtering existing categories 
         category_mask = np.isin(categories, self._category_mapper.keys())
         new_categories = categories[~category_mask]
@@ -67,7 +70,7 @@ def add_categories(self, categories: ArrayLike[Union[int, str]] np.ndarray) -> N
 
         # instance and append new Category objects to dataset
         for category_name, category_id in category_dict.items():
-            category = Category(id = category_id, name = category_name, supercategory="1")
+            category = Category(id = category_id, name = str(category_name), supercategory="1")
             self.categories.append(category)
 
         # update existing category_mapper with new categories
diff --git a/geococo/coco_processing.py b/geococo/coco_processing.py
index d0205dd..3d0b7bc 100644
--- a/geococo/coco_processing.py
+++ b/geococo/coco_processing.py
@@ -145,7 +145,8 @@ def labels_to_dataset(
                 bounding_box = cv2.boundingRect(label_mask.astype(np.uint8))
                 area = np.sum(label_mask)
                 iscrowd = 1 if isinstance(window_label.geometry, MultiPolygon) else 0
-                category_id= dataset._category_mapper[window_label[category_attribute]]
+                category_name = str(window_label[category_attribute])
+                category_id = dataset._category_mapper[category_name]
                 
                 annotation_instance = Annotation(
                     id=dataset.next_annotation_id,
diff --git a/geococo/utils.py b/geococo/utils.py
index 78f2d9e..10e2e8c 100644
--- a/geococo/utils.py
+++ b/geococo/utils.py
@@ -241,3 +241,28 @@ def estimate_schema(
         ) from last_exception
 
     return schema
+
+
+def assert_valid_categories(categories: np.ndarray, max_dtype: str = "<U50") -> np.ndarray:
+    """
+    Checks if all elements in categories array can be represented by strings 
+    of a certain length (defaults to <U50)
+
+    :param categories: numpy array containing category values
+    :param max_dtype: numpy str dtype with char size
+    """
+
+    # checking if categories is castable to str (a prerequisite for class_names)
+    if not isinstance(categories, np.ndarray):
+        raise ValueError("Categories needs to be of type np.ndarray")
+
+    try:
+        str_categories = categories.astype(str)
+    except Exception as e:
+        raise ValueError("Category values need to be castable to str") from e
+    
+    # checking if categories can be castable to str of a certain length (e.g. <U50)
+    if not np.can_cast(str_categories, max_dtype):
+        raise ValueError(f"Category values (str) have to fit in {max_dtype}")
+    
+    return str_categories.astype(max_dtype)
\ No newline at end of file
diff --git a/tests/test_coco_models.py b/tests/test_coco_models.py
index a0fdd76..12aa3e7 100644
--- a/tests/test_coco_models.py
+++ b/tests/test_coco_models.py
@@ -2,6 +2,8 @@
 import numpy as np
 import pathlib
 from datetime import datetime
+import geopandas as gpd
+import pytest
 from geococo.coco_models import (
     CocoDataset,
     Info,
@@ -144,7 +146,7 @@ def test_dataset_add_sources():
     assert dataset.next_source_id == 2
 
 
-def test_dataset_versions(tmp_path: pathlib.Path):
+def test_dataset_versions():
     """Checks proper incrementation of dataset versions"""
 
     dataset = CocoDataset(info=Info())
@@ -161,3 +163,28 @@ def test_dataset_versions(tmp_path: pathlib.Path):
     # major bump: if new output_dir
     dataset.verify_new_output_dir(images_dir=pathlib.Path("b"))
     assert dataset.info.version == "1.0.0"
+
+def test_add_categories():
+    """Checks independent mapping of category_attribute to class_ids"""
+
+    dataset = CocoDataset(info=Info())
+    assert dataset.categories == []
+    assert dataset._category_mapper == {}
+    
+    # adding three unique classes
+    categories = np.array(["A", "B", "B", "E", "E"])
+    dataset.add_categories(categories=categories)
+
+    # checking length and sequential category_ids
+    assert np.unique(categories).size == len(dataset._category_mapper)
+    assert np.unique(categories).size == len(dataset.categories)
+    assert np.all(np.diff(list(dataset._category_mapper.values())) == 1)
+    
+    #check if existing key value pairs don't change 
+    # done by adding a bunch of existing classes and one new one
+    initial_mapper = dataset._category_mapper.copy()
+    categories = np.array(["One", "Two", "Two", "Five", "Five", "Six", "Six"])
+    dataset.add_categories(categories=categories)
+    subset_mapper = {key: value for key, value in dataset._category_mapper.items() if key in initial_mapper.keys()}
+    assert initial_mapper == subset_mapper
+    assert np.all(np.diff(list(dataset._category_mapper.values())) == 1)
\ No newline at end of file
diff --git a/tests/test_coco_processing.py b/tests/test_coco_processing.py
index 97d7ef2..ca814bc 100644
--- a/tests/test_coco_processing.py
+++ b/tests/test_coco_processing.py
@@ -13,6 +13,7 @@ def test_labels_to_dataset_new_dataset(
     overlapping_labels: gpd.GeoDataFrame,
 ) -> None:
     json_path = tmp_path / "dataset.json"
+    category_attribute = "category_id"
 
     with rasterio.open(test_raster) as raster_source:
         # Creating empty CocoDataset as input for labels_to_dataset
@@ -24,13 +25,13 @@ def test_labels_to_dataset_new_dataset(
             src=raster_source,
             labels=overlapping_labels,
             window_bounds=[(256, 256)],
+            category_attribute=category_attribute
         )
 
         # Checking if output has correct classes
-        unique_ann_ids = np.unique([ann.category_id for ann in dataset.annotations])
-        assert np.all(
-            np.isin(unique_ann_ids, overlapping_labels["category_id"].unique())
-        )
+        dataset_class_names =  np.array([cat.name for cat in dataset.categories])
+        labels_class_names = overlapping_labels[category_attribute].unique().astype(str)
+        assert np.all(np.isin(dataset_class_names, labels_class_names))
 
         # Dumping to JSON
         dst_json_data = dataset.model_dump_json()
@@ -51,6 +52,7 @@ def test_labels_to_dataset_append_dataset(
     test_raster: pathlib.Path,
     overlapping_labels: gpd.GeoDataFrame,
 ) -> None:
+    category_attribute = "category_id"
     with rasterio.open(test_raster) as raster_source:
         # Creating empty CocoDataset as input for labels_to_dataset
         info = Info(version="0.0.1", date_created=datetime.now())
@@ -61,13 +63,13 @@ def test_labels_to_dataset_append_dataset(
             src=raster_source,
             labels=overlapping_labels,
             window_bounds=[(256, 256)],
+            category_attribute=category_attribute
         )
 
         # Checking if output has correct classes
-        unique_ann_ids = np.unique([ann.category_id for ann in dataset.annotations])
-        assert np.all(
-            np.isin(unique_ann_ids, overlapping_labels["category_id"].unique())
-        )
+        dataset_class_names =  np.array([cat.name for cat in dataset.categories])
+        labels_class_names = overlapping_labels[category_attribute].unique().astype(str)
+        assert np.all(np.isin(dataset_class_names, labels_class_names))
 
         # Rerunning with existing CocoDataset to verify append
         previous_dataset = dataset.copy(deep=True)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 81d1400..f8f8862 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -12,9 +12,11 @@
     estimate_average_bounds,
     estimate_schema,
     mask_label,
+    assert_valid_categories
 )
 from geococo.window_schema import WindowSchema
 import geopandas as gpd
+from string import ascii_lowercase
 from typing import Tuple
 
 
@@ -280,3 +282,31 @@ def test_window_factory_boundless() -> None:
     )
     assert np.any(window_extents[:, 0] >= window.width)
     assert np.any(window_extents[:, 1] >= window.height)
+
+
+
+def test_assert_valid_categories() -> None:
+    # almost all python objects can be represented by str so we just try casting and verify char length
+    category_lengths = [10, 49, 50]
+    random_words = [ "".join(np.random.choice(list(ascii_lowercase), cl))  for cl in category_lengths]
+    random_words = np.array(random_words)
+    
+    _ = assert_valid_categories(random_words)
+
+    # float64 
+    random_numbers = np.random.randn(3).astype(np.float64)
+    _ = assert_valid_categories(random_numbers)
+
+    # longer than <U50
+    category_lengths = [51, 70, 120]
+    random_words = [ "".join(np.random.choice(list(ascii_lowercase), cl))  for cl in category_lengths]
+    random_words = np.array(random_words)
+
+    with pytest.raises(ValueError):
+        _ = assert_valid_categories(random_words)
+
+    
+
+
+
+    
\ No newline at end of file

From b9a1f3bf9161997180759824bddddfc67ab4bab7 Mon Sep 17 00:00:00 2001
From: Jasper <j.siebring92@gmail.com>
Date: Wed, 13 Sep 2023 03:33:45 +0200
Subject: [PATCH 6/8] Mypy fixes

---
 geococo/coco_models.py | 2 +-
 tests/test_utils.py    | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/geococo/coco_models.py b/geococo/coco_models.py
index 77c2578..603dee6 100644
--- a/geococo/coco_models.py
+++ b/geococo/coco_models.py
@@ -60,7 +60,7 @@ def add_categories(self, categories: np.ndarray) -> None:
         categories = assert_valid_categories(categories=np.unique(categories))
         
         # filtering existing categories 
-        category_mask = np.isin(categories, self._category_mapper.keys())
+        category_mask = np.isin(categories, list(self._category_mapper.keys()))
         new_categories = categories[~category_mask]
 
         # generating mapper from new categories
diff --git a/tests/test_utils.py b/tests/test_utils.py
index f8f8862..3cf79d4 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -288,8 +288,7 @@ def test_window_factory_boundless() -> None:
 def test_assert_valid_categories() -> None:
     # almost all python objects can be represented by str so we just try casting and verify char length
     category_lengths = [10, 49, 50]
-    random_words = [ "".join(np.random.choice(list(ascii_lowercase), cl))  for cl in category_lengths]
-    random_words = np.array(random_words)
+    random_words = np.array(["".join(np.random.choice(list(ascii_lowercase), cl))  for cl in category_lengths])
     
     _ = assert_valid_categories(random_words)
 
@@ -299,8 +298,7 @@ def test_assert_valid_categories() -> None:
 
     # longer than <U50
     category_lengths = [51, 70, 120]
-    random_words = [ "".join(np.random.choice(list(ascii_lowercase), cl))  for cl in category_lengths]
-    random_words = np.array(random_words)
+    random_words = np.array(["".join(np.random.choice(list(ascii_lowercase), cl))  for cl in category_lengths])
 
     with pytest.raises(ValueError):
         _ = assert_valid_categories(random_words)

From 006cd2b0667791a074594ed90f2e5abe2cc7f780 Mon Sep 17 00:00:00 2001
From: Jasper <j.siebring92@gmail.com>
Date: Wed, 13 Sep 2023 03:34:41 +0200
Subject: [PATCH 7/8] Dropped legacy env.yml

---
 full_environment.yml | 22 ----------------------
 1 file changed, 22 deletions(-)
 delete mode 100644 full_environment.yml

diff --git a/full_environment.yml b/full_environment.yml
deleted file mode 100644
index b90fa2d..0000000
--- a/full_environment.yml
+++ /dev/null
@@ -1,22 +0,0 @@
-name: geococo_dev
-channels:
-  - conda-forge
-  - fastai
-dependencies:
-  - python
-  - geopandas
-  - rasterio
-  - pydantic
-  - opencv-python-headless
-  - pycocotools
-  - tqdm
-  - pytest
-  - mypy
-  - black
-  - ruff
-  - pip
-  - pip:
-    - fiftyone
-platforms:
-  - linux-64
-  - win-64
\ No newline at end of file

From 707154b40df7a866d5552f93ecf57542af456c85 Mon Sep 17 00:00:00 2001
From: Jasper <j.siebring92@gmail.com>
Date: Wed, 13 Sep 2023 03:41:29 +0200
Subject: [PATCH 8/8] Formatters (black, docformatter, ruff)

---
 geococo/coco_models.py     | 14 +++---
 geococo/coco_processing.py | 45 +++++++++---------
 geococo/utils.py           | 93 +++++++++++++++++---------------------
 tests/conftest.py          |  8 +++-
 tests/test_coco_models.py  | 30 ++++++------
 tests/test_utils.py        | 32 +++++++------
 6 files changed, 111 insertions(+), 111 deletions(-)

diff --git a/geococo/coco_models.py b/geococo/coco_models.py
index 603dee6..ee04052 100644
--- a/geococo/coco_models.py
+++ b/geococo/coco_models.py
@@ -2,15 +2,15 @@
 import numpy as np
 import pathlib
 from datetime import datetime
-from typing import List, Optional, Union, Dict
+from typing import List, Optional, Dict
 from typing_extensions import TypedDict
-from numpy.typing import ArrayLike
 
 from pydantic import BaseModel, ConfigDict, InstanceOf, model_validator
 from semver.version import Version
 from geococo.utils import assert_valid_categories
 
-class CocoDataset(BaseModel):    
+
+class CocoDataset(BaseModel):
     info: Info
     images: List[InstanceOf[Image]] = []
     annotations: List[InstanceOf[Annotation]] = []
@@ -58,8 +58,8 @@ def add_source(self, source_path: pathlib.Path) -> None:
     def add_categories(self, categories: np.ndarray) -> None:
         # checking if categories are castable to str and under a certain size
         categories = assert_valid_categories(categories=np.unique(categories))
-        
-        # filtering existing categories 
+
+        # filtering existing categories
         category_mask = np.isin(categories, list(self._category_mapper.keys()))
         new_categories = categories[~category_mask]
 
@@ -70,7 +70,9 @@ def add_categories(self, categories: np.ndarray) -> None:
 
         # instance and append new Category objects to dataset
         for category_name, category_id in category_dict.items():
-            category = Category(id = category_id, name = str(category_name), supercategory="1")
+            category = Category(
+                id=category_id, name=str(category_name), supercategory="1"
+            )
             self.categories.append(category)
 
         # update existing category_mapper with new categories
diff --git a/geococo/coco_processing.py b/geococo/coco_processing.py
index 3d0b7bc..ef24802 100644
--- a/geococo/coco_processing.py
+++ b/geococo/coco_processing.py
@@ -27,34 +27,29 @@ def labels_to_dataset(
     src: DatasetReader,
     labels: gpd.GeoDataFrame,
     window_bounds: List[Tuple[int, int]],
-    category_attribute: str = "category_id"
+    category_attribute: str = "category_id",
 ) -> CocoDataset:
     """Move across a given geotiff, converting all intersecting labels to COCO
     annotations and appending them to a COCODataset model. This is done through
-    rasterio.Window objects, the bounds of which you can set with window_bounds
-    (also determines the size of the output images associated with the
-    Annotation instances). The degree of overlap between these windows is
-    determined by the dimensions of the given labels to maximize representation
-    in the resulting dataset.
-
-    The "iscrowd" attribute (see
-    https://cocodataset.org/#format-data)
-    is determined by whether    the respective labels are Polygon or
-    MultiPolygon instances. The "category_id" attribute,    which
-    represents class or category identifiers, is expected to be present
-    in the given labels    GeoDataFrame under the same name.
-
-    :param dataset: CocoDataset model to append images and annotations
-        to
+    rasterio.Window objects, the bounds of which you can set with window_bounds (also
+    determines the size of the output images associated with the Annotation instances).
+    The degree of overlap between these windows is determined by the dimensions of the
+    given labels to maximize representation in the resulting dataset.
+
+    The "iscrowd" attribute (see https://cocodataset.org/#format-data) is determined by
+    whether    the respective labels are Polygon or MultiPolygon instances. The
+    "category_id" attribute,    which represents class or category identifiers, is
+    expected to be present in the given labels    GeoDataFrame under the same name.
+
+    :param dataset: CocoDataset model to append images and annotations to
     :param images_dir: output directory for all label images
     :param src: open rasterio reader for input raster
-    :param labels: GeoDataFrame containing labels and class_info
-        ('category_id')
+    :param labels: GeoDataFrame containing labels and class_info ('category_id')
     :param window_bounds: a list of window_bounds to attempt to use ()
     :param category_attribute: Column containing category_id values
     :return: The COCO dataset with appended Images and Annotations
     """
-    
+
     # Setting nodata and estimating window configuration
     parent_window = window_intersect(input_raster=src, input_vector=labels)
     nodata_value = src.nodata if src.nodata else 0
@@ -62,16 +57,16 @@ def labels_to_dataset(
     coco_profile.update({"dtype": np.uint8, "nodata": nodata_value, "driver": "JPEG"})
     schema = estimate_schema(gdf=labels, src=src, window_bounds=window_bounds)
     n_windows = generate_window_offsets(window=parent_window, schema=schema).shape[0]
-    
+
     # sets dataset.next_source_id and possibly bumps minor version
     dataset.add_source(source_path=pathlib.Path(src.name))
-    
+
     # bumps major version if images_dir has been used in this dataset before
     dataset.verify_new_output_dir(images_dir=images_dir)
 
     # sets dataset._category_mapper
     dataset.add_categories(categories=labels[category_attribute].unique())
-    
+
     for child_window in tqdm(
         window_factory(parent_window=parent_window, schema=schema), total=n_windows
     ):
@@ -134,7 +129,9 @@ def labels_to_dataset(
 
         # Iteratively add Annotation models to dataset (also bumps next_annotation_id)
         with rasterio.open(window_image_path) as windowed_src:
-            for _, window_label in window_labels.sort_values(category_attribute).iterrows():
+            for _, window_label in window_labels.sort_values(
+                category_attribute
+            ).iterrows():
                 label_mask = mask_label(
                     input_raster=windowed_src, label=window_label.geometry
                 )
@@ -147,7 +144,7 @@ def labels_to_dataset(
                 iscrowd = 1 if isinstance(window_label.geometry, MultiPolygon) else 0
                 category_name = str(window_label[category_attribute])
                 category_id = dataset._category_mapper[category_name]
-                
+
                 annotation_instance = Annotation(
                     id=dataset.next_annotation_id,
                     image_id=dataset.next_image_id,
diff --git a/geococo/utils.py b/geococo/utils.py
index 10e2e8c..53948d2 100644
--- a/geococo/utils.py
+++ b/geococo/utils.py
@@ -13,14 +13,11 @@
 def mask_label(
     input_raster: DatasetReader, label: Union[Polygon, MultiPolygon]
 ) -> np.ndarray:
-    """Masks out an label from input_raster and flattens it to a 2D binary
-    array. If it doesn't overlap, the resulting mask will only consist of False
-    bools.
-
-    :param input_raster: open rasterio DatasetReader for the input
-        raster
-    :param label: Polygon object representing the area to be masked
-        (i.e. label)
+    """Masks out an label from input_raster and flattens it to a 2D binary array. If it
+    doesn't overlap, the resulting mask will only consist of False bools.
+
+    :param input_raster: open rasterio DatasetReader for the input raster
+    :param label: Polygon object representing the area to be masked (i.e. label)
     :return: A 2D binary array representing the label
     """
 
@@ -39,14 +36,12 @@ def mask_label(
 def window_intersect(
     input_raster: DatasetReader, input_vector: gpd.GeoDataFrame
 ) -> Window:
-    """Generates a Rasterio Window from the intersecting extents of the input
-    data. It also verifies if the input data share the same CRS and if they
-    physically overlap.
+    """Generates a Rasterio Window from the intersecting extents of the input data. It
+    also verifies if the input data share the same CRS and if they physically overlap.
 
     :param input_raster: rasterio dataset (i.e. input image)
     :param input_vector: geopandas geodataframe (i.e. input labels)
-    :return: rasterio window that represent the intersection between
-        input data extents
+    :return: rasterio window that represent the intersection between input data extents
     """
 
     if input_vector.crs != input_raster.crs:
@@ -73,13 +68,11 @@ def window_intersect(
 def reshape_image(
     img_array: np.ndarray, shape: Tuple[int, int, int], padding_value: int = 0
 ) -> np.ndarray:
-    """Reshapes 3D numpy array to match given 3D shape, done through slicing or
-    padding.
+    """Reshapes 3D numpy array to match given 3D shape, done through slicing or padding.
 
     :param img_array: the numpy array to be reshaped
     :param shape: the desired shape (bands, rows, cols)
-    :param padding_value: what value to pad img_array with (if too
-        small)
+    :param padding_value: what value to pad img_array with (if too small)
     :return: numpy array in desired shape
     """
 
@@ -98,14 +91,14 @@ def reshape_image(
 
 
 def generate_window_polygon(datasource: DatasetReader, window: Window) -> Polygon:
-    """Turns the spatial bounds of a given window to a shapely.Polygon object
-    in a given dataset's CRS.
+    """Turns the spatial bounds of a given window to a shapely.Polygon object in a given
+    dataset's CRS.
 
-    :param datasource: a rasterio DatasetReader object that provides the
-        affine transformation
+    :param datasource: a rasterio DatasetReader object that provides the affine
+        transformation
     :param window: bounds to represent as Polygon
-    :return: shapely Polygon representing the spatial bounds of a given
-        window in a given CRS
+    :return: shapely Polygon representing the spatial bounds of a given window in a
+        given CRS
     """
 
     window_transform = datasource.window_transform(window)
@@ -117,8 +110,7 @@ def generate_window_polygon(datasource: DatasetReader, window: Window) -> Polygo
 def generate_window_offsets(window: Window, schema: WindowSchema) -> np.ndarray:
     """Computes an array of window offsets bound by a given window.
 
-    :param window: the bounding window (i.e. offsets will be within its
-        bounds)
+    :param window: the bounding window (i.e. offsets will be within its bounds)
     :param schema: the parameters for the window generator
     :return: an array of window offsets within the bounds of window
     """
@@ -143,14 +135,14 @@ def generate_window_offsets(window: Window, schema: WindowSchema) -> np.ndarray:
 def window_factory(
     parent_window: Window, schema: WindowSchema, boundless: bool = True
 ) -> Generator[Window, None, None]:
-    """Generator that produces rasterio.Window objects in predetermined steps,
-    within the given Window.
+    """Generator that produces rasterio.Window objects in predetermined steps, within
+    the given Window.
 
-    :param parent_window: the window that provides the bounds for all
-        child_window objects
+    :param parent_window: the window that provides the bounds for all child_window
+        objects
     :param schema: the parameters that determine the window steps
-    :param boundless: whether the child_window should be clipped by the
-        parent_window or not
+    :param boundless: whether the child_window should be clipped by the parent_window or
+        not
     :yield: a rasterio.Window used for windowed reading/writing
     """
 
@@ -174,8 +166,7 @@ def estimate_average_bounds(
 ) -> Tuple[float, float]:
     """Estimates the average size of all features in a GeoDataFrame.
 
-    :param gdf: GeoDataFrame that contains all features (i.e.
-        shapely.Geometry objects)
+    :param gdf: GeoDataFrame that contains all features (i.e. shapely.Geometry objects)
     :param quantile: what quantile will represent the feature population
     :return: a tuple of floats representing average width and height
     """
@@ -195,19 +186,16 @@ def estimate_schema(
     quantile: float = 0.9,
     window_bounds: List[Tuple[int, int]] = [(256, 256), (512, 512)],
 ) -> WindowSchema:
-    """Attempts to find a schema that is able to represent the average
-    GeoDataFrame feature (i.e. sufficient overlap) but within the bounds given
-    by window_bounds.
-
-    :param gdf: GeoDataFrame that contains features that determine the
-        degree of overlap
-    :param src: The rasterio DataSource associated with the resulting
-        schema (i.e. bounds and pixelsizes)
+    """Attempts to find a schema that is able to represent the average GeoDataFrame
+    feature (i.e. sufficient overlap) but within the bounds given by window_bounds.
+
+    :param gdf: GeoDataFrame that contains features that determine the degree of overlap
+    :param src: The rasterio DataSource associated with the resulting schema (i.e.
+        bounds and pixelsizes)
     :param quantile: what quantile will represent the feature population
-    :param window_bounds: a list of possible limits for the window
-        generators
-    :return: (if found) a viable WindowSchema with sufficient overlap
-        within the window_bounds
+    :param window_bounds: a list of possible limits for the window generators
+    :return: (if found) a viable WindowSchema with sufficient overlap within the
+        window_bounds
     """
 
     # estimating the required overlap between windows for labels to be represented fully
@@ -243,10 +231,11 @@ def estimate_schema(
     return schema
 
 
-def assert_valid_categories(categories: np.ndarray, max_dtype: str = "<U50") -> np.ndarray:
-    """
-    Checks if all elements in categories array can be represented by strings 
-    of a certain length (defaults to <U50)
+def assert_valid_categories(
+    categories: np.ndarray, max_dtype: str = "<U50"
+) -> np.ndarray:
+    """Checks if all elements in categories array can be represented by strings of a
+    certain length (defaults to <U50)
 
     :param categories: numpy array containing category values
     :param max_dtype: numpy str dtype with char size
@@ -260,9 +249,9 @@ def assert_valid_categories(categories: np.ndarray, max_dtype: str = "<U50") ->
         str_categories = categories.astype(str)
     except Exception as e:
         raise ValueError("Category values need to be castable to str") from e
-    
+
     # checking if categories can be castable to str of a certain length (e.g. <U50)
     if not np.can_cast(str_categories, max_dtype):
         raise ValueError(f"Category values (str) have to fit in {max_dtype}")
-    
-    return str_categories.astype(max_dtype)
\ No newline at end of file
+
+    return str_categories.astype(max_dtype)
diff --git a/tests/conftest.py b/tests/conftest.py
index fb921db..0b19e67 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -90,7 +90,9 @@ def overlapping_labels() -> gpd.GeoDataFrame:
     polygons = [p.buffer(distance=buffers[i]) for i, p in enumerate(points)]
 
     labels = gpd.GeoDataFrame(
-        geometry=polygons, data={"category_id": classes, "class_names": class_names}, crs=crs
+        geometry=polygons,
+        data={"category_id": classes, "class_names": class_names},
+        crs=crs,
     )  # type: ignore
     return labels
 
@@ -114,7 +116,9 @@ def nonoverlapping_labels() -> gpd.GeoDataFrame:
     polygons = [p.buffer(distance=buffers[i]) for i, p in enumerate(points)]
 
     labels = gpd.GeoDataFrame(
-        geometry=polygons, data={"category_id": classes, "class_names": class_names}, crs=crs
+        geometry=polygons,
+        data={"category_id": classes, "class_names": class_names},
+        crs=crs,
     )  # type: ignore
     return labels
 
diff --git a/tests/test_coco_models.py b/tests/test_coco_models.py
index 12aa3e7..a7c8e55 100644
--- a/tests/test_coco_models.py
+++ b/tests/test_coco_models.py
@@ -2,8 +2,6 @@
 import numpy as np
 import pathlib
 from datetime import datetime
-import geopandas as gpd
-import pytest
 from geococo.coco_models import (
     CocoDataset,
     Info,
@@ -62,7 +60,7 @@ def test_dataset_add_images():
     dataset = CocoDataset(info=Info())
     assert dataset.next_annotation_id == 1
     assert dataset.next_image_id == 1
-    
+
     n_images = np.random.randint(2, 10)
 
     for _ in range(n_images):
@@ -78,6 +76,7 @@ def test_dataset_add_images():
     assert n_images == dataset.next_image_id - 1
     assert n_images == len(dataset.images)
 
+
 def test_info():
     """Simple instance test."""
 
@@ -132,9 +131,9 @@ def test_source():
 
 
 def test_dataset_add_sources():
-    """Checks proper incrementation of source_id"""
+    """Checks proper incrementation of source_id."""
 
-    # Bit different from the other ids since we check for duplication 
+    # Bit different from the other ids since we check for duplication
     # and only increment if new
     dataset = CocoDataset(info=Info())
     assert dataset.next_source_id == 0
@@ -147,11 +146,11 @@ def test_dataset_add_sources():
 
 
 def test_dataset_versions():
-    """Checks proper incrementation of dataset versions"""
+    """Checks proper incrementation of dataset versions."""
 
     dataset = CocoDataset(info=Info())
     assert dataset.info.version == "0.0.0"
-    
+
     # minor bump if same output_dir but different raster_source
     dataset.add_source(source_path=pathlib.Path("a"))
     assert dataset.info.version == "0.1.0"
@@ -164,13 +163,14 @@ def test_dataset_versions():
     dataset.verify_new_output_dir(images_dir=pathlib.Path("b"))
     assert dataset.info.version == "1.0.0"
 
+
 def test_add_categories():
-    """Checks independent mapping of category_attribute to class_ids"""
+    """Checks independent mapping of category_attribute to class_ids."""
 
     dataset = CocoDataset(info=Info())
     assert dataset.categories == []
     assert dataset._category_mapper == {}
-    
+
     # adding three unique classes
     categories = np.array(["A", "B", "B", "E", "E"])
     dataset.add_categories(categories=categories)
@@ -179,12 +179,16 @@ def test_add_categories():
     assert np.unique(categories).size == len(dataset._category_mapper)
     assert np.unique(categories).size == len(dataset.categories)
     assert np.all(np.diff(list(dataset._category_mapper.values())) == 1)
-    
-    #check if existing key value pairs don't change 
+
+    # check if existing key value pairs don't change
     # done by adding a bunch of existing classes and one new one
     initial_mapper = dataset._category_mapper.copy()
     categories = np.array(["One", "Two", "Two", "Five", "Five", "Six", "Six"])
     dataset.add_categories(categories=categories)
-    subset_mapper = {key: value for key, value in dataset._category_mapper.items() if key in initial_mapper.keys()}
+    subset_mapper = {
+        key: value
+        for key, value in dataset._category_mapper.items()
+        if key in initial_mapper.keys()
+    }
     assert initial_mapper == subset_mapper
-    assert np.all(np.diff(list(dataset._category_mapper.values())) == 1)
\ No newline at end of file
+    assert np.all(np.diff(list(dataset._category_mapper.values())) == 1)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 3cf79d4..9198e35 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -12,7 +12,7 @@
     estimate_average_bounds,
     estimate_schema,
     mask_label,
-    assert_valid_categories
+    assert_valid_categories,
 )
 from geococo.window_schema import WindowSchema
 import geopandas as gpd
@@ -57,7 +57,7 @@ def test_window_intersect(
     overlapping_labels: gpd.GeoDataFrame, test_raster: pathlib.Path
 ) -> None:
     with rasterio.open(test_raster) as raster_source:
-        # outer polygons have diameter of 2 so resulting window 
+        # outer polygons have diameter of 2 so resulting window
         # is 82x82 and offset is 9 (10-2/2)
         window = window_intersect(
             input_raster=raster_source, input_vector=overlapping_labels
@@ -284,27 +284,31 @@ def test_window_factory_boundless() -> None:
     assert np.any(window_extents[:, 1] >= window.height)
 
 
-
 def test_assert_valid_categories() -> None:
-    # almost all python objects can be represented by str so we just try casting and verify char length
+    # almost all python objects can be represented by str
+    # so we just try casting and verify char length
     category_lengths = [10, 49, 50]
-    random_words = np.array(["".join(np.random.choice(list(ascii_lowercase), cl))  for cl in category_lengths])
-    
+    random_words = np.array(
+        [
+            "".join(np.random.choice(list(ascii_lowercase), cl))
+            for cl in category_lengths
+        ]
+    )
+
     _ = assert_valid_categories(random_words)
 
-    # float64 
+    # float64
     random_numbers = np.random.randn(3).astype(np.float64)
     _ = assert_valid_categories(random_numbers)
 
     # longer than <U50
     category_lengths = [51, 70, 120]
-    random_words = np.array(["".join(np.random.choice(list(ascii_lowercase), cl))  for cl in category_lengths])
+    random_words = np.array(
+        [
+            "".join(np.random.choice(list(ascii_lowercase), cl))
+            for cl in category_lengths
+        ]
+    )
 
     with pytest.raises(ValueError):
         _ = assert_valid_categories(random_words)
-
-    
-
-
-
-    
\ No newline at end of file