Merge pull request #6 from jaspersiebring/supercategory
Added (super)category mapping and input data validation
jaspersiebring authored Sep 19, 2023
2 parents 1ad95dd + 148bdf5 commit 1274288
Showing 14 changed files with 1,508 additions and 505 deletions.
4 changes: 2 additions & 2 deletions geococo/__init__.py
@@ -16,7 +16,7 @@
warnings.filterwarnings(
    "error",
    "The given matrix is equal to Affine.identity or its flipped counterpart. GDAL may"
-    " ignore this matrix and save no geotransform without raising an error. This "
-    "behavior is somewhat driver-specific.",
+    " ignore this matrix and save no geotransform without raising an error. This "
+    "behavior is somewhat driver-specific.",
    category=NotGeoreferencedWarning,
)
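
For context: warnings.filterwarnings("error", ...) escalates any warning whose message matches the given pattern into an exception. A minimal sketch of that mechanism, not part of this commit, assuming rasterio is installed (it provides NotGeoreferencedWarning):

import warnings

from rasterio.errors import NotGeoreferencedWarning

# Escalate matching warnings to exceptions (same pattern as the filter above)
warnings.filterwarnings(
    "error",
    "The given matrix is equal to Affine.identity or its flipped counterpart.*",
    category=NotGeoreferencedWarning,
)

try:
    # Emitting the warning now raises it instead of printing it
    warnings.warn(
        "The given matrix is equal to Affine.identity or its flipped counterpart.",
        NotGeoreferencedWarning,
    )
except NotGeoreferencedWarning:
    print("escalated to an exception, as configured")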
36 changes: 17 additions & 19 deletions geococo/cli.py
@@ -18,30 +18,28 @@ def build_coco(
) -> None:
    """Transform your GIS annotations into a COCO dataset.
-    This method generates a COCO dataset by moving across the given
-    image (image_path) with a moving window (image_size), constantly
-    checking for intersecting annotations (labels_path) that represent
-    image objects in said image (e.g. buildings in satellite imagery;
-    denoted by category_attribute). Each valid intersection will add n
-    Annotations entries to the dataset (json_path) and save a subset of
-    the input image that contained these entries (output_dir).
+    This method generates a COCO dataset by moving across the given image (image_path)
+    with a moving window (image_size), constantly checking for intersecting annotations
+    (labels_path) that represent image objects in said image (e.g. buildings in
+    satellite imagery; denoted by category_attribute). Each valid intersection will add
+    n Annotations entries to the dataset (json_path) and save a subset of the input
+    image that contained these entries (output_dir).
-    The output data size depends on your input labels, as the moving
-    window adjusts its step size to accommodate the average annotation
-    size, optimizing dataset representation and minimizing tool
-    configuration.
+    The output data size depends on your input labels, as the moving window adjusts its
+    step size to accommodate the average annotation size, optimizing dataset
+    representation and minimizing tool configuration.
-    :param image_path: Path to the geospatial image containing image
-        objects (e.g. buildings in satellite imagery)
-    :param labels_path: Path to the annotations representing these image
-        objects (='category_id')
-    :param json_path: Path to the json file that will store the COCO
-        dataset (will be appended to if already exists)
+    :param image_path: Path to the geospatial image containing image objects (e.g.
+        buildings in satellite imagery)
+    :param labels_path: Path to the annotations representing these image objects
+        (='category_id')
+    :param json_path: Path to the json file that will store the COCO dataset (will be
+        appended to if already exists)
    :param output_dir: Path to the output directory for image subsets
    :param width: Width of the output images
    :param height: Height of the output images
-    :param category_attribute: Column that contains category_id values
-        per annotation feature
+    :param category_attribute: Column that contains category_id values per annotation
+        feature
    """

    if isinstance(json_path, pathlib.Path) and json_path.exists():
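
A hedged sketch of a programmatic call with the parameters documented above (assuming build_coco is importable as a plain function from this file; the argument values are made up):

import pathlib

from geococo.cli import build_coco

build_coco(
    image_path=pathlib.Path("satellite.tif"),    # image containing target objects
    labels_path=pathlib.Path("buildings.gpkg"),  # vector annotations
    json_path=pathlib.Path("dataset.json"),      # appended to if it already exists
    output_dir=pathlib.Path("subsets/"),         # image subsets land here
    width=512,
    height=512,
    category_attribute="category_id",            # column holding category ids
)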
32 changes: 14 additions & 18 deletions geococo/coco_manager.py
@@ -5,18 +5,17 @@


def load_dataset(json_path: pathlib.Path) -> CocoDataset:
-    """Dumps the contents of json_path as a string, interprets it as a
-    CocoDataset model and returns it.
+    """Dumps the contents of json_path as a string, interprets it as a CocoDataset model
+    and returns it.
-    :param json_path: path to the JSON file containing the json-encoded
-        COCO dataset
-    :return: An instance of CocoDataset with loaded Image- and
-        Annotation objects from json_path
+    :param json_path: path to the JSON file containing the json-encoded COCO dataset
+    :return: An instance of CocoDataset with loaded Image- and Annotation objects from
+        json_path
    """

    with open(json_path, mode="r", encoding="utf-8") as json_fp:
        json_data = json_fp.read()
-    dataset = CocoDataset.model_validate_json(json_data)
+    dataset = CocoDataset.parse_raw(json_data)
    return dataset
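
Worth noting: this hunk swaps pydantic v2's model_validate_json for the v1-era parse_raw, in line with the "pydantic <2.0.0" comment added in coco_models.py below. A version-tolerant sketch (not part of this commit; the shim name is made up):

import pydantic

from geococo.coco_models import CocoDataset

def load_json_compat(json_data: str) -> CocoDataset:
    # pydantic v1 exposes parse_raw; v2 renames it to model_validate_json
    if pydantic.VERSION.startswith("1."):
        return CocoDataset.parse_raw(json_data)
    return CocoDataset.model_validate_json(json_data)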


@@ -26,17 +25,15 @@ def create_dataset(
    version: str = str(Version(major=0)),
    date_created: datetime = datetime.now(),
) -> CocoDataset:
-    """
-    Instances and returns a new CocoDataset model with given kwargs.
+    """Instances and returns a new CocoDataset model with given kwargs.
    :param description: Description of your COCO dataset
-    :param contributor: Main contributors of your COCO dataset, its
-        images and its annotations
+    :param contributor: Main contributors of your COCO dataset, its images and its
+        annotations
    :param version: Initial SemVer version (defaults to 0.0.0)
-    :param date_created: Date when dataset was initially created,
-        defaults to datetime.now()
-    :return: An instance of CocoDataset without Image- and Annotation
-        objects
+    :param date_created: Date when dataset was initially created, defaults to
+        datetime.now()
+    :return: An instance of CocoDataset without Image- and Annotation objects
    """

    info = Info(
@@ -54,10 +51,9 @@ def save_dataset(dataset: CocoDataset, json_path: pathlib.Path) -> None:
"""JSON-encodes an instance of CocoDataset and saves it to json_path.
:param dataset: An instance of CocoDataset
:param json_path: where to save the JSON-encoded CocoDataset
instance to
:param json_path: where to save the JSON-encoded CocoDataset instance to
"""

json_data = dataset.model_dump_json()
json_data = dataset.json()
with open(json_path, mode="w", encoding="utf-8") as dst:
dst.write(json_data)
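
Taken together, the three helpers round-trip a dataset through disk. A small usage sketch (file name and Info field values are illustrative):

import pathlib

from geococo.coco_manager import create_dataset, load_dataset, save_dataset

dataset = create_dataset(description="demo", contributor="geococo")
save_dataset(dataset=dataset, json_path=pathlib.Path("dataset.json"))
reloaded = load_dataset(json_path=pathlib.Path("dataset.json"))
assert reloaded.info.version == dataset.info.version  # survives the round trip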
162 changes: 98 additions & 64 deletions geococo/coco_models.py
@@ -1,46 +1,39 @@
from __future__ import annotations
import numpy as np
import pathlib
+import pandas as pd
from datetime import datetime
-from typing import List, Optional, Dict
+from typing import List, Optional, Dict, Any
from typing_extensions import TypedDict

-from pydantic import BaseModel, ConfigDict, InstanceOf, model_validator
+from pydantic import BaseModel, root_validator
+from pydantic.fields import Field
from semver.version import Version
-from geococo.utils import assert_valid_categories


class CocoDataset(BaseModel):
    info: Info
-    images: List[InstanceOf[Image]] = []
-    annotations: List[InstanceOf[Annotation]] = []
-    categories: List[InstanceOf[Category]] = []
-    sources: List[InstanceOf[Source]] = []
-    _next_image_id: int = 1
-    _next_annotation_id: int = 1
-    _next_source_id: int = 1
-    _category_mapper: Dict = {}
-
-    @model_validator(mode="after")
-    def _set_ids(self) -> CocoDataset:
-        self._next_image_id = len(self.images) + 1
-        self._next_annotation_id = len(self.annotations) + 1
-        self._next_source_id = len(self.sources)
-        self._category_mapper = self._get_category_mapper()
-        return self
-
-    def _get_category_mapper(self) -> Dict:
-        category_data = [(category.name, category.id) for category in self.categories]
-        category_mapper = dict(category_data) if category_data else {}
-        return category_mapper
+    images: List[Image] = []
+    annotations: List[Annotation] = []
+    categories: List[Category] = []
+    sources: List[Source] = []
+    next_image_id: int = Field(default=1, exclude=True)
+    next_annotation_id: int = Field(default=1, exclude=True)
+    next_source_id: int = Field(default=1, exclude=True)
+
+    @root_validator
+    def _set_ids(cls: CocoDataset, values: Dict[str, Any]) -> Dict[str, Any]:
+        values["next_image_id"] = len(values["images"]) + 1
+        values["next_annotation_id"] = len(values["annotations"]) + 1
+        values["next_source_id"] = len(values["sources"])
+        return values

    def add_annotation(self, annotation: Annotation) -> None:
        self.annotations.append(annotation)
-        self._next_annotation_id += 1
+        self.next_annotation_id += 1

    def add_image(self, image: Image) -> None:
        self.images.append(image)
-        self._next_image_id += 1
+        self.next_image_id += 1

    def add_source(self, source_path: pathlib.Path) -> None:
        sources = [ssrc for ssrc in self.sources if ssrc.file_name == source_path]
@@ -53,31 +46,78 @@ def add_source(self, source_path: pathlib.Path) -> None:
            self.sources.append(source)
            self.bump_version(bump_method="minor")

-        self._next_source_id = source.id
-
-    def add_categories(self, categories: np.ndarray) -> None:
-        # checking if categories are castable to str and under a certain size
-        categories = assert_valid_categories(categories=np.unique(categories))
-
-        # filtering existing categories
-        category_mask = np.isin(categories, list(self._category_mapper.keys()))
-        new_categories = categories[~category_mask]
-
-        # generating mapper from new categories
-        start = len(self._category_mapper.values()) + 1
-        end = start + new_categories.size
-        category_dict = dict(zip(new_categories, np.arange(start, end)))
+        self.next_source_id = source.id
+
+    def add_categories(
+        self,
+        category_ids: Optional[np.ndarray],
+        category_names: Optional[np.ndarray],
+        supercategory_names: Optional[np.ndarray],
+    ) -> None:
+        # initializing values
+        super_default = "1"
+        names_present = ids_present = False
+
+        # Loading all existing Category instances as a single dataframe
+        category_pd = pd.DataFrame(
+            [category.dict() for category in self.categories],
+            columns=Category.schema()["properties"].keys(),
+        )
+
+        # checking if names can be assigned to uid_array (used to check duplicates)
+        if isinstance(category_names, np.ndarray):
+            uid_array = category_names
+            uid_attribute = "name"
+            names_present = True
+
+        # checking if ids can be assigned to uid_array (used to check duplicates)
+        if isinstance(category_ids, np.ndarray):
+            uid_array = category_ids  # overrides existing array because ids are leading
+            uid_attribute = "id"
+            ids_present = True
+        if not names_present and not ids_present:
+            raise AttributeError("At least one category attribute must be present")
+
+        # masking out duplicate values and exiting if all duplicates
+        original_shape = uid_array.shape
+        _, indices = np.unique(uid_array, return_index=True)
+        uid_array = uid_array[indices]
+        member_mask = np.isin(uid_array, category_pd[uid_attribute])
+        new_members = uid_array[~member_mask]
+        new_shape = new_members.shape
+        if new_shape[0] == 0:
+            return
+
+        # creating default supercategory_names if not given
+        if not isinstance(supercategory_names, np.ndarray):
+            supercategory_names = np.full(shape=new_shape, fill_value=super_default)
+        else:
+            assert supercategory_names.shape == original_shape
+            supercategory_names = supercategory_names[indices][~member_mask]

+        # creating default category_names if not given (str version of ids)
+        if ids_present and not names_present:
+            category_names = new_members.astype(str)
+            category_ids = new_members
+        # creating ids if not given (incremental sequence starting from last known id)
+        elif names_present and not ids_present:
+            pandas_mask = category_pd[uid_attribute].isin(uid_array[member_mask])
+            max_id = category_pd.loc[pandas_mask, "id"].max()
+            start = np.nansum([max_id, 1])
+            end = start + new_members.size
+            category_ids = np.arange(start, end)
+            category_names = new_members
+        # ensuring equal size for category names and ids (if given)
+        else:
+            assert category_names.shape == original_shape  # type: ignore
+            category_names = category_names[indices][~member_mask]  # type: ignore
+            category_ids = new_members

-        # instance and append new Category objects to dataset
-        for category_name, category_id in category_dict.items():
-            category = Category(
-                id=category_id, name=str(category_name), supercategory="1"
-            )
+        # iteratively instancing and appending Category from set ids, names and supers
+        for cid, name, super in zip(category_ids, category_names, supercategory_names):
+            category = Category(id=cid, name=name, supercategory=super)
            self.categories.append(category)

-        # update existing category_mapper with new categories
-        self._category_mapper.update(category_dict)
-
    def bump_version(self, bump_method: str) -> None:
        bump_methods = ["patch", "minor", "major"]
        version = Version.parse(self.info.version)
@@ -98,18 +138,6 @@ def verify_new_output_dir(self, images_dir: pathlib.Path) -> None:
        if images_dir not in output_dirs:
            self.bump_version(bump_method="major")

-    @property
-    def next_image_id(self) -> int:
-        return self._next_image_id
-
-    @property
-    def next_annotation_id(self) -> int:
-        return self._next_annotation_id
-
-    @property
-    def next_source_id(self) -> int:
-        return self._next_source_id


class Info(BaseModel):
    version: str = str(Version(major=0))
@@ -120,7 +148,6 @@ class Info(BaseModel):


class Image(BaseModel):
-    model_config = ConfigDict(frozen=True)
    id: int
    width: int
    height: int
@@ -129,7 +156,6 @@ class Image(BaseModel):


class Annotation(BaseModel):
-    model_config = ConfigDict(frozen=True)
    id: int
    image_id: int
    category_id: int
@@ -140,7 +166,6 @@ class Annotation(BaseModel):


class Category(BaseModel):
-    model_config = ConfigDict(frozen=True)
    id: int
    name: str
    supercategory: str
@@ -154,3 +179,12 @@ class RleDict(TypedDict):
class Source(BaseModel):
id: int
file_name: pathlib.Path


# Call update_forward_refs() to resolve forward references (for pydantic <2.0.0)
CocoDataset.update_forward_refs()
Info.update_forward_refs()
Image.update_forward_refs()
Annotation.update_forward_refs()
Category.update_forward_refs()
Source.update_forward_refs()
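
A hedged sketch of the new (super)category mapping added in this commit (array values are made up; create_dataset comes from coco_manager.py above):

import numpy as np

from geococo.coco_manager import create_dataset

dataset = create_dataset(description="demo", contributor="geococo")
dataset.add_categories(
    category_ids=np.array([1, 2]),
    category_names=np.array(["building", "road"]),
    supercategory_names=np.array(["construction", "infrastructure"]),
)
# Duplicates are dropped, missing names fall back to str(id), missing ids
# continue from the last known id, and a missing supercategory defaults to "1".
assert [c.name for c in dataset.categories] == ["building", "road"]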