Merge pull request #6 from jaspersiebring/supercategory
Added (super)category mapping and input data validation
jaspersiebring authored Sep 19, 2023
2 parents 1ad95dd + 148bdf5 commit 1274288
Showing 14 changed files with 1,508 additions and 505 deletions.
4 changes: 2 additions & 2 deletions geococo/__init__.py
@@ -16,7 +16,7 @@
warnings.filterwarnings(
    "error",
    "The given matrix is equal to Affine.identity or its flipped counterpart. GDAL may"
-    " ignore this matrix and save no geotransform without raising an error. This "
-    "behavior is somewhat driver-specific.",
+    " ignore this matrix and save no geotransform without raising an error. This "
+    "behavior is somewhat driver-specific.",
    category=NotGeoreferencedWarning,
)
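
For context: warnings.filterwarnings("error", ...) escalates any warning whose message matches the given pattern into an exception. A minimal sketch of that mechanism, not part of this commit, assuming rasterio is installed (it provides NotGeoreferencedWarning):

import warnings

from rasterio.errors import NotGeoreferencedWarning

# Escalate matching warnings to exceptions (same pattern as the filter above)
warnings.filterwarnings(
    "error",
    "The given matrix is equal to Affine.identity or its flipped counterpart.*",
    category=NotGeoreferencedWarning,
)

try:
    # Emitting the warning now raises it instead of printing it
    warnings.warn(
        "The given matrix is equal to Affine.identity or its flipped counterpart.",
        NotGeoreferencedWarning,
    )
except NotGeoreferencedWarning:
    print("escalated to an exception, as configured")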
36 changes: 17 additions & 19 deletions geococo/cli.py
@@ -18,30 +18,28 @@ def build_coco(
) -> None:
    """Transform your GIS annotations into a COCO dataset.
-    This method generates a COCO dataset by moving across the given
-    image (image_path) with a moving window (image_size), constantly
-    checking for intersecting annotations (labels_path) that represent
-    image objects in said image (e.g. buildings in satellite imagery;
-    denoted by category_attribute). Each valid intersection will add n
-    Annotations entries to the dataset (json_path) and save a subset of
-    the input image that contained these entries (output_dir).
+    This method generates a COCO dataset by moving across the given image (image_path)
+    with a moving window (image_size), constantly checking for intersecting annotations
+    (labels_path) that represent image objects in said image (e.g. buildings in
+    satellite imagery; denoted by category_attribute). Each valid intersection will add
+    n Annotations entries to the dataset (json_path) and save a subset of the input
+    image that contained these entries (output_dir).
-    The output data size depends on your input labels, as the moving
-    window adjusts its step size to accommodate the average annotation
-    size, optimizing dataset representation and minimizing tool
-    configuration.
+    The output data size depends on your input labels, as the moving window adjusts its
+    step size to accommodate the average annotation size, optimizing dataset
+    representation and minimizing tool configuration.
-    :param image_path: Path to the geospatial image containing image
-        objects (e.g. buildings in satellite imagery)
-    :param labels_path: Path to the annotations representing these image
-        objects (='category_id')
-    :param json_path: Path to the json file that will store the COCO
-        dataset (will be appended to if already exists)
+    :param image_path: Path to the geospatial image containing image objects (e.g.
+        buildings in satellite imagery)
+    :param labels_path: Path to the annotations representing these image objects
+        (='category_id')
+    :param json_path: Path to the json file that will store the COCO dataset (will be
+        appended to if already exists)
    :param output_dir: Path to the output directory for image subsets
    :param width: Width of the output images
    :param height: Height of the output images
-    :param category_attribute: Column that contains category_id values
-        per annotation feature
+    :param category_attribute: Column that contains category_id values per annotation
+        feature
    """

    if isinstance(json_path, pathlib.Path) and json_path.exists():
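
A hedged sketch of a programmatic call with the parameters documented above (assuming build_coco is importable as a plain function from this file; the argument values are made up):

import pathlib

from geococo.cli import build_coco

build_coco(
    image_path=pathlib.Path("satellite.tif"),    # image containing target objects
    labels_path=pathlib.Path("buildings.gpkg"),  # vector annotations
    json_path=pathlib.Path("dataset.json"),      # appended to if it already exists
    output_dir=pathlib.Path("subsets/"),         # image subsets land here
    width=512,
    height=512,
    category_attribute="category_id",            # column holding category ids
)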
32 changes: 14 additions & 18 deletions geococo/coco_manager.py
@@ -5,18 +5,17 @@


def load_dataset(json_path: pathlib.Path) -> CocoDataset:
-    """Dumps the contents of json_path as a string, interprets it as a
-    CocoDataset model and returns it.
+    """Dumps the contents of json_path as a string, interprets it as a CocoDataset model
+    and returns it.
-    :param json_path: path to the JSON file containing the json-encoded
-        COCO dataset
-    :return: An instance of CocoDataset with loaded Image- and
-        Annotation objects from json_path
+    :param json_path: path to the JSON file containing the json-encoded COCO dataset
+    :return: An instance of CocoDataset with loaded Image- and Annotation objects from
+        json_path
    """

    with open(json_path, mode="r", encoding="utf-8") as json_fp:
        json_data = json_fp.read()
-    dataset = CocoDataset.model_validate_json(json_data)
+    dataset = CocoDataset.parse_raw(json_data)
    return dataset
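
Worth noting: this hunk swaps pydantic v2's model_validate_json for the v1-era parse_raw, in line with the "pydantic <2.0.0" comment added in coco_models.py below. A version-tolerant sketch (not part of this commit; the shim name is made up):

import pydantic

from geococo.coco_models import CocoDataset

def load_json_compat(json_data: str) -> CocoDataset:
    # pydantic v1 exposes parse_raw; v2 renames it to model_validate_json
    if pydantic.VERSION.startswith("1."):
        return CocoDataset.parse_raw(json_data)
    return CocoDataset.model_validate_json(json_data)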


@@ -26,17 +25,15 @@ def create_dataset(
    version: str = str(Version(major=0)),
    date_created: datetime = datetime.now(),
) -> CocoDataset:
-    """
-    Instances and returns a new CocoDataset model with given kwargs.
+    """Instances and returns a new CocoDataset model with given kwargs.
    :param description: Description of your COCO dataset
-    :param contributor: Main contributors of your COCO dataset, its
-        images and its annotations
+    :param contributor: Main contributors of your COCO dataset, its images and its
+        annotations
    :param version: Initial SemVer version (defaults to 0.0.0)
-    :param date_created: Date when dataset was initially created,
-        defaults to datetime.now()
-    :return: An instance of CocoDataset without Image- and Annotation
-        objects
+    :param date_created: Date when dataset was initially created, defaults to
+        datetime.now()
+    :return: An instance of CocoDataset without Image- and Annotation objects
    """

    info = Info(
@@ -54,10 +51,9 @@ def save_dataset(dataset: CocoDataset, json_path: pathlib.Path) -> None:
"""JSON-encodes an instance of CocoDataset and saves it to json_path.
:param dataset: An instance of CocoDataset
:param json_path: where to save the JSON-encoded CocoDataset
instance to
:param json_path: where to save the JSON-encoded CocoDataset instance to
"""

json_data = dataset.model_dump_json()
json_data = dataset.json()
with open(json_path, mode="w", encoding="utf-8") as dst:
dst.write(json_data)
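
Taken together, the three helpers round-trip a dataset through disk. A small usage sketch (file name and Info field values are illustrative):

import pathlib

from geococo.coco_manager import create_dataset, load_dataset, save_dataset

dataset = create_dataset(description="demo", contributor="geococo")
save_dataset(dataset=dataset, json_path=pathlib.Path("dataset.json"))
reloaded = load_dataset(json_path=pathlib.Path("dataset.json"))
assert reloaded.info.version == dataset.info.version  # survives the round trip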
162 changes: 98 additions & 64 deletions geococo/coco_models.py
@@ -1,46 +1,39 @@
from __future__ import annotations
import numpy as np
import pathlib
+import pandas as pd
from datetime import datetime
-from typing import List, Optional, Dict
+from typing import List, Optional, Dict, Any
from typing_extensions import TypedDict

-from pydantic import BaseModel, ConfigDict, InstanceOf, model_validator
+from pydantic import BaseModel, root_validator
+from pydantic.fields import Field
from semver.version import Version
-from geococo.utils import assert_valid_categories


class CocoDataset(BaseModel):
    info: Info
-    images: List[InstanceOf[Image]] = []
-    annotations: List[InstanceOf[Annotation]] = []
-    categories: List[InstanceOf[Category]] = []
-    sources: List[InstanceOf[Source]] = []
-    _next_image_id: int = 1
-    _next_annotation_id: int = 1
-    _next_source_id: int = 1
-    _category_mapper: Dict = {}
-
-    @model_validator(mode="after")
-    def _set_ids(self) -> CocoDataset:
-        self._next_image_id = len(self.images) + 1
-        self._next_annotation_id = len(self.annotations) + 1
-        self._next_source_id = len(self.sources)
-        self._category_mapper = self._get_category_mapper()
-        return self
-
-    def _get_category_mapper(self) -> Dict:
-        category_data = [(category.name, category.id) for category in self.categories]
-        category_mapper = dict(category_data) if category_data else {}
-        return category_mapper
+    images: List[Image] = []
+    annotations: List[Annotation] = []
+    categories: List[Category] = []
+    sources: List[Source] = []
+    next_image_id: int = Field(default=1, exclude=True)
+    next_annotation_id: int = Field(default=1, exclude=True)
+    next_source_id: int = Field(default=1, exclude=True)
+
+    @root_validator
+    def _set_ids(cls: CocoDataset, values: Dict[str, Any]) -> Dict[str, Any]:
+        values["next_image_id"] = len(values["images"]) + 1
+        values["next_annotation_id"] = len(values["annotations"]) + 1
+        values["next_source_id"] = len(values["sources"])
+        return values

    def add_annotation(self, annotation: Annotation) -> None:
        self.annotations.append(annotation)
-        self._next_annotation_id += 1
+        self.next_annotation_id += 1

    def add_image(self, image: Image) -> None:
        self.images.append(image)
-        self._next_image_id += 1
+        self.next_image_id += 1

    def add_source(self, source_path: pathlib.Path) -> None:
        sources = [ssrc for ssrc in self.sources if ssrc.file_name == source_path]
@@ -53,31 +46,78 @@ def add_source(self, source_path: pathlib.Path) -> None:
            self.sources.append(source)
            self.bump_version(bump_method="minor")

-        self._next_source_id = source.id
-
-    def add_categories(self, categories: np.ndarray) -> None:
-        # checking if categories are castable to str and under a certain size
-        categories = assert_valid_categories(categories=np.unique(categories))
-
-        # filtering existing categories
-        category_mask = np.isin(categories, list(self._category_mapper.keys()))
-        new_categories = categories[~category_mask]
-
-        # generating mapper from new categories
-        start = len(self._category_mapper.values()) + 1
-        end = start + new_categories.size
-        category_dict = dict(zip(new_categories, np.arange(start, end)))
+        self.next_source_id = source.id
+
+    def add_categories(
+        self,
+        category_ids: Optional[np.ndarray],
+        category_names: Optional[np.ndarray],
+        supercategory_names: Optional[np.ndarray],
+    ) -> None:
+        # initializing values
+        super_default = "1"
+        names_present = ids_present = False
+
+        # Loading all existing Category instances as a single dataframe
+        category_pd = pd.DataFrame(
+            [category.dict() for category in self.categories],
+            columns=Category.schema()["properties"].keys(),
+        )
+
+        # checking if names can be assigned to uid_array (used to check duplicates)
+        if isinstance(category_names, np.ndarray):
+            uid_array = category_names
+            uid_attribute = "name"
+            names_present = True
+
+        # checking if ids can be assigned to uid_array (used to check duplicates)
+        if isinstance(category_ids, np.ndarray):
+            uid_array = category_ids  # overrides existing array because ids are leading
+            uid_attribute = "id"
+            ids_present = True
+        if not names_present and not ids_present:
+            raise AttributeError("At least one category attribute must be present")
+
+        # masking out duplicate values and exiting if all duplicates
+        original_shape = uid_array.shape
+        _, indices = np.unique(uid_array, return_index=True)
+        uid_array = uid_array[indices]
+        member_mask = np.isin(uid_array, category_pd[uid_attribute])
+        new_members = uid_array[~member_mask]
+        new_shape = new_members.shape
+        if new_shape[0] == 0:
+            return
+
+        # creating default supercategory_names if not given
+        if not isinstance(supercategory_names, np.ndarray):
+            supercategory_names = np.full(shape=new_shape, fill_value=super_default)
+        else:
+            assert supercategory_names.shape == original_shape
+            supercategory_names = supercategory_names[indices][~member_mask]

+        # creating default category_names if not given (str version of ids)
+        if ids_present and not names_present:
+            category_names = new_members.astype(str)
+            category_ids = new_members
+        # creating ids if not given (incremental sequence starting from last known id)
+        elif names_present and not ids_present:
+            pandas_mask = category_pd[uid_attribute].isin(uid_array[member_mask])
+            max_id = category_pd.loc[pandas_mask, "id"].max()
+            start = np.nansum([max_id, 1])
+            end = start + new_members.size
+            category_ids = np.arange(start, end)
+            category_names = new_members
+        # ensuring equal size for category names and ids (if given)
+        else:
+            assert category_names.shape == original_shape  # type: ignore
+            category_names = category_names[indices][~member_mask]  # type: ignore
+            category_ids = new_members

-        # instance and append new Category objects to dataset
-        for category_name, category_id in category_dict.items():
-            category = Category(
-                id=category_id, name=str(category_name), supercategory="1"
-            )
+        # iteratively instancing and appending Category from set ids, names and supers
+        for cid, name, super in zip(category_ids, category_names, supercategory_names):
+            category = Category(id=cid, name=name, supercategory=super)
            self.categories.append(category)

-        # update existing category_mapper with new categories
-        self._category_mapper.update(category_dict)
-
    def bump_version(self, bump_method: str) -> None:
        bump_methods = ["patch", "minor", "major"]
        version = Version.parse(self.info.version)
@@ -98,18 +138,6 @@ def verify_new_output_dir(self, images_dir: pathlib.Path) -> None:
        if images_dir not in output_dirs:
            self.bump_version(bump_method="major")

-    @property
-    def next_image_id(self) -> int:
-        return self._next_image_id
-
-    @property
-    def next_annotation_id(self) -> int:
-        return self._next_annotation_id
-
-    @property
-    def next_source_id(self) -> int:
-        return self._next_source_id


class Info(BaseModel):
    version: str = str(Version(major=0))
@@ -120,7 +148,6 @@ class Info(BaseModel):


class Image(BaseModel):
-    model_config = ConfigDict(frozen=True)
    id: int
    width: int
    height: int
@@ -129,7 +156,6 @@ class Image(BaseModel):


class Annotation(BaseModel):
-    model_config = ConfigDict(frozen=True)
    id: int
    image_id: int
    category_id: int
@@ -140,7 +166,6 @@ class Annotation(BaseModel):


class Category(BaseModel):
-    model_config = ConfigDict(frozen=True)
    id: int
    name: str
    supercategory: str
@@ -154,3 +179,12 @@ class RleDict(TypedDict):
class Source(BaseModel):
id: int
file_name: pathlib.Path


# Call update_forward_refs() to resolve forward references (for pydantic <2.0.0)
CocoDataset.update_forward_refs()
Info.update_forward_refs()
Image.update_forward_refs()
Annotation.update_forward_refs()
Category.update_forward_refs()
Source.update_forward_refs()
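
A hedged sketch of the new (super)category mapping added in this commit (array values are made up; create_dataset comes from coco_manager.py above):

import numpy as np

from geococo.coco_manager import create_dataset

dataset = create_dataset(description="demo", contributor="geococo")
dataset.add_categories(
    category_ids=np.array([1, 2]),
    category_names=np.array(["building", "road"]),
    supercategory_names=np.array(["construction", "infrastructure"]),
)
# Duplicates are dropped, missing names fall back to str(id), missing ids
# continue from the last known id, and a missing supercategory defaults to "1".
assert [c.name for c in dataset.categories] == ["building", "road"]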