From 5d60b0681c3ae8abcba65e31ef433c04913a8fb7 Mon Sep 17 00:00:00 2001
From: Jasper
Date: Tue, 12 Sep 2023 23:38:40 +0200
Subject: [PATCH 1/8] Added option to specify category_attribute (defaults to
'category_id'). Also bumped package version.
---
geococo/coco_processing.py | 6 ++++--
pyproject.toml | 2 +-
2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/geococo/coco_processing.py b/geococo/coco_processing.py
index 3cfdc10..3406ad4 100644
--- a/geococo/coco_processing.py
+++ b/geococo/coco_processing.py
@@ -27,6 +27,7 @@ def labels_to_dataset(
src: DatasetReader,
labels: gpd.GeoDataFrame,
window_bounds: List[Tuple[int, int]],
+ category_attribute: str = "category_id"
) -> CocoDataset:
"""Move across a given geotiff, converting all intersecting labels to COCO
annotations and appending them to a COCODataset model. This is done through
@@ -50,6 +51,7 @@ def labels_to_dataset(
:param labels: GeoDataFrame containing labels and class_info
('category_id')
:param window_bounds: a list of window_bounds to attempt to use ()
+ :param category_attribute: Column containing category_id values
:return: The COCO dataset with appended Images and Annotations
"""
@@ -129,7 +131,7 @@ def labels_to_dataset(
# Iteratively add Annotation models to dataset (also bumps next_annotation_id)
with rasterio.open(window_image_path) as windowed_src:
- for _, window_label in window_labels.sort_values("category_id").iterrows():
+ for _, window_label in window_labels.sort_values(category_attribute).iterrows():
label_mask = mask_label(
input_raster=windowed_src, label=window_label.geometry
)
@@ -144,7 +146,7 @@ def labels_to_dataset(
annotation_instance = Annotation(
id=dataset.next_annotation_id,
image_id=dataset.next_image_id,
- category_id=window_label["category_id"],
+ category_id=window_label[category_attribute],
segmentation=rle, # type: ignore
area=area,
bbox=bounding_box,
diff --git a/pyproject.toml b/pyproject.toml
index 0f74671..579a0ca 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "geococo"
-version = "0.2.1"
+version = "0.3.0"
description = "Converts GIS annotations to Microsoft's Common Objects In Context (COCO) dataset format"
authors = ["Jasper "]
readme = "README.md"
From 4e652c70e81f24c24b12605521b9939f7da33513 Mon Sep 17 00:00:00 2001
From: Jasper
Date: Wed, 13 Sep 2023 00:32:30 +0200
Subject: [PATCH 2/8] Dropped 'Planned features' ref for brevity's sake (there are
too many planned features this early in development and I don't want to
update it every release)
---
README.md | 5 -----
1 file changed, 5 deletions(-)
diff --git a/README.md b/README.md
index bc36288..5e1be37 100644
--- a/README.md
+++ b/README.md
@@ -158,8 +158,3 @@ session = fo.launch_app(coco_dataset, port=5151)
-
-
-# Planned features
-- [QGIS plugin](https://github.com/jaspersiebring/geococo-qgis-plugin).
-- Data visualization with `pycocotool`'s plotting functionality
From c26527f8c3f1f05e07c51a5d2c44393fd90457dc Mon Sep 17 00:00:00 2001
From: Jasper
Date: Wed, 13 Sep 2023 00:59:46 +0200
Subject: [PATCH 3/8] labels_to_dataset now also adds Category instances to the
COCO dataset. The ids for these instances are set by a category_mapper which
is built from any existing categories in said dataset.
---
geococo/coco_models.py | 28 +++++++++++++++++++++++++++-
geococo/coco_processing.py | 10 +++++++---
2 files changed, 34 insertions(+), 4 deletions(-)
diff --git a/geococo/coco_models.py b/geococo/coco_models.py
index 6d38e98..69e93ff 100644
--- a/geococo/coco_models.py
+++ b/geococo/coco_models.py
@@ -2,8 +2,9 @@
import numpy as np
import pathlib
from datetime import datetime
-from typing import List, Optional
+from typing import List, Optional, Union, Dict
from typing_extensions import TypedDict
+from numpy.typing import ArrayLike
from pydantic import BaseModel, ConfigDict, InstanceOf, model_validator
from semver.version import Version
@@ -18,14 +19,21 @@ class CocoDataset(BaseModel):
_next_image_id: int = 1
_next_annotation_id: int = 1
_next_source_id: int = 1
+ _category_mapper: Dict = {}
@model_validator(mode="after")
def _set_ids(self) -> CocoDataset:
self._next_image_id = len(self.images) + 1
self._next_annotation_id = len(self.annotations) + 1
self._next_source_id = len(self.sources)
+ self._category_mapper = self._get_category_mapper()
return self
+ def _get_category_mapper(self) -> Dict:
+ category_data = [(category.name, category.id) for category in self.categories]
+ category_mapper = dict(category_data) if category_data else {}
+ return category_mapper
+
def add_annotation(self, annotation: Annotation) -> None:
self.annotations.append(annotation)
self._next_annotation_id += 1
@@ -47,6 +55,24 @@ def add_source(self, source_path: pathlib.Path) -> None:
self._next_source_id = source.id
+ def add_categories(self, categories: ArrayLike[Union[int, str]] np.ndarray) -> None:
+ # filtering existing categories
+ category_mask = np.isin(categories, self._category_mapper.keys())
+ new_categories = categories[~category_mask]
+
+ # generating mapper from new categories
+ start = len(self._category_mapper.values()) + 1
+ end = start + new_categories.size
+ category_dict = dict(zip(new_categories, np.arange(start, end)))
+
+ # instance and append new Category objects to dataset
+ for category_name, category_id in category_dict.items():
+ category = Category(id = category_id, name = category_name, supercategory="1")
+ self.categories.append(category)
+
+ # update existing category_mapper with new categories
+ self._category_mapper.update(category_dict)
+
def bump_version(self, bump_method: str) -> None:
bump_methods = ["patch", "minor", "major"]
version = Version.parse(self.info.version)
diff --git a/geococo/coco_processing.py b/geococo/coco_processing.py
index 3406ad4..d0205dd 100644
--- a/geococo/coco_processing.py
+++ b/geococo/coco_processing.py
@@ -54,7 +54,7 @@ def labels_to_dataset(
:param category_attribute: Column containing category_id values
:return: The COCO dataset with appended Images and Annotations
"""
-
+
# Setting nodata and estimating window configuration
parent_window = window_intersect(input_raster=src, input_vector=labels)
nodata_value = src.nodata if src.nodata else 0
@@ -68,6 +68,9 @@ def labels_to_dataset(
# bumps major version if images_dir has been used in this dataset before
dataset.verify_new_output_dir(images_dir=images_dir)
+
+ # sets dataset._category_mapper
+ dataset.add_categories(categories=labels[category_attribute].unique())
for child_window in tqdm(
window_factory(parent_window=parent_window, schema=schema), total=n_windows
@@ -142,11 +145,12 @@ def labels_to_dataset(
bounding_box = cv2.boundingRect(label_mask.astype(np.uint8))
area = np.sum(label_mask)
iscrowd = 1 if isinstance(window_label.geometry, MultiPolygon) else 0
-
+ category_id= dataset._category_mapper[window_label[category_attribute]]
+
annotation_instance = Annotation(
id=dataset.next_annotation_id,
image_id=dataset.next_image_id,
- category_id=window_label[category_attribute],
+ category_id=category_id,
segmentation=rle, # type: ignore
area=area,
bbox=bounding_box,
From 4dd4fd440ce5dd6fa2149e49e300c2d43be6b8f8 Mon Sep 17 00:00:00 2001
From: Jasper
Date: Wed, 13 Sep 2023 01:12:20 +0200
Subject: [PATCH 4/8] Added class_names attribute with string values to
gpd.GeoDataFrame objects returned by fixtures.
---
tests/conftest.py | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/tests/conftest.py b/tests/conftest.py
index 8abfcf5..fb921db 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -77,6 +77,8 @@ def overlapping_labels() -> gpd.GeoDataFrame:
crs = CRS.from_epsg(3857)
classes = [1, 2, 2, 5, 5]
+ class_names = ["One", "Two", "Two", "Five", "Five"]
+
points = [
Point(10, -10),
Point(30, -30),
@@ -88,7 +90,7 @@ def overlapping_labels() -> gpd.GeoDataFrame:
polygons = [p.buffer(distance=buffers[i]) for i, p in enumerate(points)]
labels = gpd.GeoDataFrame(
- geometry=polygons, data={"category_id": classes}, crs=crs
+ geometry=polygons, data={"category_id": classes, "class_names": class_names}, crs=crs
) # type: ignore
return labels
@@ -100,6 +102,7 @@ def nonoverlapping_labels() -> gpd.GeoDataFrame:
crs = CRS.from_epsg(3857)
classes = [1, 2, 2, 5, 5]
+ class_names = ["One", "Two", "Two", "Five", "Five"]
points = [
Point(510, -510),
Point(530, -530),
@@ -111,7 +114,7 @@ def nonoverlapping_labels() -> gpd.GeoDataFrame:
polygons = [p.buffer(distance=buffers[i]) for i, p in enumerate(points)]
labels = gpd.GeoDataFrame(
- geometry=polygons, data={"category_id": classes}, crs=crs
+ geometry=polygons, data={"category_id": classes, "class_names": class_names}, crs=crs
) # type: ignore
return labels
From db3e8f0c2bb05173493a0cd84ef6fd88f354a0af Mon Sep 17 00:00:00 2001
From: Jasper
Date: Wed, 13 Sep 2023 03:26:36 +0200
Subject: [PATCH 5/8] Added and refactored tests related to category_attribute
values.
---
geococo/coco_models.py | 11 +++++++----
geococo/coco_processing.py | 3 ++-
geococo/utils.py | 25 +++++++++++++++++++++++++
tests/test_coco_models.py | 29 ++++++++++++++++++++++++++++-
tests/test_coco_processing.py | 18 ++++++++++--------
tests/test_utils.py | 30 ++++++++++++++++++++++++++++++
6 files changed, 102 insertions(+), 14 deletions(-)
diff --git a/geococo/coco_models.py b/geococo/coco_models.py
index 69e93ff..77c2578 100644
--- a/geococo/coco_models.py
+++ b/geococo/coco_models.py
@@ -8,9 +8,9 @@
from pydantic import BaseModel, ConfigDict, InstanceOf, model_validator
from semver.version import Version
+from geococo.utils import assert_valid_categories
-
-class CocoDataset(BaseModel):
+class CocoDataset(BaseModel):
info: Info
images: List[InstanceOf[Image]] = []
annotations: List[InstanceOf[Annotation]] = []
@@ -55,7 +55,10 @@ def add_source(self, source_path: pathlib.Path) -> None:
self._next_source_id = source.id
- def add_categories(self, categories: ArrayLike[Union[int, str]] np.ndarray) -> None:
+ def add_categories(self, categories: np.ndarray) -> None:
+ # checking if categories are castable to str and under a certain size
+ categories = assert_valid_categories(categories=np.unique(categories))
+
# filtering existing categories
category_mask = np.isin(categories, self._category_mapper.keys())
new_categories = categories[~category_mask]
@@ -67,7 +70,7 @@ def add_categories(self, categories: ArrayLike[Union[int, str]] np.ndarray) -> N
# instance and append new Category objects to dataset
for category_name, category_id in category_dict.items():
- category = Category(id = category_id, name = category_name, supercategory="1")
+ category = Category(id = category_id, name = str(category_name), supercategory="1")
self.categories.append(category)
# update existing category_mapper with new categories
diff --git a/geococo/coco_processing.py b/geococo/coco_processing.py
index d0205dd..3d0b7bc 100644
--- a/geococo/coco_processing.py
+++ b/geococo/coco_processing.py
@@ -145,7 +145,8 @@ def labels_to_dataset(
bounding_box = cv2.boundingRect(label_mask.astype(np.uint8))
area = np.sum(label_mask)
iscrowd = 1 if isinstance(window_label.geometry, MultiPolygon) else 0
- category_id= dataset._category_mapper[window_label[category_attribute]]
+ category_name = str(window_label[category_attribute])
+ category_id = dataset._category_mapper[category_name]
annotation_instance = Annotation(
id=dataset.next_annotation_id,
diff --git a/geococo/utils.py b/geococo/utils.py
index 78f2d9e..10e2e8c 100644
--- a/geococo/utils.py
+++ b/geococo/utils.py
@@ -241,3 +241,28 @@ def estimate_schema(
) from last_exception
return schema
+
+
+def assert_valid_categories(categories: np.ndarray, max_dtype: str = " np.ndarray:
+ """
+ Checks if all elements in categories array can be represented by strings
+ of a certain length (defaults to None:
json_path = tmp_path / "dataset.json"
+ category_attribute = "category_id"
with rasterio.open(test_raster) as raster_source:
# Creating empty CocoDataset as input for labels_to_dataset
@@ -24,13 +25,13 @@ def test_labels_to_dataset_new_dataset(
src=raster_source,
labels=overlapping_labels,
window_bounds=[(256, 256)],
+ category_attribute=category_attribute
)
# Checking if output has correct classes
- unique_ann_ids = np.unique([ann.category_id for ann in dataset.annotations])
- assert np.all(
- np.isin(unique_ann_ids, overlapping_labels["category_id"].unique())
- )
+ dataset_class_names = np.array([cat.name for cat in dataset.categories])
+ labels_class_names = overlapping_labels[category_attribute].unique().astype(str)
+ assert np.all(np.isin(dataset_class_names, labels_class_names))
# Dumping to JSON
dst_json_data = dataset.model_dump_json()
@@ -51,6 +52,7 @@ def test_labels_to_dataset_append_dataset(
test_raster: pathlib.Path,
overlapping_labels: gpd.GeoDataFrame,
) -> None:
+ category_attribute = "category_id"
with rasterio.open(test_raster) as raster_source:
# Creating empty CocoDataset as input for labels_to_dataset
info = Info(version="0.0.1", date_created=datetime.now())
@@ -61,13 +63,13 @@ def test_labels_to_dataset_append_dataset(
src=raster_source,
labels=overlapping_labels,
window_bounds=[(256, 256)],
+ category_attribute=category_attribute
)
# Checking if output has correct classes
- unique_ann_ids = np.unique([ann.category_id for ann in dataset.annotations])
- assert np.all(
- np.isin(unique_ann_ids, overlapping_labels["category_id"].unique())
- )
+ dataset_class_names = np.array([cat.name for cat in dataset.categories])
+ labels_class_names = overlapping_labels[category_attribute].unique().astype(str)
+ assert np.all(np.isin(dataset_class_names, labels_class_names))
# Rerunning with existing CocoDataset to verify append
previous_dataset = dataset.copy(deep=True)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 81d1400..f8f8862 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -12,9 +12,11 @@
estimate_average_bounds,
estimate_schema,
mask_label,
+ assert_valid_categories
)
from geococo.window_schema import WindowSchema
import geopandas as gpd
+from string import ascii_lowercase
from typing import Tuple
@@ -280,3 +282,31 @@ def test_window_factory_boundless() -> None:
)
assert np.any(window_extents[:, 0] >= window.width)
assert np.any(window_extents[:, 1] >= window.height)
+
+
+
+def test_assert_valid_categories() -> None:
+ # almost all python objects can be represented by str so we just try casting and verify char length
+ category_lengths = [10, 49, 50]
+ random_words = [ "".join(np.random.choice(list(ascii_lowercase), cl)) for cl in category_lengths]
+ random_words = np.array(random_words)
+
+ _ = assert_valid_categories(random_words)
+
+ # float64
+ random_numbers = np.random.randn(3).astype(np.float64)
+ _ = assert_valid_categories(random_numbers)
+
+ # longer than
Date: Wed, 13 Sep 2023 03:33:45 +0200
Subject: [PATCH 6/8] Mypy fixes
---
geococo/coco_models.py | 2 +-
tests/test_utils.py | 6 ++----
2 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/geococo/coco_models.py b/geococo/coco_models.py
index 77c2578..603dee6 100644
--- a/geococo/coco_models.py
+++ b/geococo/coco_models.py
@@ -60,7 +60,7 @@ def add_categories(self, categories: np.ndarray) -> None:
categories = assert_valid_categories(categories=np.unique(categories))
# filtering existing categories
- category_mask = np.isin(categories, self._category_mapper.keys())
+ category_mask = np.isin(categories, list(self._category_mapper.keys()))
new_categories = categories[~category_mask]
# generating mapper from new categories
diff --git a/tests/test_utils.py b/tests/test_utils.py
index f8f8862..3cf79d4 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -288,8 +288,7 @@ def test_window_factory_boundless() -> None:
def test_assert_valid_categories() -> None:
# almost all python objects can be represented by str so we just try casting and verify char length
category_lengths = [10, 49, 50]
- random_words = [ "".join(np.random.choice(list(ascii_lowercase), cl)) for cl in category_lengths]
- random_words = np.array(random_words)
+ random_words = np.array(["".join(np.random.choice(list(ascii_lowercase), cl)) for cl in category_lengths])
_ = assert_valid_categories(random_words)
@@ -299,8 +298,7 @@ def test_assert_valid_categories() -> None:
# longer than
Date: Wed, 13 Sep 2023 03:34:41 +0200
Subject: [PATCH 7/8] Dropped legacy env.yml
---
full_environment.yml | 22 ----------------------
1 file changed, 22 deletions(-)
delete mode 100644 full_environment.yml
diff --git a/full_environment.yml b/full_environment.yml
deleted file mode 100644
index b90fa2d..0000000
--- a/full_environment.yml
+++ /dev/null
@@ -1,22 +0,0 @@
-name: geococo_dev
-channels:
- - conda-forge
- - fastai
-dependencies:
- - python
- - geopandas
- - rasterio
- - pydantic
- - opencv-python-headless
- - pycocotools
- - tqdm
- - pytest
- - mypy
- - black
- - ruff
- - pip
- - pip:
- - fiftyone
-platforms:
- - linux-64
- - win-64
\ No newline at end of file
From 707154b40df7a866d5552f93ecf57542af456c85 Mon Sep 17 00:00:00 2001
From: Jasper
Date: Wed, 13 Sep 2023 03:41:29 +0200
Subject: [PATCH 8/8] Formatters (black, docformatter, ruff)
---
geococo/coco_models.py | 14 +++---
geococo/coco_processing.py | 45 +++++++++---------
geococo/utils.py | 93 +++++++++++++++++---------------------
tests/conftest.py | 8 +++-
tests/test_coco_models.py | 30 ++++++------
tests/test_utils.py | 32 +++++++------
6 files changed, 111 insertions(+), 111 deletions(-)
diff --git a/geococo/coco_models.py b/geococo/coco_models.py
index 603dee6..ee04052 100644
--- a/geococo/coco_models.py
+++ b/geococo/coco_models.py
@@ -2,15 +2,15 @@
import numpy as np
import pathlib
from datetime import datetime
-from typing import List, Optional, Union, Dict
+from typing import List, Optional, Dict
from typing_extensions import TypedDict
-from numpy.typing import ArrayLike
from pydantic import BaseModel, ConfigDict, InstanceOf, model_validator
from semver.version import Version
from geococo.utils import assert_valid_categories
-class CocoDataset(BaseModel):
+
+class CocoDataset(BaseModel):
info: Info
images: List[InstanceOf[Image]] = []
annotations: List[InstanceOf[Annotation]] = []
@@ -58,8 +58,8 @@ def add_source(self, source_path: pathlib.Path) -> None:
def add_categories(self, categories: np.ndarray) -> None:
# checking if categories are castable to str and under a certain size
categories = assert_valid_categories(categories=np.unique(categories))
-
- # filtering existing categories
+
+ # filtering existing categories
category_mask = np.isin(categories, list(self._category_mapper.keys()))
new_categories = categories[~category_mask]
@@ -70,7 +70,9 @@ def add_categories(self, categories: np.ndarray) -> None:
# instance and append new Category objects to dataset
for category_name, category_id in category_dict.items():
- category = Category(id = category_id, name = str(category_name), supercategory="1")
+ category = Category(
+ id=category_id, name=str(category_name), supercategory="1"
+ )
self.categories.append(category)
# update existing category_mapper with new categories
diff --git a/geococo/coco_processing.py b/geococo/coco_processing.py
index 3d0b7bc..ef24802 100644
--- a/geococo/coco_processing.py
+++ b/geococo/coco_processing.py
@@ -27,34 +27,29 @@ def labels_to_dataset(
src: DatasetReader,
labels: gpd.GeoDataFrame,
window_bounds: List[Tuple[int, int]],
- category_attribute: str = "category_id"
+ category_attribute: str = "category_id",
) -> CocoDataset:
"""Move across a given geotiff, converting all intersecting labels to COCO
annotations and appending them to a COCODataset model. This is done through
- rasterio.Window objects, the bounds of which you can set with window_bounds
- (also determines the size of the output images associated with the
- Annotation instances). The degree of overlap between these windows is
- determined by the dimensions of the given labels to maximize representation
- in the resulting dataset.
-
- The "iscrowd" attribute (see
- https://cocodataset.org/#format-data)
- is determined by whether the respective labels are Polygon or
- MultiPolygon instances. The "category_id" attribute, which
- represents class or category identifiers, is expected to be present
- in the given labels GeoDataFrame under the same name.
-
- :param dataset: CocoDataset model to append images and annotations
- to
+ rasterio.Window objects, the bounds of which you can set with window_bounds (also
+ determines the size of the output images associated with the Annotation instances).
+ The degree of overlap between these windows is determined by the dimensions of the
+ given labels to maximize representation in the resulting dataset.
+
+ The "iscrowd" attribute (see https://cocodataset.org/#format-data) is determined by
+ whether the respective labels are Polygon or MultiPolygon instances. The
+ "category_id" attribute, which represents class or category identifiers, is
+ expected to be present in the given labels GeoDataFrame under the same name.
+
+ :param dataset: CocoDataset model to append images and annotations to
:param images_dir: output directory for all label images
:param src: open rasterio reader for input raster
- :param labels: GeoDataFrame containing labels and class_info
- ('category_id')
+ :param labels: GeoDataFrame containing labels and class_info ('category_id')
:param window_bounds: a list of window_bounds to attempt to use ()
:param category_attribute: Column containing category_id values
:return: The COCO dataset with appended Images and Annotations
"""
-
+
# Setting nodata and estimating window configuration
parent_window = window_intersect(input_raster=src, input_vector=labels)
nodata_value = src.nodata if src.nodata else 0
@@ -62,16 +57,16 @@ def labels_to_dataset(
coco_profile.update({"dtype": np.uint8, "nodata": nodata_value, "driver": "JPEG"})
schema = estimate_schema(gdf=labels, src=src, window_bounds=window_bounds)
n_windows = generate_window_offsets(window=parent_window, schema=schema).shape[0]
-
+
# sets dataset.next_source_id and possibly bumps minor version
dataset.add_source(source_path=pathlib.Path(src.name))
-
+
# bumps major version if images_dir has been used in this dataset before
dataset.verify_new_output_dir(images_dir=images_dir)
# sets dataset._category_mapper
dataset.add_categories(categories=labels[category_attribute].unique())
-
+
for child_window in tqdm(
window_factory(parent_window=parent_window, schema=schema), total=n_windows
):
@@ -134,7 +129,9 @@ def labels_to_dataset(
# Iteratively add Annotation models to dataset (also bumps next_annotation_id)
with rasterio.open(window_image_path) as windowed_src:
- for _, window_label in window_labels.sort_values(category_attribute).iterrows():
+ for _, window_label in window_labels.sort_values(
+ category_attribute
+ ).iterrows():
label_mask = mask_label(
input_raster=windowed_src, label=window_label.geometry
)
@@ -147,7 +144,7 @@ def labels_to_dataset(
iscrowd = 1 if isinstance(window_label.geometry, MultiPolygon) else 0
category_name = str(window_label[category_attribute])
category_id = dataset._category_mapper[category_name]
-
+
annotation_instance = Annotation(
id=dataset.next_annotation_id,
image_id=dataset.next_image_id,
diff --git a/geococo/utils.py b/geococo/utils.py
index 10e2e8c..53948d2 100644
--- a/geococo/utils.py
+++ b/geococo/utils.py
@@ -13,14 +13,11 @@
def mask_label(
input_raster: DatasetReader, label: Union[Polygon, MultiPolygon]
) -> np.ndarray:
- """Masks out an label from input_raster and flattens it to a 2D binary
- array. If it doesn't overlap, the resulting mask will only consist of False
- bools.
-
- :param input_raster: open rasterio DatasetReader for the input
- raster
- :param label: Polygon object representing the area to be masked
- (i.e. label)
+ """Masks out a label from input_raster and flattens it to a 2D binary array. If it
+ doesn't overlap, the resulting mask will only consist of False bools.
+
+ :param input_raster: open rasterio DatasetReader for the input raster
+ :param label: Polygon object representing the area to be masked (i.e. label)
:return: A 2D binary array representing the label
"""
@@ -39,14 +36,12 @@ def mask_label(
def window_intersect(
input_raster: DatasetReader, input_vector: gpd.GeoDataFrame
) -> Window:
- """Generates a Rasterio Window from the intersecting extents of the input
- data. It also verifies if the input data share the same CRS and if they
- physically overlap.
+ """Generates a Rasterio Window from the intersecting extents of the input data. It
+ also verifies if the input data share the same CRS and if they physically overlap.
:param input_raster: rasterio dataset (i.e. input image)
:param input_vector: geopandas geodataframe (i.e. input labels)
- :return: rasterio window that represent the intersection between
- input data extents
+ :return: rasterio window that represents the intersection between input data extents
"""
if input_vector.crs != input_raster.crs:
@@ -73,13 +68,11 @@ def window_intersect(
def reshape_image(
img_array: np.ndarray, shape: Tuple[int, int, int], padding_value: int = 0
) -> np.ndarray:
- """Reshapes 3D numpy array to match given 3D shape, done through slicing or
- padding.
+ """Reshapes 3D numpy array to match given 3D shape, done through slicing or padding.
:param img_array: the numpy array to be reshaped
:param shape: the desired shape (bands, rows, cols)
- :param padding_value: what value to pad img_array with (if too
- small)
+ :param padding_value: what value to pad img_array with (if too small)
:return: numpy array in desired shape
"""
@@ -98,14 +91,14 @@ def reshape_image(
def generate_window_polygon(datasource: DatasetReader, window: Window) -> Polygon:
- """Turns the spatial bounds of a given window to a shapely.Polygon object
- in a given dataset's CRS.
+ """Turns the spatial bounds of a given window to a shapely.Polygon object in a given
+ dataset's CRS.
- :param datasource: a rasterio DatasetReader object that provides the
- affine transformation
+ :param datasource: a rasterio DatasetReader object that provides the affine
+ transformation
:param window: bounds to represent as Polygon
- :return: shapely Polygon representing the spatial bounds of a given
- window in a given CRS
+ :return: shapely Polygon representing the spatial bounds of a given window in a
+ given CRS
"""
window_transform = datasource.window_transform(window)
@@ -117,8 +110,7 @@ def generate_window_polygon(datasource: DatasetReader, window: Window) -> Polygo
def generate_window_offsets(window: Window, schema: WindowSchema) -> np.ndarray:
"""Computes an array of window offsets bound by a given window.
- :param window: the bounding window (i.e. offsets will be within its
- bounds)
+ :param window: the bounding window (i.e. offsets will be within its bounds)
:param schema: the parameters for the window generator
:return: an array of window offsets within the bounds of window
"""
@@ -143,14 +135,14 @@ def generate_window_offsets(window: Window, schema: WindowSchema) -> np.ndarray:
def window_factory(
parent_window: Window, schema: WindowSchema, boundless: bool = True
) -> Generator[Window, None, None]:
- """Generator that produces rasterio.Window objects in predetermined steps,
- within the given Window.
+ """Generator that produces rasterio.Window objects in predetermined steps, within
+ the given Window.
- :param parent_window: the window that provides the bounds for all
- child_window objects
+ :param parent_window: the window that provides the bounds for all child_window
+ objects
:param schema: the parameters that determine the window steps
- :param boundless: whether the child_window should be clipped by the
- parent_window or not
+ :param boundless: whether the child_window should be clipped by the parent_window or
+ not
:yield: a rasterio.Window used for windowed reading/writing
"""
@@ -174,8 +166,7 @@ def estimate_average_bounds(
) -> Tuple[float, float]:
"""Estimates the average size of all features in a GeoDataFrame.
- :param gdf: GeoDataFrame that contains all features (i.e.
- shapely.Geometry objects)
+ :param gdf: GeoDataFrame that contains all features (i.e. shapely.Geometry objects)
:param quantile: what quantile will represent the feature population
:return: a tuple of floats representing average width and height
"""
@@ -195,19 +186,16 @@ def estimate_schema(
quantile: float = 0.9,
window_bounds: List[Tuple[int, int]] = [(256, 256), (512, 512)],
) -> WindowSchema:
- """Attempts to find a schema that is able to represent the average
- GeoDataFrame feature (i.e. sufficient overlap) but within the bounds given
- by window_bounds.
-
- :param gdf: GeoDataFrame that contains features that determine the
- degree of overlap
- :param src: The rasterio DataSource associated with the resulting
- schema (i.e. bounds and pixelsizes)
+ """Attempts to find a schema that is able to represent the average GeoDataFrame
+ feature (i.e. sufficient overlap) but within the bounds given by window_bounds.
+
+ :param gdf: GeoDataFrame that contains features that determine the degree of overlap
+ :param src: The rasterio DataSource associated with the resulting schema (i.e.
+ bounds and pixelsizes)
:param quantile: what quantile will represent the feature population
- :param window_bounds: a list of possible limits for the window
- generators
- :return: (if found) a viable WindowSchema with sufficient overlap
- within the window_bounds
+ :param window_bounds: a list of possible limits for the window generators
+ :return: (if found) a viable WindowSchema with sufficient overlap within the
+ window_bounds
"""
# estimating the required overlap between windows for labels to be represented fully
@@ -243,10 +231,11 @@ def estimate_schema(
return schema
-def assert_valid_categories(categories: np.ndarray, max_dtype: str = " np.ndarray:
- """
- Checks if all elements in categories array can be represented by strings
- of a certain length (defaults to np.ndarray:
+ """Checks if all elements in categories array can be represented by strings of a
+ certain length (defaults to
str_categories = categories.astype(str)
except Exception as e:
raise ValueError("Category values need to be castable to str") from e
-
+
# checking if categories can be castable to str of a certain length (e.g. gpd.GeoDataFrame:
polygons = [p.buffer(distance=buffers[i]) for i, p in enumerate(points)]
labels = gpd.GeoDataFrame(
- geometry=polygons, data={"category_id": classes, "class_names": class_names}, crs=crs
+ geometry=polygons,
+ data={"category_id": classes, "class_names": class_names},
+ crs=crs,
) # type: ignore
return labels
@@ -114,7 +116,9 @@ def nonoverlapping_labels() -> gpd.GeoDataFrame:
polygons = [p.buffer(distance=buffers[i]) for i, p in enumerate(points)]
labels = gpd.GeoDataFrame(
- geometry=polygons, data={"category_id": classes, "class_names": class_names}, crs=crs
+ geometry=polygons,
+ data={"category_id": classes, "class_names": class_names},
+ crs=crs,
) # type: ignore
return labels
diff --git a/tests/test_coco_models.py b/tests/test_coco_models.py
index 12aa3e7..a7c8e55 100644
--- a/tests/test_coco_models.py
+++ b/tests/test_coco_models.py
@@ -2,8 +2,6 @@
import numpy as np
import pathlib
from datetime import datetime
-import geopandas as gpd
-import pytest
from geococo.coco_models import (
CocoDataset,
Info,
@@ -62,7 +60,7 @@ def test_dataset_add_images():
dataset = CocoDataset(info=Info())
assert dataset.next_annotation_id == 1
assert dataset.next_image_id == 1
-
+
n_images = np.random.randint(2, 10)
for _ in range(n_images):
@@ -78,6 +76,7 @@ def test_dataset_add_images():
assert n_images == dataset.next_image_id - 1
assert n_images == len(dataset.images)
+
def test_info():
"""Simple instance test."""
@@ -132,9 +131,9 @@ def test_source():
def test_dataset_add_sources():
- """Checks proper incrementation of source_id"""
+ """Checks proper incrementation of source_id."""
- # Bit different from the other ids since we check for duplication
+ # Bit different from the other ids since we check for duplication
# and only increment if new
dataset = CocoDataset(info=Info())
assert dataset.next_source_id == 0
@@ -147,11 +146,11 @@ def test_dataset_add_sources():
def test_dataset_versions():
- """Checks proper incrementation of dataset versions"""
+ """Checks proper incrementation of dataset versions."""
dataset = CocoDataset(info=Info())
assert dataset.info.version == "0.0.0"
-
+
# minor bump if same output_dir but different raster_source
dataset.add_source(source_path=pathlib.Path("a"))
assert dataset.info.version == "0.1.0"
@@ -164,13 +163,14 @@ def test_dataset_versions():
dataset.verify_new_output_dir(images_dir=pathlib.Path("b"))
assert dataset.info.version == "1.0.0"
+
def test_add_categories():
- """Checks independent mapping of category_attribute to class_ids"""
+ """Checks independent mapping of category_attribute to class_ids."""
dataset = CocoDataset(info=Info())
assert dataset.categories == []
assert dataset._category_mapper == {}
-
+
# adding three unique classes
categories = np.array(["A", "B", "B", "E", "E"])
dataset.add_categories(categories=categories)
@@ -179,12 +179,16 @@ def test_add_categories():
assert np.unique(categories).size == len(dataset._category_mapper)
assert np.unique(categories).size == len(dataset.categories)
assert np.all(np.diff(list(dataset._category_mapper.values())) == 1)
-
- #check if existing key value pairs don't change
+
+ # check if existing key value pairs don't change
# done by adding a bunch of existing classes and one new one
initial_mapper = dataset._category_mapper.copy()
categories = np.array(["One", "Two", "Two", "Five", "Five", "Six", "Six"])
dataset.add_categories(categories=categories)
- subset_mapper = {key: value for key, value in dataset._category_mapper.items() if key in initial_mapper.keys()}
+ subset_mapper = {
+ key: value
+ for key, value in dataset._category_mapper.items()
+ if key in initial_mapper.keys()
+ }
assert initial_mapper == subset_mapper
- assert np.all(np.diff(list(dataset._category_mapper.values())) == 1)
\ No newline at end of file
+ assert np.all(np.diff(list(dataset._category_mapper.values())) == 1)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 3cf79d4..9198e35 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -12,7 +12,7 @@
estimate_average_bounds,
estimate_schema,
mask_label,
- assert_valid_categories
+ assert_valid_categories,
)
from geococo.window_schema import WindowSchema
import geopandas as gpd
@@ -57,7 +57,7 @@ def test_window_intersect(
overlapping_labels: gpd.GeoDataFrame, test_raster: pathlib.Path
) -> None:
with rasterio.open(test_raster) as raster_source:
- # outer polygons have diameter of 2 so resulting window
+ # outer polygons have diameter of 2 so resulting window
# is 82x82 and offset is 9 (10-2/2)
window = window_intersect(
input_raster=raster_source, input_vector=overlapping_labels
@@ -284,27 +284,31 @@ def test_window_factory_boundless() -> None:
assert np.any(window_extents[:, 1] >= window.height)
-
def test_assert_valid_categories() -> None:
- # almost all python objects can be represented by str so we just try casting and verify char length
+ # almost all python objects can be represented by str
+ # so we just try casting and verify char length
category_lengths = [10, 49, 50]
- random_words = np.array(["".join(np.random.choice(list(ascii_lowercase), cl)) for cl in category_lengths])
-
+ random_words = np.array(
+ [
+ "".join(np.random.choice(list(ascii_lowercase), cl))
+ for cl in category_lengths
+ ]
+ )
+
_ = assert_valid_categories(random_words)
- # float64
+ # float64
random_numbers = np.random.randn(3).astype(np.float64)
_ = assert_valid_categories(random_numbers)
# longer than