From 5d60b0681c3ae8abcba65e31ef433c04913a8fb7 Mon Sep 17 00:00:00 2001 From: Jasper Date: Tue, 12 Sep 2023 23:38:40 +0200 Subject: [PATCH 1/8] Added option to specify category_attribute (defaults to 'category_id'). Also bumped package version. --- geococo/coco_processing.py | 6 ++++-- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/geococo/coco_processing.py b/geococo/coco_processing.py index 3cfdc10..3406ad4 100644 --- a/geococo/coco_processing.py +++ b/geococo/coco_processing.py @@ -27,6 +27,7 @@ def labels_to_dataset( src: DatasetReader, labels: gpd.GeoDataFrame, window_bounds: List[Tuple[int, int]], + category_attribute: str = "category_id" ) -> CocoDataset: """Move across a given geotiff, converting all intersecting labels to COCO annotations and appending them to a COCODataset model. This is done through @@ -50,6 +51,7 @@ def labels_to_dataset( :param labels: GeoDataFrame containing labels and class_info ('category_id') :param window_bounds: a list of window_bounds to attempt to use () + :param category_attribute: Column containing category_id values :return: The COCO dataset with appended Images and Annotations """ @@ -129,7 +131,7 @@ def labels_to_dataset( # Iteratively add Annotation models to dataset (also bumps next_annotation_id) with rasterio.open(window_image_path) as windowed_src: - for _, window_label in window_labels.sort_values("category_id").iterrows(): + for _, window_label in window_labels.sort_values(category_attribute).iterrows(): label_mask = mask_label( input_raster=windowed_src, label=window_label.geometry ) @@ -144,7 +146,7 @@ def labels_to_dataset( annotation_instance = Annotation( id=dataset.next_annotation_id, image_id=dataset.next_image_id, - category_id=window_label["category_id"], + category_id=window_label[category_attribute], segmentation=rle, # type: ignore area=area, bbox=bounding_box, diff --git a/pyproject.toml b/pyproject.toml index 0f74671..579a0ca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "geococo" -version = "0.2.1" +version = "0.3.0" description = "Converts GIS annotations to Microsoft's Common Objects In Context (COCO) dataset format" authors = ["Jasper "] readme = "README.md" From 4e652c70e81f24c24b12605521b9939f7da33513 Mon Sep 17 00:00:00 2001 From: Jasper Date: Wed, 13 Sep 2023 00:32:30 +0200 Subject: [PATCH 2/8] Dropped 'Planned features' ref for brevity sake (there's too many planned features this early in development and I don't want to update it every release) --- README.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/README.md b/README.md index bc36288..5e1be37 100644 --- a/README.md +++ b/README.md @@ -158,8 +158,3 @@ session = fo.launch_app(coco_dataset, port=5151)

- - -# Planned features -- [QGIS plugin](https://github.com/jaspersiebring/geococo-qgis-plugin). -- Data visualization with `pycocotool`'s plotting functionality From c26527f8c3f1f05e07c51a5d2c44393fd90457dc Mon Sep 17 00:00:00 2001 From: Jasper Date: Wed, 13 Sep 2023 00:59:46 +0200 Subject: [PATCH 3/8] labels_to_dataset now also adds Category instances to the COCO dataset. The ids for these instances are set by a category_mapper which is built from any existing categories in said dataset. --- geococo/coco_models.py | 28 +++++++++++++++++++++++++++- geococo/coco_processing.py | 10 +++++++--- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/geococo/coco_models.py b/geococo/coco_models.py index 6d38e98..69e93ff 100644 --- a/geococo/coco_models.py +++ b/geococo/coco_models.py @@ -2,8 +2,9 @@ import numpy as np import pathlib from datetime import datetime -from typing import List, Optional +from typing import List, Optional, Union, Dict from typing_extensions import TypedDict +from numpy.typing import ArrayLike from pydantic import BaseModel, ConfigDict, InstanceOf, model_validator from semver.version import Version @@ -18,14 +19,21 @@ class CocoDataset(BaseModel): _next_image_id: int = 1 _next_annotation_id: int = 1 _next_source_id: int = 1 + _category_mapper: Dict = {} @model_validator(mode="after") def _set_ids(self) -> CocoDataset: self._next_image_id = len(self.images) + 1 self._next_annotation_id = len(self.annotations) + 1 self._next_source_id = len(self.sources) + self._category_mapper = self._get_category_mapper() return self + def _get_category_mapper(self) -> Dict: + category_data = [(category.name, category.id) for category in self.categories] + category_mapper = dict(category_data) if category_data else {} + return category_mapper + def add_annotation(self, annotation: Annotation) -> None: self.annotations.append(annotation) self._next_annotation_id += 1 @@ -47,6 +55,24 @@ def add_source(self, source_path: pathlib.Path) -> None: self._next_source_id = source.id + def add_categories(self, categories: ArrayLike[Union[int, str]] np.ndarray) -> None: + # filtering existing categories + category_mask = np.isin(categories, self._category_mapper.keys()) + new_categories = categories[~category_mask] + + # generating mapper from new categories + start = len(self._category_mapper.values()) + 1 + end = start + new_categories.size + category_dict = dict(zip(new_categories, np.arange(start, end))) + + # instance and append new Category objects to dataset + for category_name, category_id in category_dict.items(): + category = Category(id = category_id, name = category_name, supercategory="1") + self.categories.append(category) + + # update existing category_mapper with new categories + self._category_mapper.update(category_dict) + def bump_version(self, bump_method: str) -> None: bump_methods = ["patch", "minor", "major"] version = Version.parse(self.info.version) diff --git a/geococo/coco_processing.py b/geococo/coco_processing.py index 3406ad4..d0205dd 100644 --- a/geococo/coco_processing.py +++ b/geococo/coco_processing.py @@ -54,7 +54,7 @@ def labels_to_dataset( :param category_attribute: Column containing category_id values :return: The COCO dataset with appended Images and Annotations """ - + # Setting nodata and estimating window configuration parent_window = window_intersect(input_raster=src, input_vector=labels) nodata_value = src.nodata if src.nodata else 0 @@ -68,6 +68,9 @@ def labels_to_dataset( # bumps major version if images_dir has been used in this dataset before dataset.verify_new_output_dir(images_dir=images_dir) + + # sets dataset._category_mapper + dataset.add_categories(categories=labels[category_attribute].unique()) for child_window in tqdm( window_factory(parent_window=parent_window, schema=schema), total=n_windows @@ -142,11 +145,12 @@ def labels_to_dataset( bounding_box = cv2.boundingRect(label_mask.astype(np.uint8)) area = np.sum(label_mask) iscrowd = 1 if isinstance(window_label.geometry, MultiPolygon) else 0 - + category_id= dataset._category_mapper[window_label[category_attribute]] + annotation_instance = Annotation( id=dataset.next_annotation_id, image_id=dataset.next_image_id, - category_id=window_label[category_attribute], + category_id=category_id, segmentation=rle, # type: ignore area=area, bbox=bounding_box, From 4dd4fd440ce5dd6fa2149e49e300c2d43be6b8f8 Mon Sep 17 00:00:00 2001 From: Jasper Date: Wed, 13 Sep 2023 01:12:20 +0200 Subject: [PATCH 4/8] Added class_names attribute with string values to gpd.GeoDataFrame objects returned by fixtures. --- tests/conftest.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 8abfcf5..fb921db 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -77,6 +77,8 @@ def overlapping_labels() -> gpd.GeoDataFrame: crs = CRS.from_epsg(3857) classes = [1, 2, 2, 5, 5] + class_names = ["One", "Two", "Two", "Five", "Five"] + points = [ Point(10, -10), Point(30, -30), @@ -88,7 +90,7 @@ def overlapping_labels() -> gpd.GeoDataFrame: polygons = [p.buffer(distance=buffers[i]) for i, p in enumerate(points)] labels = gpd.GeoDataFrame( - geometry=polygons, data={"category_id": classes}, crs=crs + geometry=polygons, data={"category_id": classes, "class_names": class_names}, crs=crs ) # type: ignore return labels @@ -100,6 +102,7 @@ def nonoverlapping_labels() -> gpd.GeoDataFrame: crs = CRS.from_epsg(3857) classes = [1, 2, 2, 5, 5] + class_names = ["One", "Two", "Two", "Five", "Five"] points = [ Point(510, -510), Point(530, -530), @@ -111,7 +114,7 @@ def nonoverlapping_labels() -> gpd.GeoDataFrame: polygons = [p.buffer(distance=buffers[i]) for i, p in enumerate(points)] labels = gpd.GeoDataFrame( - geometry=polygons, data={"category_id": classes}, crs=crs + geometry=polygons, data={"category_id": classes, "class_names": class_names}, crs=crs ) # type: ignore return labels From db3e8f0c2bb05173493a0cd84ef6fd88f354a0af Mon Sep 17 00:00:00 2001 From: Jasper Date: Wed, 13 Sep 2023 03:26:36 +0200 Subject: [PATCH 5/8] Added and refactored tests related to category_attribute values. --- geococo/coco_models.py | 11 +++++++---- geococo/coco_processing.py | 3 ++- geococo/utils.py | 25 +++++++++++++++++++++++++ tests/test_coco_models.py | 29 ++++++++++++++++++++++++++++- tests/test_coco_processing.py | 18 ++++++++++-------- tests/test_utils.py | 30 ++++++++++++++++++++++++++++++ 6 files changed, 102 insertions(+), 14 deletions(-) diff --git a/geococo/coco_models.py b/geococo/coco_models.py index 69e93ff..77c2578 100644 --- a/geococo/coco_models.py +++ b/geococo/coco_models.py @@ -8,9 +8,9 @@ from pydantic import BaseModel, ConfigDict, InstanceOf, model_validator from semver.version import Version +from geococo.utils import assert_valid_categories - -class CocoDataset(BaseModel): +class CocoDataset(BaseModel): info: Info images: List[InstanceOf[Image]] = [] annotations: List[InstanceOf[Annotation]] = [] @@ -55,7 +55,10 @@ def add_source(self, source_path: pathlib.Path) -> None: self._next_source_id = source.id - def add_categories(self, categories: ArrayLike[Union[int, str]] np.ndarray) -> None: + def add_categories(self, categories: np.ndarray) -> None: + # checking if categories are castable to str and under a certain size + categories = assert_valid_categories(categories=np.unique(categories)) + # filtering existing categories category_mask = np.isin(categories, self._category_mapper.keys()) new_categories = categories[~category_mask] @@ -67,7 +70,7 @@ def add_categories(self, categories: ArrayLike[Union[int, str]] np.ndarray) -> N # instance and append new Category objects to dataset for category_name, category_id in category_dict.items(): - category = Category(id = category_id, name = category_name, supercategory="1") + category = Category(id = category_id, name = str(category_name), supercategory="1") self.categories.append(category) # update existing category_mapper with new categories diff --git a/geococo/coco_processing.py b/geococo/coco_processing.py index d0205dd..3d0b7bc 100644 --- a/geococo/coco_processing.py +++ b/geococo/coco_processing.py @@ -145,7 +145,8 @@ def labels_to_dataset( bounding_box = cv2.boundingRect(label_mask.astype(np.uint8)) area = np.sum(label_mask) iscrowd = 1 if isinstance(window_label.geometry, MultiPolygon) else 0 - category_id= dataset._category_mapper[window_label[category_attribute]] + category_name = str(window_label[category_attribute]) + category_id = dataset._category_mapper[category_name] annotation_instance = Annotation( id=dataset.next_annotation_id, diff --git a/geococo/utils.py b/geococo/utils.py index 78f2d9e..10e2e8c 100644 --- a/geococo/utils.py +++ b/geococo/utils.py @@ -241,3 +241,28 @@ def estimate_schema( ) from last_exception return schema + + +def assert_valid_categories(categories: np.ndarray, max_dtype: str = " np.ndarray: + """ + Checks if all elements in categories array can be represented by strings + of a certain length (defaults to None: json_path = tmp_path / "dataset.json" + category_attribute = "category_id" with rasterio.open(test_raster) as raster_source: # Creating empty CocoDataset as input for labels_to_dataset @@ -24,13 +25,13 @@ def test_labels_to_dataset_new_dataset( src=raster_source, labels=overlapping_labels, window_bounds=[(256, 256)], + category_attribute=category_attribute ) # Checking if output has correct classes - unique_ann_ids = np.unique([ann.category_id for ann in dataset.annotations]) - assert np.all( - np.isin(unique_ann_ids, overlapping_labels["category_id"].unique()) - ) + dataset_class_names = np.array([cat.name for cat in dataset.categories]) + labels_class_names = overlapping_labels[category_attribute].unique().astype(str) + assert np.all(np.isin(dataset_class_names, labels_class_names)) # Dumping to JSON dst_json_data = dataset.model_dump_json() @@ -51,6 +52,7 @@ def test_labels_to_dataset_append_dataset( test_raster: pathlib.Path, overlapping_labels: gpd.GeoDataFrame, ) -> None: + category_attribute = "category_id" with rasterio.open(test_raster) as raster_source: # Creating empty CocoDataset as input for labels_to_dataset info = Info(version="0.0.1", date_created=datetime.now()) @@ -61,13 +63,13 @@ def test_labels_to_dataset_append_dataset( src=raster_source, labels=overlapping_labels, window_bounds=[(256, 256)], + category_attribute=category_attribute ) # Checking if output has correct classes - unique_ann_ids = np.unique([ann.category_id for ann in dataset.annotations]) - assert np.all( - np.isin(unique_ann_ids, overlapping_labels["category_id"].unique()) - ) + dataset_class_names = np.array([cat.name for cat in dataset.categories]) + labels_class_names = overlapping_labels[category_attribute].unique().astype(str) + assert np.all(np.isin(dataset_class_names, labels_class_names)) # Rerunning with existing CocoDataset to verify append previous_dataset = dataset.copy(deep=True) diff --git a/tests/test_utils.py b/tests/test_utils.py index 81d1400..f8f8862 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -12,9 +12,11 @@ estimate_average_bounds, estimate_schema, mask_label, + assert_valid_categories ) from geococo.window_schema import WindowSchema import geopandas as gpd +from string import ascii_lowercase from typing import Tuple @@ -280,3 +282,31 @@ def test_window_factory_boundless() -> None: ) assert np.any(window_extents[:, 0] >= window.width) assert np.any(window_extents[:, 1] >= window.height) + + + +def test_assert_valid_categories() -> None: + # almost all python objects can be represented by str so we just try casting and verify char length + category_lengths = [10, 49, 50] + random_words = [ "".join(np.random.choice(list(ascii_lowercase), cl)) for cl in category_lengths] + random_words = np.array(random_words) + + _ = assert_valid_categories(random_words) + + # float64 + random_numbers = np.random.randn(3).astype(np.float64) + _ = assert_valid_categories(random_numbers) + + # longer than Date: Wed, 13 Sep 2023 03:33:45 +0200 Subject: [PATCH 6/8] Mypy fixes --- geococo/coco_models.py | 2 +- tests/test_utils.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/geococo/coco_models.py b/geococo/coco_models.py index 77c2578..603dee6 100644 --- a/geococo/coco_models.py +++ b/geococo/coco_models.py @@ -60,7 +60,7 @@ def add_categories(self, categories: np.ndarray) -> None: categories = assert_valid_categories(categories=np.unique(categories)) # filtering existing categories - category_mask = np.isin(categories, self._category_mapper.keys()) + category_mask = np.isin(categories, list(self._category_mapper.keys())) new_categories = categories[~category_mask] # generating mapper from new categories diff --git a/tests/test_utils.py b/tests/test_utils.py index f8f8862..3cf79d4 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -288,8 +288,7 @@ def test_window_factory_boundless() -> None: def test_assert_valid_categories() -> None: # almost all python objects can be represented by str so we just try casting and verify char length category_lengths = [10, 49, 50] - random_words = [ "".join(np.random.choice(list(ascii_lowercase), cl)) for cl in category_lengths] - random_words = np.array(random_words) + random_words = np.array(["".join(np.random.choice(list(ascii_lowercase), cl)) for cl in category_lengths]) _ = assert_valid_categories(random_words) @@ -299,8 +298,7 @@ def test_assert_valid_categories() -> None: # longer than Date: Wed, 13 Sep 2023 03:34:41 +0200 Subject: [PATCH 7/8] Dropped legacy env.yml --- full_environment.yml | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 full_environment.yml diff --git a/full_environment.yml b/full_environment.yml deleted file mode 100644 index b90fa2d..0000000 --- a/full_environment.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: geococo_dev -channels: - - conda-forge - - fastai -dependencies: - - python - - geopandas - - rasterio - - pydantic - - opencv-python-headless - - pycocotools - - tqdm - - pytest - - mypy - - black - - ruff - - pip - - pip: - - fiftyone -platforms: - - linux-64 - - win-64 \ No newline at end of file From 707154b40df7a866d5552f93ecf57542af456c85 Mon Sep 17 00:00:00 2001 From: Jasper Date: Wed, 13 Sep 2023 03:41:29 +0200 Subject: [PATCH 8/8] Formatters (black, docformatter, ruff) --- geococo/coco_models.py | 14 +++--- geococo/coco_processing.py | 45 +++++++++--------- geococo/utils.py | 93 +++++++++++++++++--------------------- tests/conftest.py | 8 +++- tests/test_coco_models.py | 30 ++++++------ tests/test_utils.py | 32 +++++++------ 6 files changed, 111 insertions(+), 111 deletions(-) diff --git a/geococo/coco_models.py b/geococo/coco_models.py index 603dee6..ee04052 100644 --- a/geococo/coco_models.py +++ b/geococo/coco_models.py @@ -2,15 +2,15 @@ import numpy as np import pathlib from datetime import datetime -from typing import List, Optional, Union, Dict +from typing import List, Optional, Dict from typing_extensions import TypedDict -from numpy.typing import ArrayLike from pydantic import BaseModel, ConfigDict, InstanceOf, model_validator from semver.version import Version from geococo.utils import assert_valid_categories -class CocoDataset(BaseModel): + +class CocoDataset(BaseModel): info: Info images: List[InstanceOf[Image]] = [] annotations: List[InstanceOf[Annotation]] = [] @@ -58,8 +58,8 @@ def add_source(self, source_path: pathlib.Path) -> None: def add_categories(self, categories: np.ndarray) -> None: # checking if categories are castable to str and under a certain size categories = assert_valid_categories(categories=np.unique(categories)) - - # filtering existing categories + + # filtering existing categories category_mask = np.isin(categories, list(self._category_mapper.keys())) new_categories = categories[~category_mask] @@ -70,7 +70,9 @@ def add_categories(self, categories: np.ndarray) -> None: # instance and append new Category objects to dataset for category_name, category_id in category_dict.items(): - category = Category(id = category_id, name = str(category_name), supercategory="1") + category = Category( + id=category_id, name=str(category_name), supercategory="1" + ) self.categories.append(category) # update existing category_mapper with new categories diff --git a/geococo/coco_processing.py b/geococo/coco_processing.py index 3d0b7bc..ef24802 100644 --- a/geococo/coco_processing.py +++ b/geococo/coco_processing.py @@ -27,34 +27,29 @@ def labels_to_dataset( src: DatasetReader, labels: gpd.GeoDataFrame, window_bounds: List[Tuple[int, int]], - category_attribute: str = "category_id" + category_attribute: str = "category_id", ) -> CocoDataset: """Move across a given geotiff, converting all intersecting labels to COCO annotations and appending them to a COCODataset model. This is done through - rasterio.Window objects, the bounds of which you can set with window_bounds - (also determines the size of the output images associated with the - Annotation instances). The degree of overlap between these windows is - determined by the dimensions of the given labels to maximize representation - in the resulting dataset. - - The "iscrowd" attribute (see - https://cocodataset.org/#format-data) - is determined by whether the respective labels are Polygon or - MultiPolygon instances. The "category_id" attribute, which - represents class or category identifiers, is expected to be present - in the given labels GeoDataFrame under the same name. - - :param dataset: CocoDataset model to append images and annotations - to + rasterio.Window objects, the bounds of which you can set with window_bounds (also + determines the size of the output images associated with the Annotation instances). + The degree of overlap between these windows is determined by the dimensions of the + given labels to maximize representation in the resulting dataset. + + The "iscrowd" attribute (see https://cocodataset.org/#format-data) is determined by + whether the respective labels are Polygon or MultiPolygon instances. The + "category_id" attribute, which represents class or category identifiers, is + expected to be present in the given labels GeoDataFrame under the same name. + + :param dataset: CocoDataset model to append images and annotations to :param images_dir: output directory for all label images :param src: open rasterio reader for input raster - :param labels: GeoDataFrame containing labels and class_info - ('category_id') + :param labels: GeoDataFrame containing labels and class_info ('category_id') :param window_bounds: a list of window_bounds to attempt to use () :param category_attribute: Column containing category_id values :return: The COCO dataset with appended Images and Annotations """ - + # Setting nodata and estimating window configuration parent_window = window_intersect(input_raster=src, input_vector=labels) nodata_value = src.nodata if src.nodata else 0 @@ -62,16 +57,16 @@ def labels_to_dataset( coco_profile.update({"dtype": np.uint8, "nodata": nodata_value, "driver": "JPEG"}) schema = estimate_schema(gdf=labels, src=src, window_bounds=window_bounds) n_windows = generate_window_offsets(window=parent_window, schema=schema).shape[0] - + # sets dataset.next_source_id and possibly bumps minor version dataset.add_source(source_path=pathlib.Path(src.name)) - + # bumps major version if images_dir has been used in this dataset before dataset.verify_new_output_dir(images_dir=images_dir) # sets dataset._category_mapper dataset.add_categories(categories=labels[category_attribute].unique()) - + for child_window in tqdm( window_factory(parent_window=parent_window, schema=schema), total=n_windows ): @@ -134,7 +129,9 @@ def labels_to_dataset( # Iteratively add Annotation models to dataset (also bumps next_annotation_id) with rasterio.open(window_image_path) as windowed_src: - for _, window_label in window_labels.sort_values(category_attribute).iterrows(): + for _, window_label in window_labels.sort_values( + category_attribute + ).iterrows(): label_mask = mask_label( input_raster=windowed_src, label=window_label.geometry ) @@ -147,7 +144,7 @@ def labels_to_dataset( iscrowd = 1 if isinstance(window_label.geometry, MultiPolygon) else 0 category_name = str(window_label[category_attribute]) category_id = dataset._category_mapper[category_name] - + annotation_instance = Annotation( id=dataset.next_annotation_id, image_id=dataset.next_image_id, diff --git a/geococo/utils.py b/geococo/utils.py index 10e2e8c..53948d2 100644 --- a/geococo/utils.py +++ b/geococo/utils.py @@ -13,14 +13,11 @@ def mask_label( input_raster: DatasetReader, label: Union[Polygon, MultiPolygon] ) -> np.ndarray: - """Masks out an label from input_raster and flattens it to a 2D binary - array. If it doesn't overlap, the resulting mask will only consist of False - bools. - - :param input_raster: open rasterio DatasetReader for the input - raster - :param label: Polygon object representing the area to be masked - (i.e. label) + """Masks out an label from input_raster and flattens it to a 2D binary array. If it + doesn't overlap, the resulting mask will only consist of False bools. + + :param input_raster: open rasterio DatasetReader for the input raster + :param label: Polygon object representing the area to be masked (i.e. label) :return: A 2D binary array representing the label """ @@ -39,14 +36,12 @@ def mask_label( def window_intersect( input_raster: DatasetReader, input_vector: gpd.GeoDataFrame ) -> Window: - """Generates a Rasterio Window from the intersecting extents of the input - data. It also verifies if the input data share the same CRS and if they - physically overlap. + """Generates a Rasterio Window from the intersecting extents of the input data. It + also verifies if the input data share the same CRS and if they physically overlap. :param input_raster: rasterio dataset (i.e. input image) :param input_vector: geopandas geodataframe (i.e. input labels) - :return: rasterio window that represent the intersection between - input data extents + :return: rasterio window that represent the intersection between input data extents """ if input_vector.crs != input_raster.crs: @@ -73,13 +68,11 @@ def window_intersect( def reshape_image( img_array: np.ndarray, shape: Tuple[int, int, int], padding_value: int = 0 ) -> np.ndarray: - """Reshapes 3D numpy array to match given 3D shape, done through slicing or - padding. + """Reshapes 3D numpy array to match given 3D shape, done through slicing or padding. :param img_array: the numpy array to be reshaped :param shape: the desired shape (bands, rows, cols) - :param padding_value: what value to pad img_array with (if too - small) + :param padding_value: what value to pad img_array with (if too small) :return: numpy array in desired shape """ @@ -98,14 +91,14 @@ def reshape_image( def generate_window_polygon(datasource: DatasetReader, window: Window) -> Polygon: - """Turns the spatial bounds of a given window to a shapely.Polygon object - in a given dataset's CRS. + """Turns the spatial bounds of a given window to a shapely.Polygon object in a given + dataset's CRS. - :param datasource: a rasterio DatasetReader object that provides the - affine transformation + :param datasource: a rasterio DatasetReader object that provides the affine + transformation :param window: bounds to represent as Polygon - :return: shapely Polygon representing the spatial bounds of a given - window in a given CRS + :return: shapely Polygon representing the spatial bounds of a given window in a + given CRS """ window_transform = datasource.window_transform(window) @@ -117,8 +110,7 @@ def generate_window_polygon(datasource: DatasetReader, window: Window) -> Polygo def generate_window_offsets(window: Window, schema: WindowSchema) -> np.ndarray: """Computes an array of window offsets bound by a given window. - :param window: the bounding window (i.e. offsets will be within its - bounds) + :param window: the bounding window (i.e. offsets will be within its bounds) :param schema: the parameters for the window generator :return: an array of window offsets within the bounds of window """ @@ -143,14 +135,14 @@ def generate_window_offsets(window: Window, schema: WindowSchema) -> np.ndarray: def window_factory( parent_window: Window, schema: WindowSchema, boundless: bool = True ) -> Generator[Window, None, None]: - """Generator that produces rasterio.Window objects in predetermined steps, - within the given Window. + """Generator that produces rasterio.Window objects in predetermined steps, within + the given Window. - :param parent_window: the window that provides the bounds for all - child_window objects + :param parent_window: the window that provides the bounds for all child_window + objects :param schema: the parameters that determine the window steps - :param boundless: whether the child_window should be clipped by the - parent_window or not + :param boundless: whether the child_window should be clipped by the parent_window or + not :yield: a rasterio.Window used for windowed reading/writing """ @@ -174,8 +166,7 @@ def estimate_average_bounds( ) -> Tuple[float, float]: """Estimates the average size of all features in a GeoDataFrame. - :param gdf: GeoDataFrame that contains all features (i.e. - shapely.Geometry objects) + :param gdf: GeoDataFrame that contains all features (i.e. shapely.Geometry objects) :param quantile: what quantile will represent the feature population :return: a tuple of floats representing average width and height """ @@ -195,19 +186,16 @@ def estimate_schema( quantile: float = 0.9, window_bounds: List[Tuple[int, int]] = [(256, 256), (512, 512)], ) -> WindowSchema: - """Attempts to find a schema that is able to represent the average - GeoDataFrame feature (i.e. sufficient overlap) but within the bounds given - by window_bounds. - - :param gdf: GeoDataFrame that contains features that determine the - degree of overlap - :param src: The rasterio DataSource associated with the resulting - schema (i.e. bounds and pixelsizes) + """Attempts to find a schema that is able to represent the average GeoDataFrame + feature (i.e. sufficient overlap) but within the bounds given by window_bounds. + + :param gdf: GeoDataFrame that contains features that determine the degree of overlap + :param src: The rasterio DataSource associated with the resulting schema (i.e. + bounds and pixelsizes) :param quantile: what quantile will represent the feature population - :param window_bounds: a list of possible limits for the window - generators - :return: (if found) a viable WindowSchema with sufficient overlap - within the window_bounds + :param window_bounds: a list of possible limits for the window generators + :return: (if found) a viable WindowSchema with sufficient overlap within the + window_bounds """ # estimating the required overlap between windows for labels to be represented fully @@ -243,10 +231,11 @@ def estimate_schema( return schema -def assert_valid_categories(categories: np.ndarray, max_dtype: str = " np.ndarray: - """ - Checks if all elements in categories array can be represented by strings - of a certain length (defaults to np.ndarray: + """Checks if all elements in categories array can be represented by strings of a + certain length (defaults to str_categories = categories.astype(str) except Exception as e: raise ValueError("Category values need to be castable to str") from e - + # checking if categories can be castable to str of a certain length (e.g. gpd.GeoDataFrame: polygons = [p.buffer(distance=buffers[i]) for i, p in enumerate(points)] labels = gpd.GeoDataFrame( - geometry=polygons, data={"category_id": classes, "class_names": class_names}, crs=crs + geometry=polygons, + data={"category_id": classes, "class_names": class_names}, + crs=crs, ) # type: ignore return labels @@ -114,7 +116,9 @@ def nonoverlapping_labels() -> gpd.GeoDataFrame: polygons = [p.buffer(distance=buffers[i]) for i, p in enumerate(points)] labels = gpd.GeoDataFrame( - geometry=polygons, data={"category_id": classes, "class_names": class_names}, crs=crs + geometry=polygons, + data={"category_id": classes, "class_names": class_names}, + crs=crs, ) # type: ignore return labels diff --git a/tests/test_coco_models.py b/tests/test_coco_models.py index 12aa3e7..a7c8e55 100644 --- a/tests/test_coco_models.py +++ b/tests/test_coco_models.py @@ -2,8 +2,6 @@ import numpy as np import pathlib from datetime import datetime -import geopandas as gpd -import pytest from geococo.coco_models import ( CocoDataset, Info, @@ -62,7 +60,7 @@ def test_dataset_add_images(): dataset = CocoDataset(info=Info()) assert dataset.next_annotation_id == 1 assert dataset.next_image_id == 1 - + n_images = np.random.randint(2, 10) for _ in range(n_images): @@ -78,6 +76,7 @@ def test_dataset_add_images(): assert n_images == dataset.next_image_id - 1 assert n_images == len(dataset.images) + def test_info(): """Simple instance test.""" @@ -132,9 +131,9 @@ def test_source(): def test_dataset_add_sources(): - """Checks proper incrementation of source_id""" + """Checks proper incrementation of source_id.""" - # Bit different from the other ids since we check for duplication + # Bit different from the other ids since we check for duplication # and only increment if new dataset = CocoDataset(info=Info()) assert dataset.next_source_id == 0 @@ -147,11 +146,11 @@ def test_dataset_add_sources(): def test_dataset_versions(): - """Checks proper incrementation of dataset versions""" + """Checks proper incrementation of dataset versions.""" dataset = CocoDataset(info=Info()) assert dataset.info.version == "0.0.0" - + # minor bump if same output_dir but different raster_source dataset.add_source(source_path=pathlib.Path("a")) assert dataset.info.version == "0.1.0" @@ -164,13 +163,14 @@ def test_dataset_versions(): dataset.verify_new_output_dir(images_dir=pathlib.Path("b")) assert dataset.info.version == "1.0.0" + def test_add_categories(): - """Checks independent mapping of category_attribute to class_ids""" + """Checks independent mapping of category_attribute to class_ids.""" dataset = CocoDataset(info=Info()) assert dataset.categories == [] assert dataset._category_mapper == {} - + # adding three unique classes categories = np.array(["A", "B", "B", "E", "E"]) dataset.add_categories(categories=categories) @@ -179,12 +179,16 @@ def test_add_categories(): assert np.unique(categories).size == len(dataset._category_mapper) assert np.unique(categories).size == len(dataset.categories) assert np.all(np.diff(list(dataset._category_mapper.values())) == 1) - - #check if existing key value pairs don't change + + # check if existing key value pairs don't change # done by adding a bunch of existing classes and one new one initial_mapper = dataset._category_mapper.copy() categories = np.array(["One", "Two", "Two", "Five", "Five", "Six", "Six"]) dataset.add_categories(categories=categories) - subset_mapper = {key: value for key, value in dataset._category_mapper.items() if key in initial_mapper.keys()} + subset_mapper = { + key: value + for key, value in dataset._category_mapper.items() + if key in initial_mapper.keys() + } assert initial_mapper == subset_mapper - assert np.all(np.diff(list(dataset._category_mapper.values())) == 1) \ No newline at end of file + assert np.all(np.diff(list(dataset._category_mapper.values())) == 1) diff --git a/tests/test_utils.py b/tests/test_utils.py index 3cf79d4..9198e35 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -12,7 +12,7 @@ estimate_average_bounds, estimate_schema, mask_label, - assert_valid_categories + assert_valid_categories, ) from geococo.window_schema import WindowSchema import geopandas as gpd @@ -57,7 +57,7 @@ def test_window_intersect( overlapping_labels: gpd.GeoDataFrame, test_raster: pathlib.Path ) -> None: with rasterio.open(test_raster) as raster_source: - # outer polygons have diameter of 2 so resulting window + # outer polygons have diameter of 2 so resulting window # is 82x82 and offset is 9 (10-2/2) window = window_intersect( input_raster=raster_source, input_vector=overlapping_labels @@ -284,27 +284,31 @@ def test_window_factory_boundless() -> None: assert np.any(window_extents[:, 1] >= window.height) - def test_assert_valid_categories() -> None: - # almost all python objects can be represented by str so we just try casting and verify char length + # almost all python objects can be represented by str + # so we just try casting and verify char length category_lengths = [10, 49, 50] - random_words = np.array(["".join(np.random.choice(list(ascii_lowercase), cl)) for cl in category_lengths]) - + random_words = np.array( + [ + "".join(np.random.choice(list(ascii_lowercase), cl)) + for cl in category_lengths + ] + ) + _ = assert_valid_categories(random_words) - # float64 + # float64 random_numbers = np.random.randn(3).astype(np.float64) _ = assert_valid_categories(random_numbers) # longer than