Add validators for bboxes annotation files (#32)
* Add json schemas

* Add validators for VIA and COCO files

* Update MANIFEST

* Add tests for supported validators

* Combine validators tests

* Simplify JSON check

* Delete placeholder

* Rename schemas

* Small edits caps

* Update docstrings

* Incorporate Niko's comments

* Remove spell check suggestion from error message

* Combine get_default_schema into 1

* Clarify init=False

* Missing verb

* log_message --> error_message

* A simplified version of _extract_properties_keys based on Niko's
sfmig authored Jan 29, 2025
1 parent c7e2f7d commit 3d0a1ba
Showing 9 changed files with 933 additions and 2 deletions.
4 changes: 4 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -6,3 +6,7 @@ recursive-exclude * __pycache__
recursive-exclude * *.py[co]
recursive-exclude docs *
recursive-exclude tests *

# Include json schemas
recursive-include ethology/annotations/json_schemas/schemas *.json
recursive-include ethology/annotations/json_schemas/schemas *.md
Empty file.
78 changes: 78 additions & 0 deletions ethology/annotations/json_schemas/schemas/COCO_schema.json
@@ -0,0 +1,78 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"type": "object",
"properties": {
"info": {
"type": "object"
},
"licenses": {
"type": "array"
},
"images": {
"type": "array",
"items": {
"type": "object",
"properties": {
"file_name": {
"type": "string"
},
"id": {
"type": "integer"
},
"width": {
"type": "integer"
},
"height": {
"type": "integer"
}
}
}
},
"annotations": {
"type": "array",
"items": {
"type": "object",
"properties": {
"id": {
"type": "integer"
},
"image_id": {
"type": "integer"
},
"bbox": {
"type": "array",
"items": {
"type": "integer"
}
},
"category_id": {
"type": "integer"
},
"area": {
"type": "number"
},
"iscrowd": {
"type": "integer"
}
}
}
},
"categories": {
"type": "array",
"items": {
"type": "object",
"properties": {
"id": {
"type": "integer"
},
"name": {
"type": "string"
},
"supercategory": {
"type": "string"
}
}
}
}
}
}
32 changes: 32 additions & 0 deletions ethology/annotations/json_schemas/schemas/README.md
@@ -0,0 +1,32 @@
## JSON schemas for manual annotation files

We use JSON schemas to validate the types of the keys in supported annotation files.

Note that the schema validation only checks the type of a key if that key is present; it does not check for the presence of the keys.

If the meta-schema (under `$schema`) is not provided, the `jsonschema` validator uses the latest released draft of the JSON Schema specification.
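Since presence is not enforced, a file containing only a subset of the schema's keys still validates. For example, this minimal (hypothetical) COCO-style file passes the COCO schema even though `info`, `licenses`, and `categories` are absent:

```json
{
  "images": [{"file_name": "frame_001.png", "id": 1}],
  "annotations": [{"id": 1, "image_id": 1, "bbox": [10, 20, 30, 40]}]
}
```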

## VIA schema

The VIA schema corresponds to the format exported by VGG Image Annotator 2.x.y (VIA) for object detection annotations.

Each image under `_via_img_metadata` is indexed using a unique key of the form `FILENAME-FILESIZE`. We use `additionalProperties` to allow for arbitrary key names; see https://stackoverflow.com/a/69811612/24834957.

The section `_via_image_id_list` contains an ordered list of the image keys (`FILENAME-FILESIZE`); the position of a key in this list defines the image ID.

The section `_via_attributes` contains region attributes and file attributes, to display in VIA's UI and to classify the data.

The section `_via_data_format_version` contains the version of the VIA tool used.


## COCO schema
The COCO schema follows the COCO dataset format for object detection, see https://cocodataset.org/#format-data.

Box coordinates are measured from the top left corner of the image, and are 0-indexed.
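COCO stores each bbox as `[x, y, width, height]`, with `(x, y)` the top-left corner. A minimal sketch of converting to corner coordinates (the helper name is ours, not part of this PR):

```python
def bbox_xywh_to_corners(bbox: list[float]) -> tuple[float, float, float, float]:
    """Convert a COCO [x, y, width, height] bbox to (x_min, y_min, x_max, y_max)."""
    x, y, w, h = bbox
    return (x, y, x + w, y + h)


print(bbox_xywh_to_corners([10, 20, 30, 40]))  # (10, 20, 40, 60)
```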
### References
- https://github.com/python-jsonschema/jsonschema
- https://json-schema.org/understanding-json-schema/
- https://cocodataset.org/#format-data
- https://gitlab.com/vgg/via/-/blob/master/via-2.x.y/CodeDoc.md?ref_type=heads#description-of-via-project-json-file
- https://python-jsonschema.readthedocs.io/en/stable/api/#jsonschema.validate
88 changes: 88 additions & 0 deletions ethology/annotations/json_schemas/schemas/VIA_schema.json
@@ -0,0 +1,88 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"type": "object",
"properties": {
"_via_settings": {
"type": "object",
"properties": {
"ui": {
"type": "object"
},
"core": {
"type": "object"
},
"project": {
"type": "object"
}
}
},
"_via_img_metadata": {
"type": "object",
"additionalProperties": {
"type": "object",
"properties": {
"filename": {
"type": "string"
},
"size": {
"type": "integer"
},
"regions": {
"type": "array",
"items": {
"type": "object",
"properties": {
"shape_attributes": {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"x": {
"type": "integer"
},
"y": {
"type": "integer"
},
"width": {
"type": "integer"
},
"height": {
"type": "integer"
}
}
},
"region_attributes": {
"type": "object"
}
}
}
},
"file_attributes": {
"type": "object"
}
}
}
},
"_via_image_id_list": {
"type": "array",
"items": {
"type": "string"
}
},
"_via_attributes": {
"type": "object",
"properties": {
"region": {
"type": "object"
},
"file": {
"type": "object"
}
}
},
"_via_data_format_version": {
"type": "string"
}
}
}
130 changes: 130 additions & 0 deletions ethology/annotations/json_schemas/utils.py
@@ -0,0 +1,130 @@
"""Utility functions for JSON schema files."""

import json
from pathlib import Path

import jsonschema


def _get_default_schema(schema_name: str) -> dict:
"""Get the default VIA or COCO schema as a dictionary."""
schema_path = (
Path(__file__).parent / "schemas" / f"{schema_name}_schema.json"
)
with open(schema_path) as file:
schema_dict = json.load(file)
return schema_dict


def _check_file_is_json(filepath: Path):
    """Check that the input file can be read as JSON."""
try:
with open(filepath) as file:
json.load(file)
except json.JSONDecodeError as decode_error:
# We override the error message for clarity
        raise ValueError(
            f"Error decoding JSON data from file: {filepath}. "
            "The file does not contain valid JSON."
        ) from decode_error
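The pattern above, re-raising `json.JSONDecodeError` as a `ValueError` with a clearer message, can be exercised with a throwaway file; a standalone sketch mirroring `_check_file_is_json`:

```python
import json
import tempfile
from pathlib import Path


def check_file_is_json(filepath: Path) -> None:
    """Raise a ValueError with a clearer message if the file is not valid JSON."""
    try:
        with open(filepath) as file:
            json.load(file)
    except json.JSONDecodeError as decode_error:
        raise ValueError(
            f"Error decoding JSON data from file: {filepath}."
        ) from decode_error


# Write a deliberately malformed file to exercise the error path.
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    f.write("{not valid json")
    bad_path = Path(f.name)

try:
    check_file_is_json(bad_path)
except ValueError as err:
    print(err)  # the clearer message, chained to the original JSONDecodeError
```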


def _check_file_matches_schema(filepath: Path, schema: dict | None):
    """Check that the input JSON file matches the given schema.

    The schema validation only checks the type for each specified
    key if the key exists. It does not check that the keys in the
    schema are present in the JSON file.
    """
# Read json file
with open(filepath) as file:
data = json.load(file)

# Check against schema if provided
if schema:
jsonschema.validate(instance=data, schema=schema)


def _check_required_properties_keys(
required_properties_keys: list, schema: dict
):
"""Check the input schema includes the required "properties" keys."""
# Get keys of "properties" dictionaries in schema
properties_keys_in_schema = _extract_properties_keys(schema)

# Get list of "properties" keys that are required but not in schema
missing_keys = set(required_properties_keys) - set(
properties_keys_in_schema
)

# Raise error if there are missing keys in the schema
if missing_keys:
        raise ValueError(
            f"Required key(s) {sorted(missing_keys)} not found in schema."
        )


def _check_required_keys_in_dict(
list_required_keys: list[str],
data: dict,
additional_message: str = "",
):
"""Check if the required keys are present in the input dictionary."""
missing_keys = set(list_required_keys) - set(data.keys())
if missing_keys:
raise ValueError(
f"Required key(s) {sorted(missing_keys)} not "
f"found{additional_message}."
)
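The check reduces to a set difference between the required and the present keys; a standalone sketch with hypothetical annotation data:

```python
def check_required_keys(required: list[str], data: dict, context: str = "") -> None:
    """Raise a ValueError listing any required keys missing from the dict."""
    missing = set(required) - set(data)
    if missing:
        raise ValueError(f"Required key(s) {sorted(missing)} not found{context}.")


annotation = {"id": 1, "image_id": 7}  # hypothetical COCO-style annotation
check_required_keys(["id", "image_id"], annotation)  # passes silently

try:
    check_required_keys(["id", "image_id", "bbox"], annotation, " in annotation")
except ValueError as err:
    print(err)  # Required key(s) ['bbox'] not found in annotation.
```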


def _extract_properties_keys(input_schema: dict, prefix: str = "") -> list:
    """Extract keys from all "properties" subdictionaries in a JSON schema.

    Recursively extract the keys of all subdictionaries in the input
    dictionary that are values to a "properties" key. The input dictionary
    represents a JSON schema
    (see https://json-schema.org/understanding-json-schema/about). The output
    is a sorted list of strings with full paths (e.g. 'parent/child').

    The "properties" key always appears alongside at least one other key,
    typically "type" or "items". We use this to find the relevant
    subdictionaries.
    """
result: list[str] = []

# Skip if "type" key is missing in the schema
if "type" not in input_schema:
return result

# If the input dictionary has a "properties" key: extract keys
# and recurse into nested dictionaries
if "properties" in input_schema:
for key, value in input_schema["properties"].items():
full_key = f"{prefix}/{key}" if prefix else key
result.append(full_key)
# Recurse into nested dictionaries to look for more "properties"
# dicts
result.extend(_extract_properties_keys(value, full_key))

# If dictionary has "additionalProperties" key: recurse into it
if "additionalProperties" in input_schema:
result.extend(
_extract_properties_keys(
input_schema["additionalProperties"],
prefix,
)
)

# If dictionary has "items" key: recurse into it
if "items" in input_schema:
result.extend(
_extract_properties_keys(
input_schema["items"],
prefix,
)
)

# Return sorted list of keys with full paths
return sorted(result)
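On a small schema this traversal yields full paths such as `images/id`; a self-contained sketch of the same recursion (simplified to dict-valued `additionalProperties`):

```python
def extract_properties_keys(schema: dict, prefix: str = "") -> list[str]:
    """Collect the keys of every "properties" dict in a schema, as full paths."""
    result: list[str] = []
    if "type" not in schema:
        return result
    for key, value in schema.get("properties", {}).items():
        full_key = f"{prefix}/{key}" if prefix else key
        result.append(full_key)
        result.extend(extract_properties_keys(value, full_key))
    # Nested schemas under "additionalProperties" and "items" keep the
    # current prefix, since they describe values rather than named keys.
    for nested_key in ("additionalProperties", "items"):
        if isinstance(schema.get(nested_key), dict):
            result.extend(extract_properties_keys(schema[nested_key], prefix))
    return sorted(result)


mini_schema = {
    "type": "object",
    "properties": {
        "images": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {"id": {"type": "integer"}},
            },
        }
    },
}
print(extract_properties_keys(mini_schema))  # ['images', 'images/id']
```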