Skip to content

Commit

Permalink
Add json schemas
Browse files Browse the repository at this point in the history
  • Loading branch information
sfmig committed Jan 21, 2025
1 parent 49fc060 commit 9bdcffd
Show file tree
Hide file tree
Showing 6 changed files with 361 additions and 0 deletions.
4 changes: 4 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,7 @@ recursive-exclude * __pycache__
recursive-exclude * *.py[co]
recursive-exclude docs *
recursive-exclude tests *

# Include json schemas
recursive-include ethology *.json
recursive-include ethology *.md
Empty file.
32 changes: 32 additions & 0 deletions ethology/annotations/json_schemas/schemas/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
## JSON schemas for manual annotation files

We use JSON schemas to validate the types of a supported annotation file.

Note that the schema validation only checks the type of a key if that key is present. It does not check for the presence of the keys.

If the meta-schema (under $schema) is not provided, the jsonschema validator uses the latest released draft of the JSON schema specification.

## VIA schema

The VIA schema corresponds to the format exported by VGG Image Annotator 2.x.y (VIA) for object detection annotations.

Each image under `_via_img_metadata` is indexed using a unique key: FILENAME-FILESIZE. We use "additionalProperties" to allow for any key name, see https://stackoverflow.com/a/69811612/24834957.

The section `_via_image_id_list` contains an ordered list of image keys (each a unique `FILENAME-FILESIZE` string); the position of a key in the list defines the image ID.

The section `_via_attributes` contains region attributes and file attributes, to display in VIA's UI and to classify the data.

The section `_via_data_format_version` contains the version of the VIA tool used.


## COCO schema
The COCO schema follows the COCO dataset format for object detection, see https://cocodataset.org/#format-data.

Box coordinates are measured from the top left corner of the image, and are 0-indexed.
### References
- https://github.com/python-jsonschema/jsonschema
- https://json-schema.org/understanding-json-schema/
- https://cocodataset.org/#format-data
- https://gitlab.com/vgg/via/-/blob/master/via-2.x.y/CodeDoc.md?ref_type=heads#description-of-via-project-json-file
- https://python-jsonschema.readthedocs.io/en/stable/api/#jsonschema.validate
78 changes: 78 additions & 0 deletions ethology/annotations/json_schemas/schemas/coco_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"type": "object",
"properties": {
"info": {
"type": "object"
},
"licenses": {
"type": "array"
},
"images": {
"type": "array",
"items": {
"type": "object",
"properties": {
"file_name": {
"type": "string"
},
"id": {
"type": "integer"
},
"width": {
"type": "integer"
},
"height": {
"type": "integer"
}
}
}
},
"annotations": {
"type": "array",
"items": {
"type": "object",
"properties": {
"id": {
"type": "integer"
},
"image_id": {
"type": "integer"
},
"bbox": {
"type": "array",
"items": {
"type": "integer"
}
},
"category_id": {
"type": "integer"
},
"area": {
"type": "number"
},
"iscrowd": {
"type": "integer"
}
}
}
},
"categories": {
"type": "array",
"items": {
"type": "object",
"properties": {
"id": {
"type": "integer"
},
"name": {
"type": "string"
},
"supercategory": {
"type": "string"
}
}
}
}
}
}
88 changes: 88 additions & 0 deletions ethology/annotations/json_schemas/schemas/via_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"type": "object",
"properties": {
"_via_settings": {
"type": "object",
"properties": {
"ui": {
"type": "object"
},
"core": {
"type": "object"
},
"project": {
"type": "object"
}
}
},
"_via_img_metadata": {
"type": "object",
"additionalProperties": {
"type": "object",
"properties": {
"filename": {
"type": "string"
},
"size": {
"type": "integer"
},
"regions": {
"type": "array",
"items": {
"type": "object",
"properties": {
"shape_attributes": {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"x": {
"type": "integer"
},
"y": {
"type": "integer"
},
"width": {
"type": "integer"
},
"height": {
"type": "integer"
}
}
},
"region_attributes": {
"type": "object"
}
}
}
},
"file_attributes": {
"type": "object"
}
}
}
},
"_via_image_id_list": {
"type": "array",
"items": {
"type": "string"
}
},
"_via_attributes": {
"type": "object",
"properties": {
"region": {
"type": "object"
},
"file": {
"type": "object"
}
}
},
"_via_data_format_version": {
"type": "string"
}
}
}
159 changes: 159 additions & 0 deletions ethology/annotations/json_schemas/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
"""Utility functions for JSON schema files."""

import json
from pathlib import Path

import jsonschema
import jsonschema.exceptions


def _get_default_VIA_schema() -> dict:
    """Load the bundled VIA JSON schema as a dictionary.

    The schema file ships with the package under
    ``json_schemas/schemas/via_schema.json``.
    """
    schema_path = Path(__file__).parent / "schemas" / "via_schema.json"
    return json.loads(schema_path.read_text())


def _get_default_COCO_schema() -> dict:
    """Load the bundled COCO JSON schema as a dictionary.

    The schema file ships with the package under
    ``json_schemas/schemas/coco_schema.json``.
    """
    schema_path = Path(__file__).parent / "schemas" / "coco_schema.json"
    return json.loads(schema_path.read_text())


def _check_file_is_json(filepath: Path):
"""Ensure that the file is a JSON file."""
try:
with open(filepath) as file:
json.load(file)
except FileNotFoundError as not_found_error:
raise FileNotFoundError(
f"File not found: {filepath}."
) from not_found_error
except json.JSONDecodeError as decode_error:
raise ValueError(
f"Error decoding JSON data from file: {filepath}."
) from decode_error


def _check_file_matches_schema(filepath: Path, schema: dict):
    """Ensure that the JSON file matches the expected schema.

    The schema validation only checks the type for each specified
    key if the key exists. It does not check that the keys in the
    schema are present in the JSON file.

    Parameters
    ----------
    filepath : Path
        Path to the JSON file to validate.
    schema : dict
        JSON schema to validate the file against. If empty/falsy,
        no validation is performed.

    Raises
    ------
    jsonschema.exceptions.ValidationError
        If the data does not conform to the schema.
    jsonschema.exceptions.SchemaError
        If the schema itself is not a valid JSON schema.
    """
    # read json file
    with open(filepath) as file:
        data = json.load(file)

    # Check against schema if provided. The previous try/except blocks
    # caught ValidationError and SchemaError only to re-raise them
    # unchanged (a no-op), so errors now simply propagate to the caller.
    if schema:
        jsonschema.validate(instance=data, schema=schema)


def _check_required_properties_keys(
    required_properties_keys: list, schema: dict
):
    """Ensure that the input schema includes the required "properties" keys.

    Raises
    ------
    ValueError
        If any of the required keys is absent from the schema's
        "properties" dictionaries.
    """
    # Keys actually declared under "properties" dicts in the schema
    schema_keys = set(_extract_properties_keys(schema))

    # Required keys that the schema does not declare
    missing_keys = set(required_properties_keys) - schema_keys
    if missing_keys:
        raise ValueError(
            f"Required key(s) {sorted(missing_keys)} not found "
            "in schema. Note that "
            "a key may not be found correctly if the schema keywords "
            "(such as 'properties', 'type' or 'items') are not spelt "
            "correctly."
        )


def _check_required_keys_in_dict(
list_required_keys: list[str],
data: dict,
additional_message: str = "",
):
"""Check if the required keys are present in the input data_dict."""
missing_keys = set(list_required_keys) - data.keys()
if missing_keys:
raise ValueError(
f"Required key(s) {sorted(missing_keys)} not "
f"found{additional_message}."
)


def _extract_properties_keys(schema: dict, parent_key="") -> list:
"""Recursively extract the keys of all "properties" subdictionaries.
Recursively extract the keys of all subdictionaries in the input
dictionary that are values to a "properties" key. The input dictionary
represents a JSON schema dictionary
(see https://json-schema.org/understanding-json-schema/about).
The "properties" key always appears as part of a dictionary with at least
another key, that is "type" or "item".
"""
# The "property keys" are either "properties" or "additionalProperties"
# as they are the keys with the relevant data
property_keys = ["properties", "additionalProperties"]

def _contains_properties_key(input: dict):
"""Return True if the input dictionary contains a property key."""
return any(x in input for x in property_keys)

def _get_properties_subdict(input: dict):
"""Get the subdictionary under the property key."""
return input[next(k for k in property_keys if k in input)]

keys_of_properties_dicts = []
if "type" in schema:
if _contains_properties_key(schema):
# Get the subdictionary under the properties key
properties_subdict = _get_properties_subdict(schema)

# Check if there is a nested "properties" dict inside the current
# one. If so, go down one level.
if _contains_properties_key(properties_subdict):
properties_subdict = _get_properties_subdict(
properties_subdict
)

# Add keys of deepest "properties dict" to list
keys_of_properties_dicts.extend(
[
f"{parent_key}/{ky}" if parent_key else ky
for ky in properties_subdict
]
)

# Inspect non-properties dictionaries under this properties subdict
for ky, val in properties_subdict.items():
full_key = f"{parent_key}/{ky}" if parent_key else ky
keys_of_properties_dicts.extend(
_extract_properties_keys(val, full_key)
)

elif "items" in schema:
# Analyse the dictionary under the "items" key
properties_subdict = schema["items"]
keys_of_properties_dicts.extend(
_extract_properties_keys(
properties_subdict, parent_key=parent_key
)
)

return sorted(keys_of_properties_dicts)

0 comments on commit 9bdcffd

Please sign in to comment.