Skip to content

Commit

Permalink
Add json schemas
Browse files Browse the repository at this point in the history
  • Loading branch information
sfmig committed Jan 21, 2025
1 parent 49fc060 commit 9bdcffd
Show file tree
Hide file tree
Showing 6 changed files with 361 additions and 0 deletions.
4 changes: 4 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,7 @@ recursive-exclude * __pycache__
recursive-exclude * *.py[co]
recursive-exclude docs *
recursive-exclude tests *

# Include json schemas
recursive-include ethology *.json
recursive-include ethology *.md
Empty file.
32 changes: 32 additions & 0 deletions ethology/annotations/json_schemas/schemas/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
## JSON schemas for manual annotation files

We use JSON schemas to validate the types of a supported annotation file.

Note that the schema validation only checks the type of a key if that key is present. It does not check for the presence of the keys.

If the meta-schema (under $schema) is not provided, the jsonschema validator uses the latest released draft of the JSON schema specification.

## VIA schema

The VIA schema corresponds to the format exported by VGG Image Annotator 2.x.y (VIA) for object detection annotations.

Each image under `_via_img_metadata` is indexed using a unique key: FILENAME-FILESIZE. We use "additionalProperties" to allow for any key name, see https://stackoverflow.com/a/69811612/24834957.

The section `_via_image_id_list` contains an ordered list of image keys (each a unique `FILENAME-FILESIZE` string); the position of a key in the list defines the image ID.

The section `_via_attributes` contains region attributes and file attributes, to display in VIA's UI and to classify the data.

The section `_via_data_format_version` contains the version of the VIA tool used.


## COCO schema
The COCO schema follows the COCO dataset format for object detection, see https://cocodataset.org/#format-data.

Box coordinates are measured from the top left corner of the image, and are 0-indexed.
### References
- https://github.com/python-jsonschema/jsonschema
- https://json-schema.org/understanding-json-schema/
- https://cocodataset.org/#format-data
- https://gitlab.com/vgg/via/-/blob/master/via-2.x.y/CodeDoc.md?ref_type=heads#description-of-via-project-json-file
- https://python-jsonschema.readthedocs.io/en/stable/api/#jsonschema.validate
78 changes: 78 additions & 0 deletions ethology/annotations/json_schemas/schemas/coco_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"type": "object",
"properties": {
"info": {
"type": "object"
},
"licenses": {
"type": "array"
},
"images": {
"type": "array",
"items": {
"type": "object",
"properties": {
"file_name": {
"type": "string"
},
"id": {
"type": "integer"
},
"width": {
"type": "integer"
},
"height": {
"type": "integer"
}
}
}
},
"annotations": {
"type": "array",
"items": {
"type": "object",
"properties": {
"id": {
"type": "integer"
},
"image_id": {
"type": "integer"
},
"bbox": {
"type": "array",
"items": {
"type": "integer"
}
},
"category_id": {
"type": "integer"
},
"area": {
"type": "number"
},
"iscrowd": {
"type": "integer"
}
}
}
},
"categories": {
"type": "array",
"items": {
"type": "object",
"properties": {
"id": {
"type": "integer"
},
"name": {
"type": "string"
},
"supercategory": {
"type": "string"
}
}
}
}
}
}
88 changes: 88 additions & 0 deletions ethology/annotations/json_schemas/schemas/via_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"type": "object",
"properties": {
"_via_settings": {
"type": "object",
"properties": {
"ui": {
"type": "object"
},
"core": {
"type": "object"
},
"project": {
"type": "object"
}
}
},
"_via_img_metadata": {
"type": "object",
"additionalProperties": {
"type": "object",
"properties": {
"filename": {
"type": "string"
},
"size": {
"type": "integer"
},
"regions": {
"type": "array",
"items": {
"type": "object",
"properties": {
"shape_attributes": {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"x": {
"type": "integer"
},
"y": {
"type": "integer"
},
"width": {
"type": "integer"
},
"height": {
"type": "integer"
}
}
},
"region_attributes": {
"type": "object"
}
}
}
},
"file_attributes": {
"type": "object"
}
}
}
},
"_via_image_id_list": {
"type": "array",
"items": {
"type": "string"
}
},
"_via_attributes": {
"type": "object",
"properties": {
"region": {
"type": "object"
},
"file": {
"type": "object"
}
}
},
"_via_data_format_version": {
"type": "string"
}
}
}
159 changes: 159 additions & 0 deletions ethology/annotations/json_schemas/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
"""Utility functions for JSON schema files."""

import json
from pathlib import Path

import jsonschema
import jsonschema.exceptions


def _get_default_VIA_schema() -> dict:
    """Load the bundled VIA JSON schema as a dictionary.

    The schema file ships with the package under
    ``json_schemas/schemas/via_schema.json``.
    """
    schema_path = Path(__file__).parent / "schemas" / "via_schema.json"
    return json.loads(schema_path.read_text())


def _get_default_COCO_schema() -> dict:
    """Load the bundled COCO JSON schema as a dictionary.

    The schema file ships with the package under
    ``json_schemas/schemas/coco_schema.json``.
    """
    schema_path = Path(__file__).parent / "schemas" / "coco_schema.json"
    return json.loads(schema_path.read_text())


def _check_file_is_json(filepath: Path):
"""Ensure that the file is a JSON file."""
try:
with open(filepath) as file:
json.load(file)
except FileNotFoundError as not_found_error:
raise FileNotFoundError(
f"File not found: {filepath}."
) from not_found_error
except json.JSONDecodeError as decode_error:
raise ValueError(
f"Error decoding JSON data from file: {filepath}."
) from decode_error


def _check_file_matches_schema(filepath: Path, schema: dict):
    """Ensure that the JSON file matches the expected schema.

    The schema validation only checks the type for each specified
    key if the key exists. It does not check that the keys in the
    schema are present in the JSON file.

    Parameters
    ----------
    filepath : Path
        Path to the JSON file to validate.
    schema : dict
        JSON schema to validate the file against. If empty/falsy,
        no validation is performed.

    Raises
    ------
    jsonschema.exceptions.ValidationError
        If the data does not conform to the schema.
    jsonschema.exceptions.SchemaError
        If the schema itself is not a valid JSON schema.
    """
    # read json file
    with open(filepath) as file:
        data = json.load(file)

    # Check against schema if provided. The previous try/except blocks
    # caught ValidationError and SchemaError only to re-raise them
    # unchanged (a no-op), so errors now simply propagate to the caller.
    if schema:
        jsonschema.validate(instance=data, schema=schema)


def _check_required_properties_keys(
    required_properties_keys: list, schema: dict
):
    """Ensure that the input schema includes the required "properties" keys.

    Raises
    ------
    ValueError
        If any of the required keys is absent from the schema's
        "properties" dictionaries.
    """
    # Keys actually declared under "properties" dicts in the schema
    schema_keys = set(_extract_properties_keys(schema))

    # Required keys that the schema does not declare
    missing_keys = set(required_properties_keys) - schema_keys
    if missing_keys:
        raise ValueError(
            f"Required key(s) {sorted(missing_keys)} not found "
            "in schema. Note that "
            "a key may not be found correctly if the schema keywords "
            "(such as 'properties', 'type' or 'items') are not spelt "
            "correctly."
        )


def _check_required_keys_in_dict(
list_required_keys: list[str],
data: dict,
additional_message: str = "",
):
"""Check if the required keys are present in the input data_dict."""
missing_keys = set(list_required_keys) - data.keys()
if missing_keys:
raise ValueError(
f"Required key(s) {sorted(missing_keys)} not "
f"found{additional_message}."
)


def _extract_properties_keys(schema: dict, parent_key="") -> list:
"""Recursively extract the keys of all "properties" subdictionaries.
Recursively extract the keys of all subdictionaries in the input
dictionary that are values to a "properties" key. The input dictionary
represents a JSON schema dictionary
(see https://json-schema.org/understanding-json-schema/about).
The "properties" key always appears as part of a dictionary with at least
another key, that is "type" or "item".
"""
# The "property keys" are either "properties" or "additionalProperties"
# as they are the keys with the relevant data
property_keys = ["properties", "additionalProperties"]

def _contains_properties_key(input: dict):
"""Return True if the input dictionary contains a property key."""
return any(x in input for x in property_keys)

def _get_properties_subdict(input: dict):
"""Get the subdictionary under the property key."""
return input[next(k for k in property_keys if k in input)]

keys_of_properties_dicts = []
if "type" in schema:
if _contains_properties_key(schema):
# Get the subdictionary under the properties key
properties_subdict = _get_properties_subdict(schema)

# Check if there is a nested "properties" dict inside the current
# one. If so, go down one level.
if _contains_properties_key(properties_subdict):
properties_subdict = _get_properties_subdict(
properties_subdict
)

# Add keys of deepest "properties dict" to list
keys_of_properties_dicts.extend(
[
f"{parent_key}/{ky}" if parent_key else ky
for ky in properties_subdict
]
)

# Inspect non-properties dictionaries under this properties subdict
for ky, val in properties_subdict.items():
full_key = f"{parent_key}/{ky}" if parent_key else ky
keys_of_properties_dicts.extend(
_extract_properties_keys(val, full_key)
)

elif "items" in schema:
# Analyse the dictionary under the "items" key
properties_subdict = schema["items"]
keys_of_properties_dicts.extend(
_extract_properties_keys(
properties_subdict, parent_key=parent_key
)
)

return sorted(keys_of_properties_dicts)

0 comments on commit 9bdcffd

Please sign in to comment.