-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add validators for bboxes annotation files (#32)
* Add json schemas * Add validators for VIA and COCO files * Update MANIFEST * Add tests for supported validators * Combine validators tests * Simplify JSON check * Delete placeholder * Rename schemas * Small edits caps * Update docstrings * Incorporate Niko's comments * Remove spell check suggestion from error message * Combine get_default_schema into 1 * Clarify init=False * Missing verb * log_message --> error_message * A simplified version of _extract_properties_keys based on Niko's
- Loading branch information
Showing
9 changed files
with
933 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
78 changes: 78 additions & 0 deletions
78
ethology/annotations/json_schemas/schemas/COCO_schema.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
{ | ||
"$schema": "https://json-schema.org/draft/2020-12/schema", | ||
"type": "object", | ||
"properties": { | ||
"info": { | ||
"type": "object" | ||
}, | ||
"licenses": { | ||
"type": "array" | ||
}, | ||
"images": { | ||
"type": "array", | ||
"items": { | ||
"type": "object", | ||
"properties": { | ||
"file_name": { | ||
"type": "string" | ||
}, | ||
"id": { | ||
"type": "integer" | ||
}, | ||
"width": { | ||
"type": "integer" | ||
}, | ||
"height": { | ||
"type": "integer" | ||
} | ||
} | ||
} | ||
}, | ||
"annotations": { | ||
"type": "array", | ||
"items": { | ||
"type": "object", | ||
"properties": { | ||
"id": { | ||
"type": "integer" | ||
}, | ||
"image_id": { | ||
"type": "integer" | ||
}, | ||
"bbox": { | ||
"type": "array", | ||
"items": { | ||
"type": "integer" | ||
} | ||
}, | ||
"category_id": { | ||
"type": "integer" | ||
}, | ||
"area": { | ||
"type": "number" | ||
}, | ||
"iscrowd": { | ||
"type": "integer" | ||
} | ||
} | ||
} | ||
}, | ||
"categories": { | ||
"type": "array", | ||
"items": { | ||
"type": "object", | ||
"properties": { | ||
"id": { | ||
"type": "integer" | ||
}, | ||
"name": { | ||
"type": "string" | ||
}, | ||
"supercategory": { | ||
"type": "string" | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
## JSON schemas for manual annotations files. | ||
|
||
We use JSON schemas to validate the types of a supported annotation file. | ||
|
||
Note that the schema validation only checks the type of a key if that key is present. It does not check for the presence of the keys. | ||
|
||
If the meta-schema (under `$schema`) is not provided, the jsonschema validator uses the latest released draft of the JSON schema specification. | |
|
||
## VIA schema | ||
|
||
The VIA schema corresponds to the format exported by VGG Image Annotator 2.x.y (VIA) for object detection annotations. | ||
|
||
Each image under `_via_img_metadata` is indexed using a unique key: FILENAME-FILESIZE. We use "additionalProperties" to allow for any key name, see https://stackoverflow.com/a/69811612/24834957. | ||
|
||
The section `_via_image_id_list` contains an ordered list of image keys, each of the form `FILENAME-FILESIZE`; the position of a key in the list defines the image ID. | |
|
||
The section `_via_attributes` contains region attributes and file attributes, to display in VIA's UI and to classify the data. | ||
|
||
The section `_via_data_format_version` contains the version of the VIA tool used. | ||
|
||
|
||
## COCO schema | ||
The COCO schema follows the COCO dataset format for object detection, see https://cocodataset.org/#format-data. | ||
|
||
Box coordinates are measured from the top left corner of the image, and are 0-indexed. | ||
### References | ||
---------- | ||
- https://github.com/python-jsonschema/jsonschema | ||
- https://json-schema.org/understanding-json-schema/ | ||
- https://cocodataset.org/#format-data | ||
- https://gitlab.com/vgg/via/-/blob/master/via-2.x.y/CodeDoc.md?ref_type=heads#description-of-via-project-json-file | ||
- https://python-jsonschema.readthedocs.io/en/stable/api/#jsonschema.validate |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
{ | ||
"$schema": "https://json-schema.org/draft/2020-12/schema", | ||
"type": "object", | ||
"properties": { | ||
"_via_settings": { | ||
"type": "object", | ||
"properties": { | ||
"ui": { | ||
"type": "object" | ||
}, | ||
"core": { | ||
"type": "object" | ||
}, | ||
"project": { | ||
"type": "object" | ||
} | ||
} | ||
}, | ||
"_via_img_metadata": { | ||
"type": "object", | ||
"additionalProperties": { | ||
"type": "object", | ||
"properties": { | ||
"filename": { | ||
"type": "string" | ||
}, | ||
"size": { | ||
"type": "integer" | ||
}, | ||
"regions": { | ||
"type": "array", | ||
"items": { | ||
"type": "object", | ||
"properties": { | ||
"shape_attributes": { | ||
"type": "object", | ||
"properties": { | ||
"name": { | ||
"type": "string" | ||
}, | ||
"x": { | ||
"type": "integer" | ||
}, | ||
"y": { | ||
"type": "integer" | ||
}, | ||
"width": { | ||
"type": "integer" | ||
}, | ||
"height": { | ||
"type": "integer" | ||
} | ||
} | ||
}, | ||
"region_attributes": { | ||
"type": "object" | ||
} | ||
} | ||
} | ||
}, | ||
"file_attributes": { | ||
"type": "object" | ||
} | ||
} | ||
} | ||
}, | ||
"_via_image_id_list": { | ||
"type": "array", | ||
"items": { | ||
"type": "string" | ||
} | ||
}, | ||
"_via_attributes": { | ||
"type": "object", | ||
"properties": { | ||
"region": { | ||
"type": "object" | ||
}, | ||
"file": { | ||
"type": "object" | ||
} | ||
} | ||
}, | ||
"_via_data_format_version": { | ||
"type": "string" | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
"""Utility functions for JSON schema files.""" | ||
|
||
import json | ||
from pathlib import Path | ||
|
||
import jsonschema | ||
|
||
|
||
def _get_default_schema(schema_name: str) -> dict:
    """Get the default VIA or COCO schema as a dictionary.

    Parameters
    ----------
    schema_name : str
        Name of the schema to load. The file that is read is
        ``schemas/<schema_name>_schema.json``, located next to this module.

    Returns
    -------
    dict
        The parsed JSON schema.

    """
    schema_path = (
        Path(__file__).parent / "schemas" / f"{schema_name}_schema.json"
    )
    # JSON text is UTF-8 by specification: decode it explicitly rather than
    # relying on the platform's locale-dependent default encoding.
    with open(schema_path, encoding="utf-8") as file:
        return json.load(file)
|
||
|
||
def _check_file_is_json(filepath: Path) -> None:
    """Check the input file can be read as a JSON.

    Parameters
    ----------
    filepath : Path
        Path to the file to check.

    Raises
    ------
    ValueError
        If the content of the file cannot be decoded as JSON. The original
        decoding error is chained as the cause.

    """
    try:
        # JSON text is UTF-8 by specification: decode it explicitly rather
        # than relying on the platform's locale-dependent default encoding.
        with open(filepath, encoding="utf-8") as file:
            json.load(file)
    except (json.JSONDecodeError, UnicodeDecodeError) as decode_error:
        # We override the error message for clarity. Bytes that are not
        # valid UTF-8 are reported the same way as malformed JSON text,
        # since in both cases the file is not a valid JSON.
        raise ValueError(
            f"Error decoding JSON data from file: {filepath}. "
            "The data being deserialized is not a valid JSON. "
        ) from decode_error
|
||
|
||
def _check_file_matches_schema(filepath: Path, schema: dict | None): | ||
"""Check the input JSON file matches the given schema. | ||
The schema validation only checks the type for each specified | ||
key if the key exists. It does not check that the keys in the | ||
schema are present in the JSON file. | ||
""" | ||
# Read json file | ||
with open(filepath) as file: | ||
data = json.load(file) | ||
|
||
# Check against schema if provided | ||
if schema: | ||
jsonschema.validate(instance=data, schema=schema) | ||
|
||
|
||
def _check_required_properties_keys(
    required_properties_keys: list, schema: dict
):
    """Verify that the schema defines every required "properties" key.

    The keys defined in the schema are collected with
    ``_extract_properties_keys`` and compared against the required ones.

    Raises
    ------
    ValueError
        If one or more required keys are not defined in the schema. The
        missing keys are listed in sorted order in the error message.

    """
    # Keys defined under any "properties" dictionary of the schema
    defined_keys = _extract_properties_keys(schema)

    # Required keys that the schema does not define
    absent_keys = set(required_properties_keys).difference(defined_keys)

    # Nothing missing: the schema is fine
    if not absent_keys:
        return

    raise ValueError(
        f"Required key(s) {sorted(absent_keys)} not found in schema."
    )
|
||
|
||
def _check_required_keys_in_dict(
    list_required_keys: list[str],
    data: dict,
    additional_message: str = "",
):
    """Verify that all the required keys exist in the input dictionary.

    Raises
    ------
    ValueError
        If one or more required keys are not keys of ``data``. The missing
        keys are listed in sorted order, and ``additional_message`` is
        appended right after "not found" in the error message.

    """
    absent_keys = set(list_required_keys).difference(data.keys())

    # All required keys are present: nothing to report
    if not absent_keys:
        return

    listed_keys = sorted(absent_keys)
    raise ValueError(
        f"Required key(s) {listed_keys} not found{additional_message}."
    )
|
||
|
||
def _extract_properties_keys(input_schema: dict, prefix: str = "") -> list:
    """Extract keys from all "properties" subdictionaries in a JSON schema.

    Recursively extract the keys of all subdictionaries in the input
    dictionary that are values to a "properties" key. The input dictionary
    represents a JSON schema dictionary
    (see https://json-schema.org/understanding-json-schema/about). The output
    is a sorted list of strings with full paths (e.g. 'parent/child').

    The "properties" key always appears as part of a set of dictionary keys
    with at least another key being "type" or "items". We use this to find
    the relevant subdictionaries.

    Parameters
    ----------
    input_schema : dict
        JSON schema (or subschema) to inspect. Subschemas that are not
        dictionaries (e.g. the boolean in ``"additionalProperties": false``)
        contribute no keys.
    prefix : str, optional
        Path of the parent key, prepended to every extracted key and
        separated from it by "/". Empty by default (top level).

    Returns
    -------
    list
        Sorted list of the extracted keys, expressed as full paths.

    """
    result: list[str] = []

    # Skip subschemas that are not dictionaries: JSON schema allows boolean
    # subschemas (e.g. "additionalProperties": false), which define no keys.
    # Also skip if the "type" key is missing in the schema.
    if not isinstance(input_schema, dict) or "type" not in input_schema:
        return result

    # If the input dictionary has a "properties" key: extract keys
    # and recurse into nested dictionaries
    for key, value in input_schema.get("properties", {}).items():
        full_key = f"{prefix}/{key}" if prefix else key
        result.append(full_key)
        # Recurse into nested dictionaries to look for more "properties"
        # dicts
        result.extend(_extract_properties_keys(value, full_key))

    # If the dictionary has an "additionalProperties" or "items" key:
    # recurse into it. These do not add a level to the path, so the
    # prefix is passed unchanged.
    for container_key in ("additionalProperties", "items"):
        if container_key in input_schema:
            result.extend(
                _extract_properties_keys(input_schema[container_key], prefix)
            )

    # Return sorted list of keys with full paths
    return sorted(result)
Oops, something went wrong.